In [1]:
# Normal Imports
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
# pandas display options: wider text output and up to 100 columns shown
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
# Render DataFrames as HTML tables when displayed in the notebook
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# seaborn plot appearance: white grid background, "poster" scaling context
sns.set_style("whitegrid")
sns.set_context("poster")

# Additional Imports Needed
from pyquery import PyQuery as pq
import requests

In [94]:
# Query the online database for the 'basic info' page.
# Every filter field in the query string is left blank and the request ends
# with Limit=-1 (presumably "no row limit" -- confirm against the service),
# so the response is expected to be large and slow to arrive.
#
# timeout=(connect, read) in seconds: without it a stalled server would hang
# the kernel indefinitely. The read timeout applies between received bytes,
# not to the whole download, and is deliberately generous.
basicinfo = requests.get(
    "http://www.irgcis.irri.org:81/grc/TK.exe$Query?DataSource=IRG&GBUSER_TK_PASS1_ORICOUNTRY.STATUS_ACC-OP=%3D&GBUSER_TK_PASS1_ORICOUNTRY.STATUS_ACC=&GBUSER_TK_PASS1_ORICOUNTRY.ACCNO-OP=%3E%3D&GBUSER_TK_PASS1_ORICOUNTRY.ACCNO=&GBUSER_TK_PASS1_ORICOUNTRY.ACCNO-OP=%3C%3D&GBUSER_TK_PASS1_ORICOUNTRY.ACCNO=&GBUSER_TK_PASS1_ORICOUNTRY.SPECIES_REID-OP=%3D&GBUSER_TK_PASS1_ORICOUNTRY.SPECIES_REID=&GBUSER_TK_PASS1_ORICOUNTRY.ALL_ACCNO_NAME-OP=ctn&GBUSER_TK_PASS1_ORICOUNTRY.ALL_ACCNO_NAME=&GBUSER_TK_PASS1_ORICOUNTRY.ORI_COUNTRY-OP=%3D&GBUSER_TK_PASS1_ORICOUNTRY.ORI_COUNTRY=&GBUSER_TK_PASS1_SSCOUNTRY.SS_COUNTRY-OP=%3D&GBUSER_TK_PASS1_SSCOUNTRY.SS_COUNTRY=&GBUSER_TK_PASS1_ORICOUNTRY.CULT_TYPE-OP=%3D&GBUSER_TK_PASS1_ORICOUNTRY.CULT_TYPE=&GBUSER_TK_MORPH1_2.MAT-OP=%3E%3D&GBUSER_TK_MORPH1_2.MAT=&GBUSER_TK_MORPH1_2.MAT-OP=%3C%3D&GBUSER_TK_MORPH1_2.MAT=&GBUSER_TK_MORPH1_2.GRLT-OP=%3E%3D&GBUSER_TK_MORPH1_2.GRLT=&GBUSER_TK_MORPH1_2.GRLT-OP=%3C%3D&GBUSER_TK_MORPH1_2.GRLT=&GBUSER_TK_MORPH1_2.GRWD-OP=%3E%3D&GBUSER_TK_MORPH1_2.GRWD=&GBUSER_TK_MORPH1_2.GRWD-OP=%3C%3D&GBUSER_TK_MORPH1_2.GRWD=&GBUSER_TK_MORPH1_2.VG-OP=%3D&GBUSER_TK_MORPH1_2.VG=&GBUSER_TK_MORPH1_2.ENDO-OP=%3D&GBUSER_TK_MORPH1_2.ENDO=&GBUSER_TK_MORPH1_2.SCCO_REV-OP=%3D&GBUSER_TK_MORPH1_2.SCCO_REV=&GBUSER_TK_EVAL.BL_DESCRIPTION-OP=ctn&GBUSER_TK_EVAL.BL_DESCRIPTION=&GBUSER_TK_EVAL.BB_DESCRIPTION-OP=ctn&GBUSER_TK_EVAL.BB_DESCRIPTION=&GBUSER_TK_EVAL.SHB_DESCRIPTION-OP=ctn&GBUSER_TK_EVAL.SHB_DESCRIPTION=&Output=%2FGRC%2FAccessionID.htm&Limit=-1",
    timeout=(10, 600),
)
# Fail fast on a 4xx/5xx response instead of parsing an error page as data.
basicinfo.raise_for_status()

In [96]:
# Parse the response HTML and keep only the <tr> elements that hold data
# records: the first four rows and the final row of the page are dropped
# (the page's HTML is poorly structured, so rows are selected by position).
d_ = pq(basicinfo.text)
d_rows = pq(d_('tr')[4:-1])

In [97]:
# Keys of each record, listed in the same order as the <td> cells of a row.
fields = [
    'strain_id', 'species_name', 'variety_name', 'previous_name',
    'pedigree', 'collection_number', 'acc_id_seq_num', 'acc_id_seed_donor_number',
    'source_country', 'donor_country', 'acc_date', 'status', 'cultural_type',
    'special_traits', 'fao_in_trust', 'multilateral_system',
]

# One dict per table row, accumulated in document order.
ricestrains = []

for row in d_rows:
    # All <td> cells of the current row.
    cells = pq(row)('td')

    # The first cell is converted to an integer id; every other cell is
    # kept as its text content. Cells are read by index so that a row with
    # too few cells still raises instead of yielding a partial record.
    values = [int(pq(cells[0]).text())]
    values.extend(pq(cells[idx]).text() for idx in range(1, len(fields)))

    ricestrains.append(dict(zip(fields, values)))

In [98]:
# Sanity check: number of records collected in ricestrains
len(ricestrains)

131112

In [107]:
# Build a DataFrame from the list of record dicts (one row per dict) and
# persist it as a pickle file named "firstpass" in the working directory.
# To preview the first rows interactively, evaluate: tempdf[:35]
tempdf = pd.DataFrame(data=ricestrains)
tempdf.to_pickle(path="firstpass")

In [100]:
# FUN SOUND BIT
# Embed an audio player that auto-plays an alert beep fetched from the URL below.
# NOTE(review): presumably an audible "done" signal for the slow cells above -- confirm.
from IPython.display import Audio
sound_file = 'http://sfxcontent.s3.amazonaws.com/soundfx/EmergencyAlertSystemBeep.mp3'
# Kept as the cell's last expression so the notebook renders (and plays) the widget.
Audio(url=sound_file, autoplay=True)

In [83]:
# # We'll just reuse the request object that was previously created to create a BeautifulSoup element.
# # The latter will be the equivalent of the "d_" object we created before.
# soup = BeautifulSoup(take1.text, "html.parser")

# # # In this line we are looking for a single "table" element with a class of wikitable;
# # # and then looking for all the "tr" elements on that table (notice the find vs find_all calls).
# # # Even though the syntax is very different from PyQuery, the end result is similar.
# rows = soup.find("table").find_all("tr")

# # # We then define an anonymous (lambda) function whose job it is to act on
# # # each column's element in each row in the table. Lambda functions are very
# # # handy for functional programming, and the one below should be easy to follow.
# # # The function processes each field of the parameter r accordingly. It starts by
# # # transforming the first column into an integer; it then proceeds to getting the text
# # # from the second and third elements, and finally it gets the HTTP link of the third
# # # element, and returns all that in a list (notice the surrounding brackets).
# # # The function is then bound to the cleaner variable so it can be referenced later.
# # cleaner = lambda r: [int(r[0].get_text()), r[1].get_text(), r[2].get_text(), r[2].find("a").get("href")]

# # #lambda functions are also excellent for defining one line math functions.
# # #e.g. radius = lambda x,y: np.sqrt(x*x + y*y)

# # # Next we'll create a list of names that will be used as dictionary keys.
# # fields = ["ranking", "title", "band_singer", "url"]

# # # We now use the lambda function to process each "td" element on a given row.
# # # the [... for ... in ...] construct is a list comprehension. They look weird at
# # # first but are amazingly useful and worth spending some time to learn.
# # # At a high level, think of it as a one-line "for loop" that aggregates the result
# # # of each iteration into a list. So once this line finished running, we will have a list
# # # of something.
# # #
# # # The dict function is another way to create a dictionary. One neat thing about it
# # # is that it accepts a list of key/value pairs that will be used to create said dictionary.
# # #
# # # But where are these key/value pairs coming from in here? From the zip function!
# # # The zip function will take multiple iterables (things that can be treated as a sequence)
# # # and combine them. An example might make it clearer:
# # #
# # # zip(["a", "b", "c"], [1, 2, 3]) evaluates to [("a", 1), ("b", 2), ("c", 3)]. It's like a zipper!!!
# # #
# # # Anyway, never mind the parenthesis around the pairs; they just show that the elements
# # # are grouped into tuples, which you can think of as lists that are immutable (they can't grow or shrink).
# # #
# # # So to recap: the zip function creates a list of pairs; which the dict function then uses
# # # to create a dictionary, using the first element of the pair as the key and the second as
# # # the value; and finally, the list comprehension iterates over each row element, and puts
# # # the result of each iteration on a list, which is then bound to the songs variable.
# # songs = [dict(zip(fields, cleaner(row.find_all("td")))) for row in rows]

3

[{'band_singer': 'The Guess Who',
  'ranking': 3,
  'title': 'American Woman',
  'url': '/wiki/The_Guess_Who'},
 {'band_singer': 'B.J. Thomas',
  'ranking': 4,
  'title': "Raindrops Keep Fallin' on My Head",
  'url': '/wiki/B.J._Thomas'}]