In [1]:
import requests
import re
import copy
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [2]:
# Maps a human-readable program name to the URL slug of that program on the
# Alliance for Coffee Excellence site. The slug (the value) is what the spider
# functions append to the country-programs base URL; the names (the keys) are
# labels only.
countries_dict = {
    "Bolivia": "bolivia",
    "Brazil Naturals": "brazil-naturals",
    "Brazil Pulped Naturals": "brazil-pulped-naturals",
    "Burundi": "burundi-program",
    "Colombia": "colombia-third-program",
    "Colombia North": "colombia-program",
    "Colombia South": "colombia-second-program",
    "Costa Rica": "costa-rica-program",
    "El Salvador": "el-salvador-program",
    "Guatemala": "guatemala-program",
    "Honduras": "honduras-program",
    "Mexico": "mexico-program",
    "Nicaragua": "nicaragua-program",
    # Key previously carried a stray leading space (" Peru"), which made a
    # lookup by the plain name fail.
    "Peru": "peru",
    "Rwanda": "rwanda-program",
}

In [3]:
def get_year_spider(country):
    """
    Fetch the list of competition years offered for one country program.

    country: URL slug of the country program to be crawled (e.g. 'bolivia')
    result: list with the text of every <option> found inside the
            <select id="id_competition"> element of the program page;
            an empty list when the page has no such element

    Network errors raised by requests.get are not caught here and propagate
    to the caller.
    """
    url = "https://www.allianceforcoffeeexcellence.org/en/cup-of-excellence/country-programs/" + str(country)
    # Bound the wait so an unresponsive server cannot hang the notebook forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'lxml')
    select = soup.find('select', id = 'id_competition')
    if select is None:
        # Explicit check instead of a bare except: only the "element missing"
        # case is treated as an empty result, real errors stay visible.
        # str() keeps the message printable when country is not a string.
        print('get_year_spider(' + str(country) + ') : ', end = ' ')
        print('No \"option\" tag. Returning empty result list')
        return []
    return [year.text for year in select.find_all('option')]

In [4]:
def parse_year(year_list):
    """
    Rewrite the "<year>, <month>" labels into "<year>-<month number>" form.

    year_list : input list of year strings
    result: new list of the same length; every occurrence of a known
            "<year>, <month>" label inside an entry is substituted, all
            other text is left as it was
    """
    month_labels = {
        '2015, January': '2015-01',
        '2015, December': '2015-12',
        '2016, May': '2016-05',
        '2016, June': '2016-06',
    }
    # One alternation over the literal labels; escaping keeps them from being
    # interpreted as regex syntax.
    alternatives = [re.escape(label) for label in month_labels]
    matcher = re.compile("|".join(alternatives))

    def _swap(match):
        # The matched text is always one of the literal labels above.
        return month_labels[match.group(0)]

    parsed = []
    for entry in year_list:
        parsed.append(matcher.sub(_swap, entry))
    return parsed

In [5]:
def get_head_spider(country, year):
    """
    Fetch the column headers of the results table for one country/year page.

    country: URL slug of the country program to be crawled
    year: which year to be crawled (appended to the URL after the slug)
    result: list with the stripped text of every <th> cell in the first <tr>
            of the page; an empty list when the request fails or the page
            has no <tr> element
    """
    result = []
    url = "https://www.allianceforcoffeeexcellence.org/en/cup-of-excellence/country-programs/" + str(country) +"/"+ str(year)
    # str() so that a non-string year (e.g. an int) cannot raise while logging.
    print('get_head_spider('+ str(country) + ", "+ str(year) +') : ', end = ' ')
    try:
        # Bound the wait so an unresponsive server cannot hang the notebook.
        source_code = requests.get(url, timeout=30)
    except requests.RequestException:
        # Previously execution continued past this point and crashed on the
        # unbound response variable; report the failure and return empty.
        print("Failed ")
        return result
    print("Successful")
    soup = BeautifulSoup(source_code.text, 'lxml')

    # Only the first table row is inspected for header cells.
    first_row = soup.find('tr')
    if first_row is not None:
        result = [ele.text.strip() for ele in first_row.find_all('th')]
    return result


In [6]:
def get_elements_spider(country, year):
    """
    Fetch the data rows of the results table for one country/year page.

    country: URL slug of the country program to be crawled
    year: which year to be crawled (appended to the URL after the slug)
    result: list of lists, one per <tr> after the first; each inner list holds
            the stripped text of the row's <td> cells followed by year and
            country. An empty list when the request fails or the page has at
            most one <tr>.
    """
    result = []
    url = "https://www.allianceforcoffeeexcellence.org/en/cup-of-excellence/country-programs/" + str(country) +"/"+ str(year)
    # str() so that a non-string year (e.g. an int) cannot raise while logging.
    print("get_elements_spider("+ str(country) + ", "+ str(year) +') : ', end = ' ')
    try:
        # Bound the wait so an unresponsive server cannot hang the notebook.
        source_code = requests.get(url, timeout=30)
    except requests.RequestException:
        # Previously execution continued past this point and crashed on the
        # unbound response variable; report the failure and return empty.
        print("Failed")
        return result
    print('Successful')
    soup = BeautifulSoup(source_code.text, 'lxml')

    for row in soup.find_all('tr'):
        cells = [ele.text.strip() for ele in row.find_all('td')]
        # Tag every row with the page it came from.
        cells.extend([year, country])
        result.append(cells)
    # Skip the entry built from the first <tr> - presumably the header row,
    # which carries <th> cells rather than <td> (see get_head_spider).
    return result[1:]

In [7]:
# Column names for the final table: the headers scraped from one reference
# page plus the two fields that get_elements_spider appends to every row.
header = get_head_spider('bolivia', '2009') + ['year', 'country']

get_head_spider(bolivia, 2009)... Successful


In [8]:
def make_dataframe():
    """
    Build the dataframe 'coe' by crawling every program/year pair.

    For each slug in countries_dict, the available years are fetched and
    normalised, the rows of each year's page are scraped, and one DataFrame
    per non-empty page is created with the module-level 'header' as columns.

    result: the row-wise concatenation of all per-page frames (each frame
            keeps its own 0..n-1 index); an empty DataFrame when nothing was
            collected
    """
    frames = []
    for value in countries_dict.values():
        for year in parse_year(get_year_spider(value)):
            try:
                temp = get_elements_spider(value, year)
            except Exception:
                print('Throws unexpected error...' + value+ '...'+ year)
                # Skip this pair: falling through would reuse the rows of the
                # previous pair (or hit an undefined name on the first one).
                continue
            if len(temp) != 0:
                frames.append(pd.DataFrame(data = temp,  columns = header))
                print('make_dataframe() : concatenated...'+ value +'...'+ year)
            else:
                print('make_dataframe() : No elements...'+ value + '...' + year)
    if not frames:
        # pd.concat rejects an empty list of frames.
        return pd.DataFrame()
    # A single concat replaces repeated DataFrame.append calls, which were
    # removed in pandas 2.0 and copied the whole frame on every iteration.
    return pd.concat(frames)

In [10]:
# Crawl every program/year pair and assemble the combined results table.
# This issues HTTP requests for each program and each of its years, so the
# cell may take a while to finish.
coe = make_dataframe()
# Preview the first rows of the assembled table.
coe.head()

get_elements_spider(bolivia, 2009) Successful
make_dataframe() : concatenated...bolivia...2009
get_elements_spider(bolivia, 2008) Successful
make_dataframe() : concatenated...bolivia...2008
get_elements_spider(bolivia, 2007) Successful
make_dataframe() : concatenated...bolivia...2007
get_elements_spider(bolivia, 2005) Successful
make_dataframe() : concatenated...bolivia...2005
get_elements_spider(bolivia, 2004) Successful
make_dataframe() : concatenated...bolivia...2004
get_elements_spider(brazil-naturals, 2017) Successful
make_dataframe() : No elements...brazil-naturals...2017
get_elements_spider(brazil-naturals, 2016) Successful
make_dataframe() : concatenated...brazil-naturals...2016
get_elements_spider(brazil-naturals, 2015-12) Successful
make_dataframe() : concatenated...brazil-naturals...2015-12
get_elements_spider(brazil-naturals, 2015-01) Successful
make_dataframe() : concatenated...brazil-naturals...2015-01
get_elements_spider(brazil-naturals, 2014) Successful
make_dataframe()

Unnamed: 0,Rank,Size,Farm / CWS,Farmer / Representative,Region,Score,year,country
0,1,24,Agrotakesi SA,Mauricio Ramiro Diez de Medina,"Yanacahi, Yungas of La Paz",93.36,2009,bolivia
1,2,28,Café Sima del Jaguar A,Braulio Luque Yana,"Caranavi, Yungas of La Paz",92.05,2009,bolivia
2,3,23,Café Monterrey,Valentin Choquehuanca Aduviri,"Caranavi, Yungas of La Paz",91.62,2009,bolivia
3,4,20,Café Jacaranda,Cruz Elias Choconapi,"Caranavi, Yungas of La Paz",91.38,2009,bolivia
4,5,21,Café Alan Coffe,Luis Yujra Arismende,"Caranavi, Yungas of La Paz",90.9,2009,bolivia
