In [74]:
import requests
import re
import copy
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [4]:
# Display name -> URL slug of each country program. The slug is what gets
# appended to the country-programs base URL by the spider functions; the
# display name becomes the 'country' label in the resulting DataFrame.
countries_dict = {
    "Bolivia": "bolivia",
    "Brazil Naturals": "brazil-naturals",
    "Brazil Pulped Naturals": "brazil-pulped-naturals",
    "Burundi": "burundi-program",
    "Colombia": "colombia-third-program",
    "Colombia North": "colombia-program",
    "Colombia South": "colombia-second-program",
    "Costa Rica": "costa-rica-program",
    "El Salvador": "el-salvador-program",
    "Guatemala": "guatemala-program",
    "Honduras": "honduras-program",
    "Mexico": "mexico-program",
    "Nicaragua": "nicaragua-program",
    "Peru": "peru",  # key used to be " Peru" (stray leading space)
    "Rwanda": "rwanda-program",
}

In [5]:
def get_year_spider(country, timeout=30):
    """
    Fetch the competition labels listed on one country program page.

    country: which country to be crawled (URL slug, e.g. "bolivia")
    timeout: seconds to wait for the server before giving up
    result: list with the text of every <option> found inside the page's
            <select id="id_competition"> element

    Raises requests.RequestException when the request fails or the server
    answers with an HTTP error status, and ValueError when the page does not
    contain the competition selector.
    """
    url = "https://www.allianceforcoffeeexcellence.org/en/cup-of-excellence/country-programs/" + str(country)
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    select = soup.find('select', id='id_competition')
    if select is None:
        # Fail with a message that names the page instead of the opaque
        # "'NoneType' object has no attribute 'find_all'".
        raise ValueError("no 'id_competition' selector found on " + url)
    return [year.text for year in select.find_all('option')]

In [6]:
# Collect one (country, year) record per competition listed on each program
# page, then build the DataFrame once at the end. Growing a frame inside the
# loop is quadratic, and DataFrame.append no longer exists in pandas >= 2.0.
coe_records = []

# Do crawler work
for key, value in countries_dict.items():
    print("Trying to crawl country.... " + value)
    try:
        years = get_year_spider(value)
    except Exception as exc:
        # Best effort: a program page that cannot be crawled is skipped, but
        # the reason is reported. `Exception` (not a bare except) so that
        # interrupting the kernel still stops the loop.
        print("Failed. (" + type(exc).__name__ + ": " + str(exc) + ")")
        continue
    coe_records.extend((key, year) for year in years)
    print("Successful.")

# Fresh 0..n-1 index, same as the former reset_index(drop=True).
coe = pd.DataFrame(coe_records, columns=['country', 'year'])

Trying to crawl country.... bolivia
Successful.
Trying to crawl country.... brazil-naturals
Successful.
Trying to crawl country.... brazil-pulped-naturals
Successful.
Trying to crawl country.... burundi-program
Successful.
Trying to crawl country.... colombia-third-program
Failed.
Trying to crawl country.... colombia-program
Successful.
Trying to crawl country.... colombia-second-program
Successful.
Trying to crawl country.... costa-rica-program
Successful.
Trying to crawl country.... el-salvador-program
Successful.
Trying to crawl country.... guatemala-program
Successful.
Trying to crawl country.... honduras-program
Successful.
Trying to crawl country.... mexico-program
Successful.
Trying to crawl country.... nicaragua-program
Successful.
Trying to crawl country.... peru
Failed.
Trying to crawl country.... rwanda-program
Successful.


In [91]:
# Now control 'year' column for url format
# Entries of coe['year'] that contain a comma are replaced, in order of
# appearance, by these hand-written values.
YEAR_URL_REPLACEMENTS = ['2015-12', '2015-01', '2016-05', '2016-06', '2016-05']

has_comma = coe['year'].str.contains(',', regex=False, na=False)
n_comma = int(has_comma.sum())
# Skipping when nothing matches makes the cell safe to re-run after the
# labels have already been replaced.
if n_comma:
    if n_comma != len(YEAR_URL_REPLACEMENTS):
        # The hard-coded list only fits the crawl it was written for.
        raise ValueError(
            "expected %d comma-separated year labels, found %d: %r"
            % (len(YEAR_URL_REPLACEMENTS), n_comma, coe.loc[has_comma, 'year'].tolist())
        )
    coe.loc[has_comma, 'year'] = YEAR_URL_REPLACEMENTS

In [97]:
# Set multi-index
# Guarded so that re-running the cell is a no-op: once 'country' and 'year'
# have become the index they are no longer columns, and a second set_index
# would raise KeyError. Any other missing-column case still raises.
if list(coe.index.names) != ['country', 'year']:
    coe = coe.set_index(['country', 'year'])

In [8]:
def get_head_spider(country, year, timeout=30):
    """
    Fetch the column headers of the table on one country/year page.

    country: which country to be crawled (URL slug)
    year: which year to be crawled (string or number; converted with str())
    timeout: seconds to wait for the server before giving up
    result: list of headers that form a table, i.e. the stripped text of the
            <th> cells in the first <tr> of the page; an empty list when the
            request fails or the page has no <tr>
    """
    url = "https://www.allianceforcoffeeexcellence.org/en/cup-of-excellence/country-programs/" + str(country) +"/"+ str(year)
    print("Trying to get headers from..... " + str(country) + ", " + str(year))
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Return early: there is no response to parse. Previously execution
        # fell through and crashed on the unbound `source_code`.
        print("failed to retrieve headers")
        return []
    soup = BeautifulSoup(response.text, 'lxml')

    # Only the first row is inspected.
    first_row = soup.find('tr')
    if first_row is None:
        return []
    return [ele.text.strip() for ele in first_row.find_all('th')]


In [65]:
def get_elements_spider(country, year, timeout=30):
    """
    Fetch the table rows of one country/year page.

    country: which country to be crawled (URL slug)
    year: which year to be crawled (string or number; converted with str())
    timeout: seconds to wait for the server before giving up
    result: list of lists which contains crawled data table of country, year
            pair: one inner list per <tr> after the first, holding the
            stripped text of its <td> cells; an empty list when the request
            fails
    """
    url = "https://www.allianceforcoffeeexcellence.org/en/cup-of-excellence/country-programs/" + str(country) +"/"+ str(year)
    print("Trying to get data of..... " + str(country) + ", " + str(year))
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Return early: there is no response to parse. Previously execution
        # fell through and crashed on the unbound `source_code`.
        print("failed to retrieve data")
        return []
    soup = BeautifulSoup(response.text, 'lxml')

    rows = [
        [ele.text.strip() for ele in row.find_all('td')]
        for row in soup.find_all('tr')
    ]
    # The first <tr> is dropped -- presumably the header row, which carries
    # <th> cells and therefore yields an empty <td> list.
    return rows[1:]

In [66]:
# Smoke test: display the crawled table rows for the 'bolivia' / '2009' page.
get_elements_spider('bolivia','2009')

Trying to get data of..... bolivia, 2009


[['1',
  '24',
  'Agrotakesi SA',
  'Mauricio Ramiro Diez de Medina',
  'Yanacahi, Yungas of La Paz',
  '93.36'],
 ['2',
  '28',
  'Café Sima del Jaguar A',
  'Braulio Luque Yana',
  'Caranavi, Yungas of La Paz',
  '92.05'],
 ['3',
  '23',
  'Café Monterrey',
  'Valentin Choquehuanca Aduviri',
  'Caranavi, Yungas of La Paz',
  '91.62'],
 ['4',
  '20',
  'Café Jacaranda',
  'Cruz Elias Choconapi',
  'Caranavi, Yungas of La Paz',
  '91.38'],
 ['5',
  '21',
  'Café Alan Coffe',
  'Luis Yujra Arismende',
  'Caranavi, Yungas of La Paz',
  '90.90'],
 ['6',
  '26',
  'Café Central',
  'Luis Huayhua Chiji',
  'Caranavi, Yungas of La Paz',
  '90.71'],
 ['7',
  '23',
  'Café Mondono',
  'Carmelo Mamani Titirico',
  'Caranavi, Yungas of La Paz',
  '90.17'],
 ['8',
  '27',
  'Café Palmeiras 3',
  'Mario Mamani Machaca',
  'Caranavi, Yungas of La Paz',
  '89.12'],
 ['9',
  '29',
  'Café Origen',
  'Damian Huanca Flores',
  'Caranavi, Yungas of La Paz',
  '87.95'],
 ['10',
  '23',
  'Café Gelen',
  

In [36]:
# Smoke test: print the table headers crawled from the 'bolivia' / '2009' page.
print(get_head_spider('bolivia', '2009'))

Trying to get headers from..... bolivia, 2009
['Rank', 'Size', 'Farm / CWS', 'Farmer / Representative', 'Region', 'Score']


In [None]:
def spider(max_pages):
    """
    Unfinished draft of a crawler that fetches and parses a country/year page.

    max_pages: exclusive upper bound for the `page` counter

    NOTE(review): in the lines visible here `page` is never incremented, so
    the loop would not terminate for max_pages > 1, and `country` / `year`
    are neither parameters nor assigned locally, so they would have to exist
    as globals -- TODO confirm the intended design before using this.
    """
    page = 1
    while page < max_pages:
        # Same URL layout as the other spider functions: <base>/<country>/<year>.
        url = "https://www.allianceforcoffeeexcellence.org/en/cup-of-excellence/country-programs/" + str(country) + "/" + str(year)
        source_code = requests.get(url)
        plain_text = source_code.text
        # Parsed document; nothing is extracted from it yet in the visible lines.
        soup = BeautifulSoup(plain_text, 'lxml')
        
        
        