# Coffee Data Web Crawling Code
### Park Gyeol

In [1]:
# Library Declaration Part

from bs4 import BeautifulSoup
import pandas as pd
import urllib
import requests

In [2]:
# Pandas DataFrame Initialization and Search Filter Declaration

coffee_data = pd.DataFrame(
    columns = ['GROWING_REGION', 'TREE_VARIETY', 'COFFEE_ORIGIN', 'ROASTER_LOCATION',
              'COFFEE_ORIGIN', 'ROAST_LEVEL', 'AGTRON', 'REVIEW_DATE',
              'AROMA', 'ACIDITY/STRUCTURE', 'BODY', 'FLAVOR', 'AFTERTASTE']
)

Growing_Regions = ['region_africa_arabia', 'region_caribbean', 'region_central_america',
                   'region_hawaii', 'region_asia_pacific', 'region_south_america']

Tree_Varieties = ['tree_variety_geisha', 'tree_variety_bourbon', 'tree_variety_catuai', 'tree_variety_caturra',
                  'tree_variety_maragogipe', 'tree_variety_maracaturra', 'tree_variety_mocca-moka', 'tree_variety_pacamara',
                 'tree_variety_robusta', 'tree_variety_sl-28-sl-34', 'tree_variety_typica']

In [3]:
# Web Crawling Processing

site_full_url = "https://www.coffeereview.com/advanced-search/?keyword=&search=Search+Now&locations=all&results=all&"

index = 0

for Growing_Region in Growing_Regions:
    for Tree_Variety in Tree_Varieties:
        site_url = site_full_url + Growing_Region + "=on&" + Tree_Variety + "=on"
        while True:
            response = requests.get(site_url, headers={"User-Agent": "Mozilla/5.0"})
            html = response.text
            soup = BeautifulSoup(html, 'html.parser')
            content_ancestor = soup.find(attrs={'id':'genesis-content'})
            content_class_item = content_ancestor.find_all(attrs={'class':'entry-content'})
            
            for lst in content_class_item:
                url = lst.select('div > div > div.col-2 > p > a')[0]['href']
                response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
                html = response.text
                content_per_page = BeautifulSoup(html, 'html.parser')

                my_content = content_per_page.find(attrs={'class':'review-template'})
                tables = my_content.find_all(attrs={'class':'review-template-table'})
                tables_left = tables[0].select('tr > td')
                tables_right = tables[1].select('tr > td')

                table_left_index = 0
                table_right_index = 0

                new_element = {'GROWING_REGION':Growing_Region, 'TREE_VARIETY':Tree_Variety, 'COFFEE_ORIGIN':None, 
                               'ROASTER_LOCATION':None, 'COFFEE_ORIGIN':None, 'ROAST_LEVEL':None, 'AGTRON':None,
                               'REVIEW_DATE':None,'AROMA':None, 'ACIDITY/STRUCTURE':None, 'BODY':None, 
                               'FLAVOR':None, 'AFTERTASTE':None}
                new_elemnet_keys = new_element.keys()
                new_element['COFFEE_ORIGIN'] = my_content.find(attrs={'class':'review-title'}).getText()

                while True:
                    if table_left_index >= len(tables_left):
                        break
                    column_name = tables_left[table_left_index].getText()[:-1].replace(" ", "_").upper()
                    column_element = tables_left[table_left_index + 1].getText()
                    if column_name in new_elemnet_keys:
                        new_element[column_name] = column_element
                    table_left_index = table_left_index + 2


                while True:
                    if table_right_index >= len(tables_right):
                        break
                    column_name = tables_right[table_right_index].getText()[:-1].replace(" ", "_").upper()
                    column_element = tables_right[table_right_index + 1].getText().strip()
                    if column_name in new_elemnet_keys:
                        new_element[column_name] = column_element
                    table_right_index = table_right_index + 2

                coffee_data = coffee_data.append(new_element, ignore_index=True)
                index = index + 1
                
            next_page_element = soup.find(attrs={'class':'pagination-next'})
            if next_page_element is None:
                break

            next_page = next_page_element.select('a')[0]['href']
            site_url = next_page


In [4]:
# Check the Coffee Bean Review Data through Pandas Dataframe

coffee_data

Unnamed: 0,GROWING_REGION,TREE_VARIETY,COFFEE_ORIGIN,ROASTER_LOCATION,COFFEE_ORIGIN.1,ROAST_LEVEL,AGTRON,REVIEW_DATE,AROMA,ACIDITY/STRUCTURE,BODY,FLAVOR,AFTERTASTE
0,region_africa_arabia,tree_variety_geisha,"Guji Zone, Oromia region, southern Ethiopia","Denver, Colorado","Guji Zone, Oromia region, southern Ethiopia",Medium-Light,60/78,January 2022,9,8,9,9,9
1,region_africa_arabia,tree_variety_geisha,"Bench-Maji Zone, Southern Ethiopia","Taoyuan, Taiwan","Bench-Maji Zone, Southern Ethiopia",Light,62/80,December 2021,9,9,9,9,8
2,region_africa_arabia,tree_variety_geisha,"Oromia growing region, southern Ethiopia","Taipei, Taiwan","Oromia growing region, southern Ethiopia",Light,64/80,November 2021,9,8,9,9,8
3,region_africa_arabia,tree_variety_geisha,"Ngorongoro, Tanzania","Minneapolis, Minnesota","Ngorongoro, Tanzania",Light,62/80,August 2021,9,9,9,9,9
4,region_africa_arabia,tree_variety_geisha,"Bench-Maji Zone, southern Ethiopia","Hong Kong, China","Bench-Maji Zone, southern Ethiopia",Medium-Light,58/74,May 2020,9,9,9,9,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1122,region_south_america,tree_variety_typica,"Loja Province, southern Ecuador","Branford, Connecticut","Loja Province, southern Ecuador",Medium-Light,54/78,March 2019,9,8,8,9,8
1123,region_south_america,tree_variety_typica,"Pichincha Province, north-central Ecuador","Minneapolis, MInnesota","Pichincha Province, north-central Ecuador",Medium-Light,55/73,March 2019,9,9,8,9,8
1124,region_south_america,tree_variety_typica,"Cajamarca region, northern Peru","Chino, California","Cajamarca region, northern Peru",Medium-Light,57/78,August 2018,9,8,9,9,7
1125,region_south_america,tree_variety_typica,"Pichincha growing region, northern Ecuador","Ramsey, Minnesota","Pichincha growing region, northern Ecuador",Medium-Light,55/77,April 2018,9,9,8,9,8


In [5]:
# Save DataFrame As Excel File

file_name = 'CoffeeBeanReviewData.xlsx'
  
coffee_data.to_excel(file_name)

print('Coffee Bean Review Data successfully exported into Excel File')

Coffee Bean Review Data successfully exported into Excel File
