In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# Initialize the list to store data
name_slug_list = []

# Set up Selenium WebDriver using webdriver_manager
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Loop through the pages
for page in range(1, 115):
    # Navigate to the current page
    driver.get(f'https://www.coffeereview.com/advanced-search/page/{page}/')

    # Wait for elements to load if necessary
    driver.implicitly_wait(10)

    # Find the elements you want to scrape
    review_titles = driver.find_elements(By.CLASS_NAME, 'review-title')

    # Extract data
    for title in review_titles:
        ns_dict = {}
        ns_dict['name'] = title.text
        ns_dict['slug'] = title.find_element(By.TAG_NAME, 'a').get_attribute('href')
        name_slug_list.append(ns_dict)

    # Optional: Print progress every 25 pages
    if page % 25 == 0:
        print(f'Scraped {page} pages')

    # Add a delay to be polite and avoid overwhelming the server
    time.sleep(1)

# Convert to DataFrame and save to CSV
df = pd.DataFrame(name_slug_list)
df.to_csv('coffee_reviews_selenium.csv', index=False)

# Close the driver
driver.quit()

print('Done scraping')



Scraped 25 pages
Scraped 50 pages
Scraped 75 pages
Scraped 100 pages
Done scraping


In [3]:
name_slug_df = pd.DataFrame(name_slug_list)

In [4]:
name_slug_df.head()

Unnamed: 0,name,slug
0,Colombia Lulo Wonka Wonka & Sebastian Gomez,https://www.coffeereview.com/review/colombia-l...
1,Colombia Finca La Reserva,https://www.coffeereview.com/review/colombia-f...
2,Ethiopia Agaro Gera Duromina #14,https://www.coffeereview.com/review/ethiopia-a...
3,Ethiopia Asikana Natural,https://www.coffeereview.com/review/ethiopia-a...
4,Mexico Montecristo Reserve,https://www.coffeereview.com/review/mexico-mon...


In [5]:
name_slug_df.to_csv('slug1.csv', index=False)

In [8]:
import requests  # Import the requests library
from bs4 import BeautifulSoup  # Import BeautifulSoup for HTML parsing
import pandas as pd  # Import pandas for data manipulation
import time  # Import time for adding delays

# Assuming name_slug_list is already defined and populated
name_slug_df = pd.DataFrame(name_slug_list)  # Convert the list to DataFrame
name_slug_df.head()  # Display the first few rows of the DataFrame
name_slug_df.to_csv('slug1.csv', index=False)  # Save the DataFrame to a CSV file

coffee_list = []  # Initialize an empty list to store coffee data
i = 0  # Initialize a counter
print('begin scraping')

# Loop through each slug in the DataFrame
for slug in name_slug_df.slug:
    i += 1
    url = slug  # Assuming slug contains the full URL, otherwise prepend base URL
    try:
        res = requests.get(url)  # Make the request to the URL
        soup = BeautifulSoup(res.content, "lxml")  # Parse the content with BeautifulSoup
        entry_content = soup.find('div', {'class': "entry-content"})  # Look for the div

        coffee_dict = {}
        coffee_dict['slug'] = slug

        # Check if entry_content exists before trying to extract text
        if entry_content:
            coffee_dict['all_text'] = entry_content.get_text()
        else:
            coffee_dict['all_text'] = 'N/A'  # Handle missing content

        # Handle the other fields similarly, checking if they exist
        rating = soup.find('span', {'class': "review-template-rating"})
        coffee_dict['rating'] = rating.text if rating else 'N/A'

        roaster = soup.find('p', {'class': "review-roaster"})
        coffee_dict['roaster'] = roaster.text if roaster else 'N/A'

        coffee_list.append(coffee_dict)  # Append the coffee data to the list

        if i % 500 == 0:
            print(f'Scraped {i} pages')

    except Exception as e:
        print(f"Error: {e} for URL: {url}")

    time.sleep(1)  # Add a delay to avoid overwhelming the server

print('done scraping')


begin scraping
Scraped 500 pages
Scraped 1000 pages
Scraped 1500 pages
Scraped 2000 pages
done scraping


In [2]:
import pandas as pd
coffee_df = pd.DataFrame(coffee_list)
coffee_df.head()

NameError: name 'coffee_list' is not defined

In [18]:
coffee_df.to_csv('coffee_df_test.csv', index=False)

In [28]:
import re
import pandas as pd

info_list = []

for all_text in coffee_df.index:
    text = coffee_df.loc[all_text].all_text

    # Updated regex patterns based on the structure of the text
    data_info = [
        re.findall(r'Roaster Location:\s*(.*)', text),
        re.findall(r'Coffee Origin:\s*(.*)', text),
        re.findall(r'Roast Level:\s*(.*)', text),
        re.findall(r'Est\.? Price:\s*(.*)', text),
        re.findall(r'Review Date:\s*(.*)', text),
        re.findall(r'Agtron:\s*(\d+/\d+)', text),  # Adjust for two-number format
        re.findall(r'Aroma:\s*(\d+)', text),
        re.findall(r'Acid.*?Structure:\s*(\d+)', text),  # Adjust for "Acidity/Structure"
        re.findall(r'Body:\s*(\d+)', text),
        re.findall(r'Flavor:\s*(\d+)', text),
        re.findall(r'Aftertaste:\s*(\d+)', text),
        re.findall(r'With Milk:\s*(\d+)?', text),
        re.findall(r'Blind Assessment\n(.*)', text),  # Captures everything after "Blind Assessment"
        re.findall(r'Notes\n(.*)', text),  # Captures everything after "Notes"
        re.findall(r'Bottom Line\s*(.*)', text)  # Captures everything after "Bottom Line"
    ]
    
    # Extract the first element or assign None if list is empty
    data_info = [item[0] if item else None for item in data_info]
    
    info_list.append(data_info)
    
# Convert list to DataFrame
info_df = pd.DataFrame(info_list, columns=['location', 'origin', 'roast', 'est_price', 
                                           'review_date', 'agtron', 'aroma', 'acid', 
                                           'body', 'flavor', 'aftertaste', 'with_milk', 
                                           'desc_1', 'desc_2', 'desc_3'])

# Display the first few rows
info_df.head()



Unnamed: 0,location,origin,roast,est_price,review_date,agtron,aroma,acid,body,flavor,aftertaste,with_milk,desc_1,desc_2,desc_3
0,"Madison, Wisconsin","Pitalito, Huila Department, Colombia",Light,$20.00/12 ounces,August 2024,63/85,9,9,9,9,8,,"Bright, balanced, juicy. Guava, dark chocolate...",Produced by smallholding members of the Asobom...,"A richly sweet-tart, high-toned Colombia Pink ..."
1,"Madison, Wisconsin","Puno, Peru",Medium-Light,$20.00/8 ounces,August 2024,58/80,9,9,8,9,8,,"Richly floral-toned, very sweet. Rhododendron,...","Produced by Wilson Sucaticona, entirely of the...",A washed Peru cup that offers chocolaty sweetn...
2,"Taipei, Taiwan","Boquete growing region, western Panama",Light,$90.00/4 ounces,August 2024,66/88,9,10,9,9,9,,"Intricate, delicate, laser-focused. Lychee, be...","Produced at Hacienda La Esmeralda, entirely of...",A vivid exemplar of Panama’s ongoing Geisha pr...
3,"Charlotte, North Carolina","Sabanilla de Alajuela, Central Valley, Costa Rica",Light,$28.00/12 ounces,August 2024,72/102,9,9,9,9,9,,"Very bright and juicy, tropical-leaning. Kiwi,...",Produced by Francisca and Oscar Chacon of Finc...,"A juicy, fruit-forward, floral-toned Costa Ric..."
4,"Madison, Wisconsin","Guji Zone, Oromia Region, southern Ethiopia",Light,$22.00/12 ounces,August 2024,62/86,9,9,8,9,8,,"Balanced, high-toned, multi-layered. Dried plu...","Produced by smallholding farmers, from trees o...",A savory throughline complicates this otherwis...


In [29]:
info_df.isnull().sum()

location          0
origin            0
roast             0
est_price         1
review_date       0
agtron           16
aroma            11
acid            293
body              0
flavor            0
aftertaste        1
with_milk      1966
desc_1            0
desc_2            0
desc_3            0
dtype: int64

In [30]:
df_coffee = pd.concat([coffee_df, info_df], axis =1)
df_coffee.head()

Unnamed: 0,slug,all_text,rating,roaster,name,location,origin,roast,est_price,review_date,agtron,aroma,acid,body,flavor,aftertaste,with_milk,desc_1,desc_2,desc_3
0,https://www.coffeereview.com/review/asobombo-c...,\n\n\n94\n\n\nJBC Coffee Roasters\nAsobombo C...,94,JBC Coffee Roasters,Asobombo Colombia,"Madison, Wisconsin","Pitalito, Huila Department, Colombia",Light,$20.00/12 ounces,August 2024,63/85,9,9,9,9,8,,"Bright, balanced, juicy. Guava, dark chocolate...",Produced by smallholding members of the Asobom...,"A richly sweet-tart, high-toned Colombia Pink ..."
1,https://www.coffeereview.com/review/alto-lagun...,\n\n\n93\n\n\nJBC Coffee Roasters\nAlto Lagun...,93,JBC Coffee Roasters,Alto Lagunillas by Wilson Sucaticona,"Madison, Wisconsin","Puno, Peru",Medium-Light,$20.00/8 ounces,August 2024,58/80,9,9,8,9,8,,"Richly floral-toned, very sweet. Rhododendron,...","Produced by Wilson Sucaticona, entirely of the...",A washed Peru cup that offers chocolaty sweetn...
2,https://www.coffeereview.com/review/panama-hac...,\n\n\n96\n\n\nEuphora Coffee\nPanama Hacienda...,96,Euphora Coffee,Panama Hacienda La Esmeralda Special Nano Geis...,"Taipei, Taiwan","Boquete growing region, western Panama",Light,$90.00/4 ounces,August 2024,66/88,9,10,9,9,9,,"Intricate, delicate, laser-focused. Lychee, be...","Produced at Hacienda La Esmeralda, entirely of...",A vivid exemplar of Panama’s ongoing Geisha pr...
3,https://www.coffeereview.com/review/costa-rica...,\n\n\n95\n\n\nMagnolia Coffee\nCosta Rica Las...,95,Magnolia Coffee,Costa Rica Las Lajas Black Diamond Bourbon,"Charlotte, North Carolina","Sabanilla de Alajuela, Central Valley, Costa Rica",Light,$28.00/12 ounces,August 2024,72/102,9,9,9,9,9,,"Very bright and juicy, tropical-leaning. Kiwi,...",Produced by Francisca and Oscar Chacon of Finc...,"A juicy, fruit-forward, floral-toned Costa Ric..."
4,https://www.coffeereview.com/review/biru-bekel...,\n\n\n93\n\n\nJBC Coffee Roasters\nBiru Bekel...,93,JBC Coffee Roasters,Biru Bekele Ethiopia,"Madison, Wisconsin","Guji Zone, Oromia Region, southern Ethiopia",Light,$22.00/12 ounces,August 2024,62/86,9,9,8,9,8,,"Balanced, high-toned, multi-layered. Dried plu...","Produced by smallholding farmers, from trees o...",A savory throughline complicates this otherwis...


In [31]:
df_coffee.to_csv('coffee_fix.csv', index=False)

In [32]:
df_coffee.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2265 entries, 0 to 2264
Data columns (total 20 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   slug         2265 non-null   object
 1   all_text     2265 non-null   object
 2   rating       2265 non-null   object
 3   roaster      2265 non-null   object
 4   name         2265 non-null   object
 5   location     2265 non-null   object
 6   origin       2265 non-null   object
 7   roast        2265 non-null   object
 8   est_price    2264 non-null   object
 9   review_date  2265 non-null   object
 10  agtron       2249 non-null   object
 11  aroma        2254 non-null   object
 12  acid         1972 non-null   object
 13  body         2265 non-null   object
 14  flavor       2265 non-null   object
 15  aftertaste   2264 non-null   object
 16  with_milk    299 non-null    object
 17  desc_1       2265 non-null   object
 18  desc_2       2265 non-null   object
 19  desc_3       2265 non-null 

In [33]:
df_coffee.isnull().sum()

slug              0
all_text          0
rating            0
roaster           0
name              0
location          0
origin            0
roast             0
est_price         1
review_date       0
agtron           16
aroma            11
acid            293
body              0
flavor            0
aftertaste        1
with_milk      1966
desc_1            0
desc_2            0
desc_3            0
dtype: int64