In [1]:
import pandas as pd
from tqdm import tqdm
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
beer_reviews = pd.read_csv("../data/beer_reviews.csv")

In [3]:
# Drop rows with missing values
beer_reviews = beer_reviews.dropna()
beer_reviews.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [4]:
# Group the reviews based on ´review_profilename´ and sort them by ´review_time´
grouped_reviews = beer_reviews.sort_values(
    by="review_time").groupby("review_profilename")
grouped_reviews.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
564601,33,Berkshire Brewing Company Inc.,840672001,4.0,3.5,3.5,Todd,American Pale Ale (APA),4.0,4.0,Steel Rail Extra Pale Ale,5.3,93
286273,35,Boston Beer Company (Samuel Adams),884390401,4.0,4.0,3.0,Todd,American Strong Ale,4.5,4.5,Samuel Adams Triple Bock,17.5,111
764128,144,Sprecher Brewing Company,884649601,4.5,4.0,4.0,BeerAdvocate,Vienna Lager,4.0,4.0,Special Amber,5.0,97
1417077,139,Shipyard Brewing Co.,885340801,4.0,3.5,3.0,BeerAdvocate,English Pale Ale,3.5,4.0,Tremont Ale,4.8,51
78405,140,Sierra Nevada Brewing Co.,886723201,4.5,4.0,4.0,BeerAdvocate,American Barleywine,4.0,4.5,Sierra Nevada Bigfoot Barleywine Style Ale,9.6,2671
...,...,...,...,...,...,...,...,...,...,...,...,...,...
74285,140,Sierra Nevada Brewing Co.,1326269375,4.0,4.0,4.0,GardenWaters,American Stout,4.0,4.5,Sierra Nevada Stout,5.8,283
852288,417,August Schell Brewing Company,1326272107,2.0,2.0,2.0,libbey,American Adjunct Lager,2.0,2.5,Schell's Deer Brand,4.8,4886
989617,859,Snoqualmie Falls Brewing Company & Taproom,1326273106,3.0,3.0,4.5,libbey,American Double / Imperial IPA,3.5,3.0,Plant 1 Powerhouse IPA,7.5,69614
562035,423,Boulevard Brewing Co.,1326274454,4.5,4.0,4.0,libbey,Quadrupel (Quad),4.5,4.5,The Sixth Glass,10.5,39621


In [5]:
# Per user, find a sequence of beers that leads up to the highest rated beer and save it for each user as a dataframe
users = []

for user_name, reviews_df in grouped_reviews:
    # Sort the reviews by time (review_time)
    reviews_df = reviews_df.sort_values(by="review_time")

    # Find the index of the highest rated beer
    max_index = reviews_df["review_overall"].idxmax()

    # Create a dataframe with all the beers up to the highest rated beer
    user_df = reviews_df.iloc[:max_index + 1]

    # Add the dataframe to the list
    users.append(user_df)


In [6]:
# Print the max and average length of the sequences
max_length = 0
total_length = 0
for user in users:
    length = len(user)
    total_length += length
    if length > max_length:
        max_length = length

print("Max length: ", max_length)
print("Average length: ", total_length / len(users))


Max length:  5346
Average length:  46.14312629147928


In [7]:
# Trim the sequences to a maximum length of 100
max_length = 100
for i, user in enumerate(users):
    if len(user) > max_length:
        users[i] = user.iloc[-max_length:]

# Remove the users with sequences shorter than 3
min_length = 3
users = [user for user in users if len(user) >= min_length]

# Print the max and average length of the sequences
max_length = 0
total_length = 0
for user in users:
    length = len(user)
    total_length += length
    if length > max_length:
        max_length = length

print("Max length: ", max_length)
print("Average length: ", total_length / len(users))


Max length:  100
Average length:  31.95968175465004


In [8]:
# Print the total number of reviews
total_reviews = 0
for user in users:
    total_reviews += len(user)

print("Total number of reviews: ", total_reviews)

Total number of reviews:  594514


In [9]:
# Nomralize the scales of review_overall review_aroma review_appearance review_taste review_palate from 0-5 to 0-1
for user in users:
    user["review_overall"] = user["review_overall"] / 5
    user["review_aroma"] = user["review_aroma"] / 5
    user["review_appearance"] = user["review_appearance"] / 5
    user["review_taste"] = user["review_taste"] / 5
    user["review_palate"] = user["review_palate"] / 5


In [10]:
# Normalize the scale of beer_abv from 0-100 to 0-1
for user in users:
    user["beer_abv"] = user["beer_abv"] / 100


In [11]:
print(users[0].head(1))

        brewery_id           brewery_name  review_time  review_overall  \
246563         215  Bières de Chimay S.A.   1220593164             1.0   

        review_aroma  review_appearance review_profilename  \
246563           0.8                1.0           0110x011   

                     beer_style  review_palate  review_taste  \
246563  Belgian Strong Dark Ale            0.9           0.8   

                           beer_name  beer_abv  beer_beerid  
246563  Chimay Grande Réserve (Blue)      0.09         2512  


In [15]:
# Encode the beer names to integers
beer_styles = beer_reviews["beer_style"].unique()
print("Number of unique beers: ", len(beer_styles))

beer_style_to_int = {}
int_to_beer_style = {}

for i, beer_style in enumerate(beer_styles):
    beer_style_to_int[beer_style] = i
    int_to_beer_style[i] = beer_style

# TODO: USE ONE-HOT TO LAZY TO INTE ENDCODE WITH HIRACHY


Number of unique beers:  104
        brewery_id           brewery_name  review_time  review_overall  \
246563         215  Bières de Chimay S.A.   1220593164             1.0   

        review_aroma  review_appearance review_profilename  beer_style  \
246563           0.8                1.0           0110x011          12   

        review_palate  review_taste                     beer_name  beer_abv  \
246563            0.9           0.8  Chimay Grande Réserve (Blue)      0.09   

        beer_beerid  
246563         2512  
