In [25]:
import json
import pandas as pd
import numpy as np
import scipy
import gzip

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from lightfm import LightFM
from scipy.sparse import coo_matrix
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
dataset = []
file_path = 'ratebeer.json.gz'


with gzip.open(file_path, 'rt', encoding='utf-8') as f:
    for i, line in enumerate(f):


        # Fix single quotes to double quotes
        line = line.replace("'", '"')
        try:
            dataset.append(json.loads(line))
        except json.JSONDecodeError as e:
            continue

In [80]:
df = pd.DataFrame(dataset)

In [81]:
def clean_rating(ratings):
    if type(ratings) == str:
        num = int(ratings.split('/')[0])
        den = int(ratings.split('/')[1])
        return num / den

In [82]:
def clean_abv(abv):
    if abv.isnumeric():
        return float(abv)
    else:
        return np.nan()

In [83]:
df['review/appearance'] = df['review/appearance'].apply(clean_rating)
df['review/aroma'] = df['review/aroma'].apply(clean_rating)
df['review/taste'] = df['review/taste'].apply(clean_rating)
df['review/overall'] = df['review/overall'].apply(clean_rating)
df['review/palate'] = df['review/palate'].apply(clean_rating)
df['beer/ABV'] = pd.to_numeric(df['beer/ABV'].replace('-', np.nan), errors='coerce')

In [84]:
top_4 = df['beer/style'].value_counts().index[:4].tolist()
top_10 = df['beer/style'].value_counts().index[:10].tolist()

In [85]:
top4_df = df[df['beer/style'].isin(top_4)]
top10_df = df[df['beer/style'].isin(top_10)]

In [86]:
train_thresh4 = int((len(top4_df) * 0.8) // 1)

train_thresh10 = int((len(top10_df) * 0.8) // 1)

In [87]:
train4 = top4_df[:train_thresh4]
test4 = top4_df[train_thresh4:]

train10 = top10_df[:train_thresh10]
test10 = top10_df[train_thresh10:]

In [88]:
train_X4 = train4[['review/appearance', 'review/aroma', 'review/palate', 'review/taste', 'review/overall']]
train_y4 = train4['beer/style']
test_X4 = test4[['review/appearance', 'review/aroma', 'review/palate', 'review/taste', 'review/overall']]
test_y4 = test4['beer/style']

train_X10 = train10[['review/appearance', 'review/aroma', 'review/palate', 'review/taste', 'review/overall']]
train_y10 = train10['beer/style']
test_X10 = test10[['review/appearance', 'review/aroma', 'review/palate', 'review/taste', 'review/overall']]
test_y10 = test10['beer/style']

In [89]:
model4 = LogisticRegression()
model4.fit(train_X4, train_y4)

model10 = LogisticRegression()
model10.fit(train_X10, train_y10)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [90]:
preds4 = model4.predict(test_X4)

preds10 = model10.predict(test_X10)

In [91]:
print(f"Accuracy for model 4: {(test_y4.values == preds4).mean()}")

print(f"Accuracy for model 10: {(test_y10.values == preds10).mean()}")

Accuracy for model 4: 0.5329164527250055
Accuracy for model 10: 0.26604071060559137


# Baseline with ABV

In [100]:
train4 = train4.dropna(subset = ['beer/ABV'])
test4 = test4.dropna(subset = ['beer/ABV'])
train10 = train10.dropna(subset = ['beer/ABV'])
test10 = test10.dropna(subset = ['beer/ABV'])

In [101]:
train_X4 = train4[['review/appearance', 'review/aroma', 'review/palate', 'review/taste', 'review/overall', 'beer/ABV']]
train_y4 = train4['beer/style']
test_X4 = test4[['review/appearance', 'review/aroma', 'review/palate', 'review/taste', 'review/overall', 'beer/ABV']]
test_y4 = test4['beer/style']

train_X10 = train10[['review/appearance', 'review/aroma', 'review/palate', 'review/taste', 'review/overall', 'beer/ABV']]
train_y10 = train10['beer/style']
test_X10 = test10[['review/appearance', 'review/aroma', 'review/palate', 'review/taste', 'review/overall', 'beer/ABV']]
test_y10 = test10['beer/style']

In [102]:
model4 = LogisticRegression()
model4.fit(train_X4, train_y4)

model10 = LogisticRegression()
model10.fit(train_X10, train_y10)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [103]:
preds4 = model4.predict(test_X4)

preds10 = model10.predict(test_X10)

In [104]:
print(f"Accuracy for model 4: {(test_y4.values == preds4).mean()}")

print(f"Accuracy for model 10: {(test_y10.values == preds10).mean()}")

Accuracy for model 4: 0.7906065060876777
Accuracy for model 10: 0.4623434601113173


# Refined Model

In [21]:
def stratified_sample(df, group_col, n_samples):
    # Calculate the proportion of each group in the dataset
    proportions = df[group_col].value_counts(normalize=True)
    
    # Determine the number of samples for each group based on the proportions
    sample_counts = (proportions * n_samples).round().astype(int)
    
    # Sample rows for each group
    sampled_df = pd.concat([
        df[df[group_col] == group].sample(n=min(count, len(df[df[group_col] == group])), random_state=42)
        for group, count in sample_counts.items()
    ])
    
    return sampled_df

In [24]:
top4_sampled = stratified_sample(top4_df, group_col="beer/style", n_samples=200000)
top10_sampled = stratified_sample(top10_df, group_col="beer/style", n_samples=200000)

In [41]:
custom_stop_words = ['ipa', 'stout', 'belgian', 'lager', 'pale', 'india',
                    'ipas', 'stouts', 'lagers', 'pales','ale', 'ales', 'dipa',
                    'dipas']  # Add words you want to exclude

# Top 4 Model

In [37]:
# Step 1: Prepare the data
top4_sampled['beer/style'] = LabelEncoder().fit_transform(top4_sampled['beer/style'])  # Encode the target variable
top4_sampled['clean_text'] = top4_sampled['review/text'].str.lower().str.replace('[^\w\s]', '', regex=True)  # Clean text data

vectorizer4 = TfidfVectorizer(max_features=100, stop_words=custom_stop_words)  # Limit features for simplicity
text_features4 = vectorizer4.fit_transform(top4_sampled['clean_text']).toarray()

# Step 3: Combine numerical features and text features
numerical_features4 = top4_sampled[['review/appearance', 'review/aroma', 'review/palate', 'review/taste', 'review/overall']]
combined_features4 = np.hstack((numerical_features4, text_features4))  # Combine arrays

# Step 4: Split data
X_train4, X_test4, y_train4, y_test4 = train_test_split(combined_features4, top4_sampled['beer/style'], test_size=0.2, random_state=42)

# Step 5: Train a model
model4 = LogisticRegression(max_iter=500)  # Increase max_iter to avoid convergence issues
model4.fit(X_train4, y_train4)

# Step 6: Make predictions and evaluate
preds4 = model4.predict(X_test4)
accuracy4 = accuracy_score(y_test4, preds4)
print(f"Model Accuracy: {accuracy4}")

Model Accuracy: 0.84695


In [38]:
vectorizer4.get_feature_names_out()

array(['alcohol', 'all', 'amber', 'an', 'and', 'aroma', 'as', 'at', 'be',
       'beer', 'big', 'bit', 'bitter', 'bitterness', 'black', 'body',
       'bottle', 'brown', 'but', 'by', 'caramel', 'carbonation',
       'chocolate', 'citrus', 'clear', 'coffee', 'color', 'creamy',
       'dark', 'dry', 'finish', 'flavor', 'for', 'from', 'fruit',
       'fruity', 'golden', 'good', 'great', 'had', 'has', 'have', 'head',
       'hop', 'hoppy', 'hops', 'in', 'is', 'it', 'its', 'just', 'lacing',
       'light', 'like', 'little', 'malt', 'malts', 'malty', 'medium',
       'more', 'mouthfeel', 'much', 'my', 'nice', 'no', 'nose', 'not',
       'notes', 'of', 'on', 'one', 'orange', 'palate', 'pine', 'pours',
       'quite', 'really', 'roasted', 'slightly', 'small', 'smooth',
       'some', 'strong', 'sweet', 'taste', 'than', 'that', 'the', 'there',
       'thick', 'thin', 'this', 'to', 'too', 'very', 'was', 'well',
       'white', 'with', 'you'], dtype=object)

# Top 10 Model

In [39]:
# Step 1: Prepare the data
top10_sampled['beer/style'] = LabelEncoder().fit_transform(top10_sampled['beer/style'])  # Encode the target variable
top10_sampled['clean_text'] = top10_sampled['review/text'].str.lower().str.replace('[^\w\s]', '', regex=True)  # Clean text data

vectorizer10 = TfidfVectorizer(max_features=100, stop_words=custom_stop_words)  # Limit features for simplicity
text_features10 = vectorizer10.fit_transform(top10_sampled['clean_text']).toarray()

# Step 3: Combine numerical features and text features
numerical_features10 = top10_sampled[['review/appearance', 'review/aroma', 'review/palate', 'review/taste', 'review/overall']]
combined_features10 = np.hstack((numerical_features10, text_features10))  # Combine arrays

# Step 4: Split data
X_train10, X_test10, y_train10, y_test10 = train_test_split(combined_features10, top10_sampled['beer/style'], test_size=0.2, random_state=42)

# Step 5: Train a model
model10 = LogisticRegression(max_iter=500)  # Increase max_iter to avoid convergence issues
model10.fit(X_train10, y_train10)

# Step 6: Make predictions and evaluate
preds10 = model10.predict(X_test10)
accuracy10 = accuracy_score(y_test10, preds10)
print(f"Model Accuracy: {accuracy10}")

Model Accuracy: 0.53925


In [40]:
vectorizer10.get_feature_names_out()

array(['alcohol', 'all', 'amber', 'an', 'and', 'are', 'aroma', 'as', 'at',
       'be', 'beer', 'big', 'bit', 'bitter', 'bitterness', 'black',
       'body', 'bottle', 'brown', 'but', 'by', 'caramel', 'carbonation',
       'chocolate', 'citrus', 'clear', 'coffee', 'color', 'creamy',
       'dark', 'dry', 'finish', 'flavor', 'for', 'from', 'fruit',
       'fruity', 'golden', 'good', 'grapefruit', 'great', 'had', 'has',
       'head', 'hop', 'hoppy', 'hops', 'in', 'is', 'it', 'its', 'just',
       'lacing', 'light', 'like', 'little', 'malt', 'malts', 'malty',
       'medium', 'more', 'mouthfeel', 'much', 'my', 'nice', 'no', 'nose',
       'not', 'notes', 'of', 'on', 'one', 'orange', 'palate', 'pine',
       'pours', 'quite', 'really', 'roasted', 'slightly', 'small',
       'smooth', 'some', 'strong', 'sweet', 'sweetness', 'taste', 'than',
       'that', 'the', 'there', 'thin', 'this', 'to', 'too', 'very', 'was',
       'well', 'white', 'with'], dtype=object)