In [1]:
import json
import pandas as pd
import numpy as np
import scipy
import gzip

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from lightfm import LightFM
from scipy.sparse import coo_matrix
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k



In [2]:
dataset = []
file_path = 'ratebeer.json.gz'


with gzip.open(file_path, 'rt', encoding='utf-8') as f:
    for i, line in enumerate(f):


        # Fix single quotes to double quotes
        line = line.replace("'", '"')
        try:
            dataset.append(json.loads(line))
        except json.JSONDecodeError as e:
            continue

In [3]:
dataset

[{'beer/name': 'John Harvards Simcoe IPA',
  'beer/beerId': '63836',
  'beer/brewerId': '8481',
  'beer/ABV': '5.4',
  'beer/style': 'India Pale Ale &#40;IPA&#41;',
  'review/appearance': '4/5',
  'review/aroma': '6/10',
  'review/palate': '3/5',
  'review/taste': '6/10',
  'review/overall': '13/20',
  'review/time': '1157587200',
  'review/profileName': 'hopdog',
  'review/text': 'On tap at the Springfield, PA location. Poured a deep and cloudy orange (almost a copper) color with a small sized off white head. Aromas or oranges and all around citric. Tastes of oranges, light caramel and a very light grapefruit finish. I too would not believe the 80+ IBUs - I found this one to have a very light bitterness with a medium sweetness to it. Light lacing left on the glass.'},
 {'beer/name': 'John Harvards Simcoe IPA',
  'beer/beerId': '63836',
  'beer/brewerId': '8481',
  'beer/ABV': '5.4',
  'beer/style': 'India Pale Ale &#40;IPA&#41;',
  'review/appearance': '4/5',
  'review/aroma': '6/10',

In [29]:
df = pd.DataFrame(dataset)

In [30]:
def clean_rating(ratings):
    if type(ratings) == str:
        num = int(ratings.split('/')[0])
        den = int(ratings.split('/')[1])
        return num / den

In [31]:
top_4 = df['beer/style'].value_counts().index[:10].tolist()

In [32]:
df = df[df['beer/style'].isin(top_4)]

In [33]:
df['review/appearance'] = df['review/appearance'].apply(clean_rating)
df['review/aroma'] = df['review/aroma'].apply(clean_rating)
df['review/taste'] = df['review/taste'].apply(clean_rating)
df['review/overall'] = df['review/overall'].apply(clean_rating)
df['review/palate'] = df['review/palate'].apply(clean_rating)

In [44]:
train_thresh = int((len(df) * 0.8) // 1)
test_thresh = int(train_thresh + ((len(df) * 0.1) // 1))

In [45]:
train = df[:train_thresh]
test = df[train_thresh: test_thresh]
validate = df[test_thresh:]

In [46]:
train_X = train[['review/appearance', 'review/aroma', 'review/palate', 'review/taste', 'review/overall']]
train_y = train['beer/style']
test_X = test[['review/appearance', 'review/aroma', 'review/palate', 'review/taste', 'review/overall']]
test_y = test['beer/style']
validate_X = validate[['review/appearance', 'review/aroma', 'review/palate', 'review/taste', 'review/overall']]
validate_y = validate['beer/style']

In [47]:
model = LogisticRegression()
model.fit(train_X, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [48]:
preds = model.predict(test_X)

In [49]:
(test_y.values == preds).mean()

0.2538195878223657

In [36]:
tester = df[:100000]

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Step 1: Prepare the data
tester['beer/style'] = LabelEncoder().fit_transform(tester['beer/style'])  # Encode the target variable
tester['clean_text'] = tester['review/text'].str.lower().str.replace('[^\w\s]', '', regex=True)  # Clean text data

# Step 2: Vectorize the text
custom_stop_words = ['ipa', 'stout', 'belgian', 'lager', 'pale', 'india']  # Add words you want to exclude

vectorizer = TfidfVectorizer(max_features=1000, stop_words=custom_stop_words)  # Limit features for simplicity
text_features = vectorizer.fit_transform(tester['clean_text']).toarray()

# Step 3: Combine numerical features and text features
numerical_features = tester[['review/appearance', 'review/aroma', 'review/palate', 'review/taste', 'review/overall']]
combined_features = np.hstack((numerical_features, text_features))  # Combine arrays

# Step 4: Split data
X_train, X_test, y_train, y_test = train_test_split(combined_features, tester['beer/style'], test_size=0.2, random_state=42)

# Step 5: Train a model
model = LogisticRegression(max_iter=500)  # Increase max_iter to avoid convergence issues
model.fit(X_train, y_train)

# Step 6: Make predictions and evaluate
preds = model.predict(X_test)
accuracy = accuracy_score(y_test, preds)
print(f"Model Accuracy: {accuracy}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tester['beer/style'] = LabelEncoder().fit_transform(tester['beer/style'])  # Encode the target variable
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tester['clean_text'] = tester['review/text'].str.lower().str.replace('[^\w\s]', '', regex=True)  # Clean text data


Model Accuracy: 0.67595


In [43]:
vectorizer.get_feature_names_out()

array(['10', '11', '12', '12oz', '2003', '2004', '2005', '2006', '2007',
       '2008', '2009', '2010', '2011', '22', '22oz', '500ml', '610',
       '710', '750', '750ml', 'about', 'above', 'absolutely', 'abv',
       'acidic', 'acidity', 'across', 'actually', 'add', 'added', 'adds',
       'after', 'aftertaste', 'again', 'age', 'aged', 'aggressive',
       'aging', 'alcohol', 'alcoholic', 'ale', 'ales', 'all', 'almost',
       'along', 'also', 'although', 'always', 'am', 'amazing', 'amber',
       'american', 'amount', 'an', 'and', 'anise', 'another', 'any',
       'anything', 'apa', 'apparent', 'appearance', 'apple', 'apples',
       'apr', 'apricot', 'are', 'aroma', 'aromas', 'aromatic', 'around',
       'as', 'astringency', 'astringent', 'at', 'aug', 'avec', 'average',
       'away', 'awesome', 'back', 'backbone', 'background', 'backing',
       'bad', 'balance', 'balanced', 'banana', 'bar', 'barely', 'barley',
       'barleywine', 'barrel', 'base', 'batch', 'be', 'beautiful',
    

In [39]:
df['beer/style'].value_counts() / len(df)

beer/style
India Pale Ale &#40;IPA&#41;    0.166878
Pale Lager                      0.123847
Belgian Strong Ale              0.111227
Imperial Stout                  0.109810
Imperial/Double IPA             0.095116
American Pale Ale               0.091030
Porter                          0.084768
Barley Wine                     0.080410
Fruit Beer                      0.071251
Bitter                          0.065663
Name: count, dtype: float64

In [40]:
tester['beer/style'].value_counts() / len(tester)

beer/style
India Pale Ale &#40;IPA&#41;    0.20113
Belgian Strong Ale              0.14247
Imperial/Double IPA             0.12189
Imperial Stout                  0.09402
Porter                          0.09118
Bitter                          0.09057
American Pale Ale               0.08887
Barley Wine                     0.08886
Pale Lager                      0.04244
Fruit Beer                      0.03857
Name: count, dtype: float64

In [16]:
df

Unnamed: 0,beer/name,beer/beerId,beer/brewerId,beer/ABV,beer/style,review/appearance,review/aroma,review/palate,review/taste,review/overall,review/time,review/profileName,review/text
0,John Harvards Simcoe IPA,63836,8481,5.4,India Pale Ale &#40;IPA&#41;,0.8,0.6,0.6,0.6,0.65,1157587200,hopdog,"On tap at the Springfield, PA location. Poured..."
1,John Harvards Simcoe IPA,63836,8481,5.4,India Pale Ale &#40;IPA&#41;,0.8,0.6,0.8,0.7,0.65,1157241600,TomDecapolis,On tap at the John Harvards in Springfield PA....
9,John Harvards Yin Yang Lager,64126,8481,4.5,Pale Lager,0.6,0.6,0.6,0.6,0.60,1157587200,TomDecapolis,On tap at Springfield location. Pours a trans...
10,John Harvards Yin Yang Lager,64126,8481,4.5,Pale Lager,0.6,0.5,0.4,0.5,0.45,1157587200,hopdog,"On tap at the Springfield, PA location. Poured..."
109,Barley Island Barfly IPA,58511,3228,6.5,India Pale Ale &#40;IPA&#41;,0.6,0.7,0.8,0.7,0.85,1225238400,adamlangolf,Another quality session IPA. The lowest ABV o...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2783571,Library Bantam IPA,10477,1919,-,India Pale Ale &#40;IPA&#41;,0.6,0.6,0.6,0.6,0.60,1094601600,Aubrey,Clear and light golden; creamy head with nice ...
2783572,Library Bantam IPA,10477,1919,-,India Pale Ale &#40;IPA&#41;,0.6,0.6,0.6,0.5,0.55,1072051200,3fourths,Soft clear light yellow appearance. Smooth pa...
2783573,Library Bantam IPA,10477,1919,-,India Pale Ale &#40;IPA&#41;,0.8,0.7,0.8,0.9,0.95,1004745600,Dragon99,A nice hop profile. Very hoppy and a slight m...
2783626,Skytop McNutts Bitter Ale,86115,6102,-,India Pale Ale &#40;IPA&#41;,0.8,0.6,0.8,0.7,0.75,1207440000,coldbrewky,A seasonal offering from Skytop. Draft at the...
