In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
%matplotlib inline

import math

In [3]:
reviewsDF = pd.read_csv('beer_reviews.csv')

In [4]:
reviewsDF.head()
reviewsDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   brewery_id          1586614 non-null  int64  
 1   brewery_name        1586599 non-null  object 
 2   review_time         1586614 non-null  int64  
 3   review_overall      1586614 non-null  float64
 4   review_aroma        1586614 non-null  float64
 5   review_appearance   1586614 non-null  float64
 6   review_profilename  1586266 non-null  object 
 7   beer_style          1586614 non-null  object 
 8   review_palate       1586614 non-null  float64
 9   review_taste        1586614 non-null  float64
 10  beer_name           1586614 non-null  object 
 11  beer_abv            1518829 non-null  float64
 12  beer_beerid         1586614 non-null  int64  
dtypes: float64(6), int64(3), object(4)
memory usage: 157.4+ MB


In [5]:
reviews_per_beerid = reviewsDF.groupby('beer_beerid').review_overall.count().to_frame('Reviews_count').sort_values(by = "Reviews_count", ascending=False)

In [6]:
filtered_reviews_per_beerid = reviews_per_beerid[reviews_per_beerid['Reviews_count'] > 10]

In [7]:
filteredReviewsDF_by_beer = reviewsDF[reviewsDF['beer_beerid'].isin(filtered_reviews_per_beerid.index)]

In [8]:
reviews_per_user = reviewsDF.groupby('review_profilename').review_overall.count().to_frame('Reviews_count').sort_values(by = "Reviews_count", ascending=False)
filtered_reviews_per_user = reviews_per_user[reviews_per_user['Reviews_count'] > 10]
filtered_reviews_per_user.head()

Unnamed: 0_level_0,Reviews_count
review_profilename,Unnamed: 1_level_1
northyorksammy,5817
BuckeyeNation,4661
mikesgroove,4617
Thorpe429,3518
womencantsail,3497


In [9]:
filteredReviewsDF = filteredReviewsDF_by_beer[filteredReviewsDF_by_beer['review_profilename'].isin(filtered_reviews_per_user.index)]

In [10]:
filteredReviewsDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1387322 entries, 10 to 1586613
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   brewery_id          1387322 non-null  int64  
 1   brewery_name        1387322 non-null  object 
 2   review_time         1387322 non-null  int64  
 3   review_overall      1387322 non-null  float64
 4   review_aroma        1387322 non-null  float64
 5   review_appearance   1387322 non-null  float64
 6   review_profilename  1387322 non-null  object 
 7   beer_style          1387322 non-null  object 
 8   review_palate       1387322 non-null  float64
 9   review_taste        1387322 non-null  float64
 10  beer_name           1387322 non-null  object 
 11  beer_abv            1357104 non-null  float64
 12  beer_beerid         1387322 non-null  int64  
dtypes: float64(6), int64(3), object(4)
memory usage: 148.2+ MB


In [11]:
filteredReviewsDF = filteredReviewsDF.dropna(subset=['review_overall', 'review_aroma', 'review_appearance', 'review_profilename', 'review_palate', 'review_taste', 'beer_name', 'beer_beerid'])

In [12]:
filteredReviewsDF = filteredReviewsDF[['review_aroma', 'review_appearance', 'beer_abv', 'review_overall']]

In [13]:
filteredReviewsDF['review_overall'] = filteredReviewsDF['review_overall'].apply(lambda x: math.floor(x))
filteredReviewsDF.dropna(inplace=True)

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(filteredReviewsDF.drop('review_overall',axis=1), 
                                                    filteredReviewsDF['review_overall'], test_size=0.30, 
                                                    random_state=101)

In [16]:
from mord import LogisticAT


In [17]:
model_ordinal = LogisticAT(alpha=0)

In [18]:
pred = model_ordinal.fit(X_train, y_train)

In [19]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer

In [20]:
MAE = make_scorer(mean_absolute_error)
folds = 5


In [21]:
MAE_ordinal = cross_val_score(model_ordinal,
    X_train,
    y_train,
    cv=folds,
    scoring=MAE)
print('Ordered logistic regression: ', np.mean(MAE_ordinal))

Ordered logistic regression:  0.37005617034610355


In [22]:
from sklearn.metrics import accuracy_score

def acc_fun(target_true, target_fit):
    target_fit = np.round(target_fit)
    target_fit.astype('int')
    return accuracy_score(target_true, target_fit)

acc = make_scorer(acc_fun)
folds = 5

acc_ordinal = cross_val_score(model_ordinal,
    X_train,
    y_train,
    cv=folds,
    scoring=acc)
print('Ordered logistic regression: ', np.mean(acc_ordinal))

Ordered logistic regression:  0.6529866141782281


In [23]:
MAE_ordinal = cross_val_score(model_ordinal,
    X_test,
    y_test,
    scoring=MAE)
print('Ordered logistic regression: ', np.mean(MAE_ordinal))

Ordered logistic regression:  0.3720267639967607
