In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
%matplotlib inline

import math

In [2]:
reviewsDF = pd.read_csv('beer_reviews.csv')

In [3]:
reviewsDF.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [4]:
reviews_per_beerid = reviewsDF.groupby('beer_beerid').review_overall.count().to_frame('Reviews_count').sort_values(by = "Reviews_count", ascending=False)

In [5]:
filtered_reviews_per_beerid = reviews_per_beerid[reviews_per_beerid['Reviews_count'] > 10]
filtered_reviews_per_beerid.index

Int64Index([ 2093,   412,  1904,  1093,  4083,    92,   276,    88,  7971,
            11757,
            ...
            38012, 62790, 66563, 32748,  2735, 13375, 13966,  2555, 14962,
            69702],
           dtype='int64', name='beer_beerid', length=13389)

In [6]:
filteredReviewsDF_by_beer = reviewsDF[reviewsDF['beer_beerid'].isin(filtered_reviews_per_beerid.index)]

In [7]:
reviews_per_user = reviewsDF.groupby('review_profilename').review_overall.count().to_frame('Reviews_count').sort_values(by = "Reviews_count", ascending=False)
filtered_reviews_per_user = reviews_per_user[reviews_per_user['Reviews_count'] > 10]
filtered_reviews_per_user.head()

Unnamed: 0_level_0,Reviews_count
review_profilename,Unnamed: 1_level_1
northyorksammy,5817
BuckeyeNation,4661
mikesgroove,4617
Thorpe429,3518
womencantsail,3497


In [8]:
filteredReviewsDF = filteredReviewsDF_by_beer[filteredReviewsDF_by_beer['review_profilename'].isin(filtered_reviews_per_user.index)]

In [9]:
filteredReviewsDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1387322 entries, 10 to 1586613
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   brewery_id          1387322 non-null  int64  
 1   brewery_name        1387322 non-null  object 
 2   review_time         1387322 non-null  int64  
 3   review_overall      1387322 non-null  float64
 4   review_aroma        1387322 non-null  float64
 5   review_appearance   1387322 non-null  float64
 6   review_profilename  1387322 non-null  object 
 7   beer_style          1387322 non-null  object 
 8   review_palate       1387322 non-null  float64
 9   review_taste        1387322 non-null  float64
 10  beer_name           1387322 non-null  object 
 11  beer_abv            1357104 non-null  float64
 12  beer_beerid         1387322 non-null  int64  
dtypes: float64(6), int64(3), object(4)
memory usage: 148.2+ MB


In [10]:
filteredReviewsDF = filteredReviewsDF.dropna(subset=['review_overall', 'review_aroma', 'review_appearance', 'review_profilename', 'review_palate', 'review_taste', 'beer_name', 'beer_beerid'])

In [28]:
filteredReviewsDF = filteredReviewsDF[['review_aroma', 'review_appearance', 'beer_abv', 'review_overall']]

In [46]:
filteredReviewsDF['review_overall'] = filteredReviewsDF['review_overall'].apply(lambda x: math.floor(x))
filteredReviewsDF.dropna(inplace=True)

In [47]:
from sklearn.model_selection import train_test_split

In [48]:
X_train, X_test, y_train, y_test = train_test_split(filteredReviewsDF.drop('review_overall',axis=1), 
                                                    filteredReviewsDF['review_overall'], test_size=0.30, 
                                                    random_state=101)

In [49]:
from mord import LogisticAT


In [50]:
model_ordinal = LogisticAT(alpha=0)

In [52]:
pred = model_ordinal.fit(X_train, y_train)

In [56]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer

In [57]:
MAE = make_scorer(mean_absolute_error)
folds = 5


In [59]:
MAE_ordinal = cross_val_score(model_ordinal,
    X_train,
    y_train,
    cv=folds,
    scoring=MAE)
print('Ordered logistic regression: ', np.mean(MAE_ordinal))

Ordered logistic regression:  0.37005617034610355


In [61]:
from sklearn.metrics import accuracy_score

def acc_fun(target_true, target_fit):
    target_fit = np.round(target_fit)
    target_fit.astype('int')
    return accuracy_score(target_true, target_fit)

acc = make_scorer(acc_fun)
folds = 5

acc_ordinal = cross_val_score(model_ordinal,
    X_train,
    y_train,
    cv=folds,
    scoring=acc)
print('Ordered logistic regression: ', np.mean(acc_ordinal))

Ordered logistic regression:  0.6529866141782281


In [62]:
MAE_ordinal = cross_val_score(model_ordinal,
    X_test,
    y_test,
    scoring=MAE)
print('Ordered logistic regression: ', np.mean(MAE_ordinal))

Ordered logistic regression:  0.3720267639967607
