In [10]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
%matplotlib inline

import math

In [11]:
# Load the raw review table from a CSV file in the working directory.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which
# is presumably a saved row index -- confirm before relying on it.
reviewsDF = pd.read_csv(filepath_or_buffer='beer_reviews.csv')

In [12]:
# Show the first five rows as a quick check of the loaded columns.
reviewsDF.head(n=5)

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [13]:
# Build the modelling frame from the raw reviews:
#   1. keep beers with more than MIN_REVIEWS reviews,
#   2. keep reviewers with more than MIN_REVIEWS reviews,
#   3. drop rows missing any rating or identifying field,
#   4. keep the feature columns plus the target and drop remaining NaNs,
#   5. floor the overall rating to an integer class label.
MIN_REVIEWS = 10  # strict lower bound on review count for both beers and reviewers

# Number of non-null overall ratings per beer, most-reviewed first.
reviews_per_beerid = (
    reviewsDF.groupby('beer_beerid')
    .review_overall.count()
    .to_frame('Reviews_count')
    .sort_values(by="Reviews_count", ascending=False)
)
filtered_reviews_per_beerid = reviews_per_beerid[reviews_per_beerid['Reviews_count'] > MIN_REVIEWS]

filteredReviewsDF_by_beer = reviewsDF[reviewsDF['beer_beerid'].isin(filtered_reviews_per_beerid.index)]

# Number of non-null overall ratings per reviewer. The counts are taken over
# the full reviewsDF, not over the beer-filtered subset.
reviews_per_user = (
    reviewsDF.groupby('review_profilename')
    .review_overall.count()
    .to_frame('Reviews_count')
    .sort_values(by="Reviews_count", ascending=False)
)
filtered_reviews_per_user = reviews_per_user[reviews_per_user['Reviews_count'] > MIN_REVIEWS]

filteredReviewsDF = filteredReviewsDF_by_beer[
    filteredReviewsDF_by_beer['review_profilename'].isin(filtered_reviews_per_user.index)
]

filteredReviewsDF = filteredReviewsDF.dropna(
    subset=['review_overall', 'review_aroma', 'review_appearance', 'review_profilename',
            'review_palate', 'review_taste', 'beer_name', 'beer_beerid']
)

# Keep only the model inputs and the target, then drop rows with any remaining
# NaN (beer_abv is not covered by the subset above). The explicit .copy() makes
# this an independent frame, so the column assignment below cannot raise
# SettingWithCopyWarning or write through to a parent frame.
filteredReviewsDF = (
    filteredReviewsDF[['review_aroma', 'review_appearance', 'beer_abv', 'review_overall']]
    .dropna()
    .copy()
)

# Vectorised floor instead of a per-row Python call; half-point ratings are
# rounded down to the integer class used as the ordinal target.
filteredReviewsDF['review_overall'] = np.floor(filteredReviewsDF['review_overall']).astype('int64')

In [14]:
from sklearn.model_selection import train_test_split

from mord import LogisticAT, LogisticIT, LAD, OrdinalRidge

In [15]:
# Split into train and test sets: every column except 'review_overall' is a
# feature, 'review_overall' is the target. 30% of the rows are held out, and
# the random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    filteredReviewsDF.drop(columns=['review_overall']),
    filteredReviewsDF['review_overall'],
    test_size=0.30,
    random_state=101,
)


In [16]:
# Fit mord's LogisticAT on the training split and predict the test split.
# NOTE(review): alpha=0 presumably switches off the regularisation penalty --
# confirm against the mord documentation.
model_AT = LogisticAT(alpha=0)
# pred_AT holds whatever fit() returns; despite the name it is used as the
# fitted estimator (presumably fit() returns the model itself -- confirm).
pred_AT = model_AT.fit(X_train, y_train)
# Predicted labels for X_test.
results_AT = pred_AT.predict(X_test)

In [17]:
# Fit mord's LogisticIT on the training split and predict the test split.
# NOTE(review): alpha=0 presumably switches off the regularisation penalty --
# confirm against the mord documentation.
model_IT = LogisticIT(alpha=0)
# pred_IT holds whatever fit() returns; despite the name it is used as the
# fitted estimator (presumably fit() returns the model itself -- confirm).
pred_IT = model_IT.fit(X_train, y_train)
# Predicted labels for X_test.
results_IT = pred_IT.predict(X_test)

In [18]:
# Fit mord's LAD model with its default settings on the training split and
# predict the test split.
model_LAD = LAD()
# pred_LAD holds whatever fit() returns; despite the name it is used as the
# fitted estimator (presumably fit() returns the model itself -- confirm).
pred_LAD = model_LAD.fit(X_train, y_train)
# Predicted labels for X_test.
results_LAD = pred_LAD.predict(X_test)



In [19]:
# Fit mord's OrdinalRidge on the training split and predict the test split.
# NOTE(review): alpha=0 presumably switches off the ridge penalty -- confirm
# against the mord documentation.
model_Ordinal_Ridge = OrdinalRidge(alpha=0)
# pred_Ordinal holds whatever fit() returns; despite the name it is used as the
# fitted estimator (presumably fit() returns the model itself -- confirm).
pred_Ordinal = model_Ordinal_Ridge.fit(X_train, y_train)
# Predicted labels for X_test.
results_Ordinal = pred_Ordinal.predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [21]:
# Share of test rows whose predicted class exactly matches the true class.
print(
    'LogisticAT: ',
    accuracy_score(y_true=y_test, y_pred=results_AT),
)

LogisticAT:  0.6513121051649097


In [22]:
# Share of test rows whose predicted class exactly matches the true class.
print(
    'LogisticIT: ',
    accuracy_score(y_true=y_test, y_pred=results_IT),
)

LogisticIT:  0.6512040321075229


In [23]:
# Share of test rows whose predicted class exactly matches the true class.
print(
    'LAD: ',
    accuracy_score(y_true=y_test, y_pred=results_LAD),
)


LAD:  0.6489909906369433


In [24]:
# Share of test rows whose predicted class exactly matches the true class.
print(
    'OrdinalRidge: ',
    accuracy_score(y_true=y_test, y_pred=results_Ordinal),
)

OrdinalRidge:  0.647143923837969


In [25]:
from sklearn.metrics import confusion_matrix


In [26]:
# Confusion matrix for LogisticAT: rows are true classes, columns are
# predicted classes.
print(
    'LogisticAT: ',
    confusion_matrix(y_true=y_test, y_pred=results_AT),
)

LogisticAT:  [[   915   2449   1587    468      0]
 [   287   5187  13107   4717      0]
 [    96   2843  49119  66594      0]
 [    24    858  26255 209942      3]
 [    17    100   1028  21529      7]]


In [27]:
# Confusion matrix for LogisticIT: rows are true classes, columns are
# predicted classes.
print(
    'LogisticIT: ',
    confusion_matrix(y_true=y_test, y_pred=results_IT),
)

LogisticIT:  [[  1708   1707   1486    518      0]
 [   985   4655  12473   5185      0]
 [   343   2716  44332  71261      0]
 [    92    819  21741 214430      0]
 [    34     85    925  21636      1]]


In [28]:
# Confusion matrix for LAD: rows are true classes, columns are predicted
# classes.
print(
    'LAD: ',
    confusion_matrix(y_true=y_test, y_pred=results_LAD),
)


LAD:  [[    16   2690   2270    441      2]
 [     0   3266  15662   4355     15]
 [     0   1388  56853  60313     98]
 [     0    417  33133 202369   1163]
 [     0     75   1242  19643   1721]]


In [29]:
# Confusion matrix for OrdinalRidge: rows are true classes, columns are
# predicted classes.
print(
    'OrdinalRidge: ',
    confusion_matrix(y_true=y_test, y_pred=results_Ordinal),
)

OrdinalRidge:  [[   180   2823   2061    353      2]
 [    25   3987  15723   3554      9]
 [     5   1805  64275  52509     58]
 [     2    530  41953 193875    722]
 [     3     83   1509  19930   1156]]
