In [1]:
# import libraries

import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import config

In [2]:
# SQLAlchemy database connection
#
# Credentials and host come from the local `config` module so they stay out of the notebook.
# The URL scheme must be `postgresql://`: the older `postgres://` alias was deprecated and
# is rejected by SQLAlchemy 1.4+ (NoSuchModuleError: sqlalchemy.dialects:postgres).

database = f'postgresql://{config.user}:{config.db_password}@{config.host}/postgres'
engine = create_engine(database)

In [3]:
# Load every beer together with its brewery (beers LEFT JOIN breweries on brewery_id).
# Note: SELECT * pulls brewery_id from both tables, so that column name appears twice in the frame.

joined_query = '''SELECT *
                    FROM beers as be
                    LEFT JOIN breweries as br
                    ON (be.brewery_id = br.brewery_id)
                    ;'''
beer_db = pd.read_sql_query(joined_query, con=engine)
beer_db

Unnamed: 0,abv,ibu,beer_id,beer_name,style,style_group,brewery_id,ounces,brewery_id.1,brewery_name,city,state
0,0.061,60.0,1979,Bitter Bitch,American Pale Ale (APA),American Pale Ale (APA),177,12.0,177,18th Street Brewery,Gary,IN
1,0.099,92.0,1036,Lower De Boom,American Barleywine,other,368,8.4,368,21st Amendment Brewery,San Francisco,CA
2,0.079,45.0,1024,Fireside Chat,Winter Warmer,other,368,12.0,368,21st Amendment Brewery,San Francisco,CA
3,0.044,42.0,876,Bitter American,American Pale Ale (APA),American Pale Ale (APA),368,12.0,368,21st Amendment Brewery,San Francisco,CA
4,0.049,17.0,802,Hell or High Watermelon Wheat (2009),Fruit / Vegetable Beer,Fruit / Vegetable Beer,368,12.0,368,21st Amendment Brewery,San Francisco,CA
...,...,...,...,...,...,...,...,...,...,...,...,...
1321,0.077,30.0,1513,Lights Out Vanilla Cream Extra Stout,American Double / Imperial IPA,American Double / Imperial IPA,199,12.0,199,Worthy Brewing Company,Bend,OR
1322,0.069,69.0,1512,Worthy IPA (2013),American IPA,American IPA,199,12.0,199,Worthy Brewing Company,Bend,OR
1323,0.060,50.0,1511,Worthy Pale,American Pale Ale (APA),American Pale Ale (APA),199,12.0,199,Worthy Brewing Company,Bend,OR
1324,0.067,45.0,928,Belgorado,Belgian IPA,other,424,12.0,424,Wynkoop Brewing Company,Denver,CO


In [5]:
# Separate the dataset into features (X) and target (y), then hold out a test set

from sklearn.model_selection import train_test_split

feature_columns = ['abv', 'ibu']
X = beer_db[feature_columns]   # Features
y = beer_db['style_group']     # Target

# No test_size is given, so the library default proportions apply;
# the fixed random_state keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=48)

In [6]:
# Min-max scale the features using a scaler fitted on the training split only,
# so no information from the test split leaks into the scaling parameters.

from sklearn.preprocessing import MinMaxScaler

# Use the default copy=True: with copy=False the scaler is allowed to rescale its
# input in place, which could silently overwrite X_train / X_test while the model
# cells below still read them as the raw, unscaled features.
scaler = MinMaxScaler()
X_scaler = scaler.fit(X_train)  # fit() returns the scaler itself; name kept for later cells
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train

1242                       other
1156                       other
713     American Amber / Red Ale
200              American Porter
671                        other
                  ...           
454                 American IPA
966                        other
944                        other
347                        other
563                        other
Name: style_group, Length: 994, dtype: object

In [7]:
# Random forest classifier: fit on the training split, then label the test split

from sklearn.ensemble import RandomForestClassifier

# Trained on X_train as-is (the X_*_scaled arrays from the scaling cell are not used here)
rfc = RandomForestClassifier(n_estimators=100, random_state=48).fit(X_train, y_train)

# Predicted style_group for each held-out beer
y_pred = rfc.predict(X_test)

In [8]:
# Evaluate the predictions: per-class precision / recall / F1, then overall accuracy

from sklearn import metrics
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

                                precision    recall  f1-score   support

      American Amber / Red Ale       0.31      0.25      0.28        20
           American Blonde Ale       0.11      0.07      0.09        14
            American Brown Ale       0.67      0.18      0.29        11
American Double / Imperial IPA       0.74      0.64      0.68        22
                  American IPA       0.76      0.75      0.75        63
       American Pale Ale (APA)       0.37      0.39      0.38        38
       American Pale Wheat Ale       0.10      0.08      0.09        13
               American Porter       0.14      0.08      0.10        13
        Fruit / Vegetable Beer       0.00      0.00      0.00         8
                        Kölsch       0.14      0.33      0.20         3
                         other       0.51      0.61      0.56       127

                      accuracy                           0.50       332
                     macro avg       0.35      0.31      0.31 

In [9]:
# Confusion matrix: actual style_group (rows) against predicted style_group (columns)

confusion = pd.crosstab(
    y_test,
    y_pred,
    rownames=['Actual Result'],
    colnames=['Predicted Result'],
)
confusion

Predicted Result,American Amber / Red Ale,American Blonde Ale,American Brown Ale,American Double / Imperial IPA,American IPA,American Pale Ale (APA),American Pale Wheat Ale,American Porter,Fruit / Vegetable Beer,Kölsch,other
Actual Result,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
American Amber / Red Ale,5,0,0,1,2,3,1,0,0,0,8
American Blonde Ale,0,1,0,0,0,1,1,0,1,0,10
American Brown Ale,1,0,2,0,1,0,0,0,0,0,7
American Double / Imperial IPA,0,0,0,14,0,0,0,0,0,0,8
American IPA,1,0,0,1,47,9,0,0,0,0,5
American Pale Ale (APA),2,0,0,0,4,15,2,1,0,0,14
American Pale Wheat Ale,0,0,1,0,1,0,1,0,0,2,8
American Porter,1,1,0,0,0,2,0,1,0,0,8
Fruit / Vegetable Beer,1,0,0,0,0,0,1,0,0,0,6
Kölsch,1,0,0,0,0,0,0,0,0,1,1


In [10]:
# Feature importances from the fitted forest, paired with the feature (column) names
[(feature, weight) for feature, weight in zip(X_train.columns, rfc.feature_importances_)]

[('abv', 0.4244978446799889), ('ibu', 0.575502155320011)]