In [88]:
import pandas as pd

IPA = pd.read_csv("ipa.csv")

IPA.head()

Unnamed: 0,IsIPA,OG,FG,ABV,IBU,Color,BoilTime,BoilGravity,PitchRate,Efficiency,UserId
0,False,1.069,1.007,8.12,0.0,30.48,60,,,75.0,
1,False,1.064,1.012,6.8,9.36,9.85,60,1.132,0.5,35.0,
2,False,1.061,1.015,6.08,28.31,35.83,60,1.044,0.35,83.0,42087.0
3,False,1.053,1.012,5.44,46.48,5.77,60,1.033,,70.0,
4,False,1.053,1.017,4.64,42.29,4.22,90,1.039,0.5,77.0,14729.0


In [89]:
IPA.shape

(37000, 11)

Issues with IMBLearn, don't have time to fix. Won't resample the dataset properly.

Will use a decision tree instead of earlier intended RL.

In [90]:
IPA.isnull().mean() * 100

IsIPA           0.000000
OG              0.000000
FG              0.000000
ABV             0.000000
IBU             0.000000
Color           0.000000
BoilTime        0.000000
BoilGravity     3.589189
PitchRate      53.094595
Efficiency      0.000000
UserId         68.778378
dtype: float64

In [91]:
IPA = IPA.drop(columns = ["PitchRate", "UserId"])
IPA = IPA.dropna()
IPA.shape

(35672, 9)

In [92]:
X = IPA.drop(columns = "IsIPA")
y = IPA.IsIPA.astype(int)

X.shape, y.shape

((35672, 8), (35672,))

In [24]:
y.value_counts()

0    24273
1    11399
Name: IsIPA, dtype: int64

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
y_train.head()

36549    1
17426    0
33107    0
14195    0
29175    0
Name: IsIPA, dtype: int32

#### Model
Let's use some simple tree model from tutorials.

In [26]:
from sklearn import tree

In [28]:
CART = tree.DecisionTreeRegressor(random_state = 2137, ccp_alpha = 0.0)
CART_model = CART.fit(X_train, y_train)

In [29]:
CART_model.get_depth(), CART_model.get_n_leaves()

(37, 3483)

In [31]:
def RMSE(model, X, y):
    return np.sqrt(((model.predict(X) - y)**2).mean())#But we can view this as multiclass classification

In [34]:
path = CART.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas[::5], path.impurities[::5]

clfs = []
for ccp_alpha in ccp_alphas:
    clf = tree.DecisionTreeRegressor(random_state = 42, ccp_alpha = ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)
for order, ind in [('First', 0), ('Last', -1)]:
    print(f"{order} tree with ccp_alpha: {ccp_alphas[ind]:.2f}, " +
          f"nodes: {clfs[ind].tree_.node_count}, leaves: {clfs[ind].get_n_leaves()}")

First tree with ccp_alpha: 0.00, nodes: 6929, leaves: 3465
Last tree with ccp_alpha: 0.00, nodes: 9, leaves: 5


In [37]:
test_scores = [RMSE(clf, X_test, y_test) for clf in clfs]
train_scores = [RMSE(clf, X_train, y_train) for clf in clfs]
min(train_scores), min(test_scores)

(0.0, 0.32006078384515074)

In [39]:
import numpy as np
Best_CART = clfs[np.argmin(test_scores)]
Best_CART.ccp_alpha

0.00013750284281974158

In [40]:
confmat = pd.crosstab(Best_CART.predict(X_test).round(), y_test.round())
confmat

IsIPA,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,5445,684
1.0,552,2237


## Accuracy on test data

In [42]:
print(f"The model's accuracy is {np.array([confmat.loc[i,i] for i in confmat.index]).sum()/confmat.sum().sum()}")

The model's accuracy is 0.8614039022202288


# Getting predictions:

In [101]:
IPA_test = pd.read_csv("IPA_test.csv")
IPA_test.head()

Unnamed: 0,OG,FG,ABV,IBU,Color,BoilTime,BoilGravity,PitchRate,Efficiency,UserId
0,1.045,1.008,4.78,27.81,4.65,60,1.037,0.5,76.0,
1,1.052,1.01,5.56,35.98,12.9,60,1.041,0.5,80.0,56565.0
2,1.079,1.021,7.64,64.83,41.03,75,1.058,1.0,70.0,15163.0
3,1.06,1.018,5.47,16.45,16.55,60,1.044,0.35,70.0,
4,1.052,1.013,5.03,37.03,44.77,60,1.038,,70.0,14759.0


In [102]:
X2 = IPA_test.drop(columns = ["PitchRate", "UserId"])
X2.head()

Unnamed: 0,OG,FG,ABV,IBU,Color,BoilTime,BoilGravity,Efficiency
0,1.045,1.008,4.78,27.81,4.65,60,1.037,76.0
1,1.052,1.01,5.56,35.98,12.9,60,1.041,80.0
2,1.079,1.021,7.64,64.83,41.03,75,1.058,70.0
3,1.06,1.018,5.47,16.45,16.55,60,1.044,70.0
4,1.052,1.013,5.03,37.03,44.77,60,1.038,70.0


In [103]:
X2.isnull().mean() * 100, X2.shape

(OG             0.00
 FG             0.00
 ABV            0.00
 IBU            0.00
 Color          0.00
 BoilTime       0.00
 BoilGravity    3.12
 Efficiency     0.00
 dtype: float64,
 (5000, 8))

In [104]:
X2 = X2.fillna(X2.mean())
X2.isnull().mean() * 100, X2.shape

(OG             0.0
 FG             0.0
 ABV            0.0
 IBU            0.0
 Color          0.0
 BoilTime       0.0
 BoilGravity    0.0
 Efficiency     0.0
 dtype: float64,
 (5000, 8))

In [105]:
np.unique(Best_CART.predict(X2).round(),return_counts = True)

(array([0., 1.]), array([3460, 1540], dtype=int64))

In [110]:
y2 = Best_CART.predict(X2).round()
type(y2)

numpy.ndarray

In [111]:
y2 = pd.DataFrame(Y2)
y2

Unnamed: 0,0
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
4995,1.0
4996,0.0
4997,0.0
4998,0.0


In [112]:
y2.to_csv("[Decision Trees Or The Unexpected Virtue Of Ignorance]_IPA_prediction.csv", index = False, header = False)