In [1]:
import numpy as np
import pandas

from sklearn.utils import resample

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.feature_selection import RFE

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

from sklearn.tree import export_graphviz

from sklearn.pipeline import Pipeline

from subprocess import call

# Sumário de Dados

ABV ou Alcohol by Volume é a porcentagem de álcool em uma cerveja (%).  
IBU ou International Bittering Unit é a medida de amargura de uma cerveja, é baseada na quantidade de lúpulo adicionado.  
OG ou Original Gravity é o valor da Specific Gravity antes do processo de fermentação.  
FG ou Final Gravity é o valor da Specific Gravity após o processo de fermentação.  
EBC ou European Brewery Convention refere-se a coloração da cerveja.
SRM ou Standard Reference Method é outra medida que refere-se a coloração da cerveja.  
PH: A medida de acidez final da cerveja sendo < 7 ácido e > 7 básico ou alcálino. (Range: 1-14)
  
  
OBS: Specific Gravity, às vezes chamada de Present Gravity, é a densidade de açucar da cerveja ou mosto na temperatura e pressão padrão (20°C, 760 mmHg) medida por sacarômetro, hidrômetro ou refratômetro. 

In [2]:
data_path = "./aws_beer_extracted_data.csv"

In [3]:
df = pandas.read_csv(data_path, sep = ",", header = "infer")
df.head()

Unnamed: 0,id,name,abv,ibu,target_fg,target_og,ebc,srm,ph,partition_0,partition_1,partition_2,partition_3
0,22,"""Devine Rebel (w/ Mikkeller)""",12.5,100.0,1030.0,1093.0,36.0,18.0,4.4,2021,2,10,3
1,227,"""Brewdog Vs Beavertown""",9.2,50.0,1085.0,1016.0,78.8,40.0,4.2,2021,2,10,6
2,99,"""Hello My Name Is Päivi""",8.3,70.0,1013.0,1076.0,30.0,15.0,4.4,2021,2,10,1
3,7,"""AB:12""",11.2,35.0,1017.0,1108.0,80.0,40.0,5.3,2021,2,10,2
4,76,"""#Mashtag 2013""",7.5,50.0,1013.0,1070.0,40.0,20.0,4.4,2021,2,10,9


In [4]:
drop = ["id", "name", "partition_0", "partition_1", "partition_2", "partition_3"]
df.drop(drop, axis = 1, inplace = True)
df.head()

Unnamed: 0,abv,ibu,target_fg,target_og,ebc,srm,ph
0,12.5,100.0,1030.0,1093.0,36.0,18.0,4.4
1,9.2,50.0,1085.0,1016.0,78.8,40.0,4.2
2,8.3,70.0,1013.0,1076.0,30.0,15.0,4.4
3,11.2,35.0,1017.0,1108.0,80.0,40.0,5.3
4,7.5,50.0,1013.0,1070.0,40.0,20.0,4.4


In [5]:
df.dropna(axis = 0, how = "any", inplace = True)
df.describe()

Unnamed: 0,abv,ibu,target_fg,target_og,ebc,srm,ph
count,147.0,147.0,147.0,147.0,147.0,147.0,147.0
mean,7.603401,60.612245,1014.115646,1067.040816,58.970068,29.814898,4.391156
std,3.237376,41.356361,10.052954,26.124805,101.637164,51.424279,0.43209
min,0.5,0.0,1000.0,1007.0,2.0,1.0,3.2
25%,5.2,35.0,1010.0,1047.5,14.5,7.25,4.25
50%,7.2,55.0,1012.0,1066.0,23.0,12.5,4.4
75%,9.0,75.0,1015.0,1078.5,40.0,20.0,4.4
max,18.3,250.0,1085.0,1130.0,600.0,305.0,5.3


In [6]:
#df = resample(df, replace = True, n_samples = 10000)

In [7]:
df.corr(method = "pearson")

Unnamed: 0,abv,ibu,target_fg,target_og,ebc,srm,ph
abv,1.0,0.252775,0.15663,0.798312,0.373883,0.372788,0.372395
ibu,0.252775,1.0,-0.052247,0.28732,0.054746,0.053265,0.306556
target_fg,0.15663,-0.052247,1.0,-0.021028,0.375967,0.374386,-0.149087
target_og,0.798312,0.28732,-0.021028,1.0,0.55731,0.556242,0.280988
ebc,0.373883,0.054746,0.375967,0.55731,1.0,0.999345,-0.010212
srm,0.372788,0.053265,0.374386,0.556242,0.999345,1.0,-0.014132
ph,0.372395,0.306556,-0.149087,0.280988,-0.010212,-0.014132,1.0


Split data between Features and Values

In [8]:
predicting_variable = 1 #ibu

# Features
X = df.iloc[:, [i for i in range(len(df.columns)) if i != predicting_variable]].to_numpy()
# Values (IBUs)
y = df.iloc[:, [i for i in range(len(df.columns)) if i == predicting_variable]].to_numpy().flatten()

Split data between training and test

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, shuffle = False)

Normalize Data

In [10]:
scaler = MinMaxScaler(feature_range = (0.01, 0.99), copy = False)
scaler.fit(X_train)
scaler.transform(X_train)
print(scaler.data_min_)
print(scaler.data_max_)

[5.000e-01 1.000e+03 1.007e+03 2.000e+00 1.000e+00 3.200e+00]
[  18.3 1085.  1130.   600.   305.     5.3]


In [11]:
model = RandomForestRegressor(n_estimators = 20)

model.fit(X_train, y_train)

model.score(X_train, y_train)

0.9169951281221571

In [13]:
export_graphviz(model.estimators_[0], out_file = "tree.dot")

In [19]:
call(["dot", "-Tpng", "./tree.dot", "-o", "./tree.png", "-Gdpi=600"])

Exception: File `'(["dot",.py'` not found.

Model Training

In [None]:
scaler.transform(X_test)
print("")

In [None]:
model.score(X_test, y_test)

In [None]:
for X, y in zip(X_test, y_test):
    
    y_predicted = model.predict([X])[0]
    print(y, y_predicted)