# Reto cerveza

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.linear_model  import LogisticRegression
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from xgboost               import XGBClassifier, plot_tree

from sklearn.preprocessing   import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics         import accuracy_score, balanced_accuracy_score

In [2]:
# Load the training and test splits of the beer dataset from the local data folder.
TRAIN_CSV = "./datos/beer_train.csv"
TEST_CSV = "./datos/beer_test.csv"

df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)

In [3]:
#df.head()

In [4]:
# Group the training columns by dtype: text (object), date/time, and everything else.
cat_train = df_train.select_dtypes(include=[object]).columns
time_train = df_train.select_dtypes(include=['datetime64']).columns
num_train = df_train.select_dtypes(exclude=[object, 'datetime64', 'timedelta64']).columns

# Print each group under its own heading.
for heading, group in (
    ("\nNumerical features:\n", num_train),
    ("\nCategorical features:\n", cat_train),
    ("\nDate/time features:\n", time_train),
):
    print(heading, group.values)


Numerical features:
 ['Id' 'Size(L)' 'OG' 'FG' 'ABV' 'IBU' 'Color' 'BoilSize' 'BoilTime'
 'BoilGravity' 'Efficiency' 'MashThickness' 'PitchRate' 'PrimaryTemp']

Categorical features:
 ['SugarScale' 'BrewMethod' 'Style']

Date/time features:
 []


In [5]:
#df_train.describe().T

In [6]:
#df_train.describe(include=['object']).T

In [7]:
#for col in num:
#    plot_num(df[col], col, zeros = False )

In [8]:
#for col in cat:
#    plot_cat(df[col],col)

In [9]:
# Number of missing values in each column of the training set
df_train.isnull().sum()

Id                   0
Size(L)              0
OG                   0
FG                   0
ABV                  0
IBU                  0
Color                0
BoilSize             0
BoilTime             0
BoilGravity        936
Efficiency           0
MashThickness     9866
SugarScale           0
BrewMethod           0
PitchRate        13134
PrimaryTemp       7488
Style                0
dtype: int64

In [10]:
# Add one boolean "<column>WasMissing" flag per column that contains nulls,
# recording which rows had no value before any imputation.
for column in ("BoilGravity", "MashThickness", "PitchRate", "PrimaryTemp"):
    df_train[f"{column}WasMissing"] = df_train[column].isna()

In [11]:
# Same missing-value flags for the test set, so both frames share the same columns.
for column in ("BoilGravity", "MashThickness", "PitchRate", "PrimaryTemp"):
    df_test[f"{column}WasMissing"] = df_test[column].isna()

In [12]:
# Replace the NaNs in the training set with each column's median
# (the median, not the mean, is what is computed here).
#
# The result is assigned back to the frame instead of calling
# `df_train[col].fillna(..., inplace=True)`: that form mutates a column
# selection (chained assignment), which emits a FutureWarning in pandas 2.x
# and leaves `df_train` unchanged when Copy-on-Write is enabled.
columns_with_nulls = ["BoilGravity", "MashThickness", "PitchRate", "PrimaryTemp"]

# One median per column; NaNs are skipped when computing it.
train_medians = df_train[columns_with_nulls].median()

# fillna with a Series fills each column with the value under its own label.
df_train[columns_with_nulls] = df_train[columns_with_nulls].fillna(train_medians)

In [13]:
# Fill the NaNs in the test set with the medians of the TRAINING set.
#
# Imputation statistics must come from the training data only: using the test
# set's own medians would fill train and test with different values for the
# same feature. Filling a column with its own median does not move that
# median, so recomputing it from df_train here gives the same value whether or
# not the training frame has already been imputed.
#
# The result is assigned back rather than using
# `df_test[col].fillna(..., inplace=True)`, which is chained assignment and
# does not modify `df_test` under pandas Copy-on-Write.
columns_with_nulls = ["BoilGravity", "MashThickness", "PitchRate", "PrimaryTemp"]
train_medians = df_train[columns_with_nulls].median()
df_test[columns_with_nulls] = df_test[columns_with_nulls].fillna(train_medians)

In [14]:
# Categories are about to be replaced by numeric codes, so refresh the lists of
# text (object-dtype) columns for both frames first.
# NOTE(review): the original comment says columns were removed from the data
# frame before this point; no removal is visible in this notebook — confirm.
def _object_columns(frame):
    """Return the labels of the object-dtype columns of `frame`."""
    return frame.select_dtypes(include=[object]).columns


cat_train = _object_columns(df_train)
cat_test = _object_columns(df_test)

In [15]:
# Encode the categorical columns as integer codes.
from sklearn import preprocessing

# Each feature gets its own encoder, fitted on the training data and then
# reused (transform only) on the test data. Re-fitting on the test set, as was
# done before, can assign a different integer to the same category whenever
# the two sets do not contain exactly the same categories, so the model would
# see inconsistent codes at prediction time.
feature_encoders = {}
for column in ("SugarScale", "BrewMethod"):
    encoder = preprocessing.LabelEncoder()
    df_train[column] = encoder.fit_transform(df_train[column])
    # transform() raises ValueError if the test set contains a category that
    # never appears in the training set, instead of silently mis-encoding it.
    df_test[column] = encoder.transform(df_test[column])
    feature_encoders[column] = encoder

# `le` stays bound to the encoder fitted on the target column: a later cell
# calls le.inverse_transform() to turn predicted codes back into style names.
le = preprocessing.LabelEncoder()
df_train['Style'] = le.fit_transform(df_train['Style'])
df_train['Style'].unique()

array([ 2,  8,  5,  0,  9,  4,  3,  7,  1, 10,  6], dtype=int64)

In [16]:

# Separate the target (y = Style, what we want to predict) from the known
# features (every other column).
# NOTE(review): "Id" stays among the features — confirm that is intended.
y_train = df_train["Style"]
x_train = df_train.drop(columns=["Style"])

# Next: try the models

In [17]:
# Train the classifier on the full training set.
# A fixed random_state makes the fit reproducible across Restart & Run All;
# without it the estimator draws from the global NumPy random state.
# NOTE(review): the stored output of this cell shows an XGBClassifier, i.e. it
# was produced by the commented-out alternative below, not by this line —
# re-run the cell or switch the model so code and output agree.
model = GradientBoostingClassifier(random_state=0)
#model = XGBClassifier(n_estimators=250)
model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=250, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [18]:
# Predict the encoded style for every test row.
# Select exactly the training feature columns, in training order: this keeps
# the cell re-runnable after the "Style" column has been added to df_test
# (passing the whole frame would then hand the model an extra column) and does
# not rely on both CSVs sharing the same column order.
predictions = model.predict(df_test[x_train.columns])

In [19]:
# Map the predicted integer codes back to style names with the encoder `le`
# and store them as the "Style" column of the test frame.
predicted_styles = le.inverse_transform(predictions)
df_test["Style"] = predicted_styles

In [20]:
# Preview the first five test rows, now including the predicted "Style" column.
df_test.head(n=5)

Unnamed: 0,Id,Size(L),OG,FG,ABV,IBU,Color,BoilSize,BoilTime,BoilGravity,...,MashThickness,SugarScale,BrewMethod,PitchRate,PrimaryTemp,BoilGravityWasMissing,MashThicknessWasMissing,PitchRateWasMissing,PrimaryTempWasMissing,Style
0,0,20.82,15.6465,4.23184,6.21,55.13,8.2,28.01,90,11.8,...,1.3,0,0,1.0,25.56,False,False,False,False,American IPA
1,1,20.82,1.057,1.013,5.71,51.99,6.35,28.39,60,1.042,...,1.5,1,0,1.25,20.0,False,True,False,True,American Pale Ale
2,2,22.71,1.058,1.014,5.67,54.77,24.75,28.39,60,1.046,...,1.5,1,0,0.75,20.0,False,False,True,False,American Brown Ale
3,3,10.0,1.052,1.01,5.51,93.58,5.46,28.5,60,1.018,...,1.5,1,0,0.5,17.0,False,False,False,False,American IPA
4,4,20.0,12.0478,1.32023,5.7,33.72,3.58,30.0,60,8.1,...,3.0,0,0,0.75,27.0,False,False,True,False,Saison


In [21]:
# Keep only the row identifier and the predicted style for the submission.
submission_columns = ["Id", "Style"]
submission = df_test[submission_columns]

In [22]:
# Preview the first five rows of the submission table.
submission.head(n=5)

Unnamed: 0,Id,Style
0,0,American IPA
1,1,American Pale Ale
2,2,American Brown Ale
3,3,American IPA
4,4,Saison


In [23]:
# Write the submission to disk without the index column.
# NOTE(review): the file name says XGBClassifier, but the active model in this
# notebook is GradientBoostingClassifier — confirm which model produced it.
SUBMISSION_CSV = "./datos/XGBClassifier.csv"
submission.to_csv(SUBMISSION_CSV, index=False)