# This notebook consists of steps to use RFC model to create a pipeline

In [78]:
from sklearn.pipeline import Pipeline

cat_cols = ['brewery_name']
num_cols = ['review_aroma', 'review_appearance', 'review_palate', 'review_taste']
target_col = ['beer_style']

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [79]:
from sklearn.preprocessing import StandardScaler

num_transformer = Pipeline (
    steps =[
        ('ScaleNumVals', StandardScaler())
    ]
)

from app.feature_encode import BeerStyleCode

beer_style_transformer = Pipeline (
    steps = [
        ('EncodeBeerStyle', BeerStyleCode())
    ]
)

In [81]:
from feature_encode import BreweryNameEncodedVal

brew_name_transformer = Pipeline (
    steps = [
        ('EncodeBreweryName', BreweryNameEncodedVal())
    ]
)

In [82]:
from sklearn.compose import ColumnTransformer

In [83]:
preprocessor = ColumnTransformer(
    transformers=[
        ('numColTranformer', num_transformer, num_cols),
        ('catColTransformer', brew_name_transformer, cat_cols)
    ]
)

In [84]:
from sklearn.ensemble import RandomForestClassifier

rfc_pipeline = Pipeline (
    steps = [
        ('PreProcess', preprocessor),
        ('RFCModel', RandomForestClassifier(n_estimators=104, max_samples=300))
    ]
)

In [85]:
#Load the unencoded file and split this into Test, Train and validate sets
import pandas as pd
import numpy as np

unencoded_datafile = "../data/interim/source_unencoded.csv"
df_raw = pd.read_csv(unencoded_datafile)
df_raw.drop(labels=["Unnamed: 0"], axis=1, inplace=True) #Drop the extra column that came across from the CSV file
df_raw.reindex

<bound method DataFrame.reindex of                         brewery_name  review_aroma  review_appearance  \
0                    Vecchio Birraio           2.0                2.5   
1                    Vecchio Birraio           2.5                3.0   
2                    Vecchio Birraio           2.5                3.0   
3                    Vecchio Birraio           3.0                3.5   
4            Caldera Brewing Company           4.5                4.0   
...                              ...           ...                ...   
1586587  The Defiant Brewing Company           4.0                3.5   
1586588  The Defiant Brewing Company           5.0                2.5   
1586589  The Defiant Brewing Company           3.5                3.0   
1586590  The Defiant Brewing Company           4.5                4.5   
1586591  The Defiant Brewing Company           4.5                4.5   

         review_palate  review_taste                      beer_style  
0                

In [86]:
from feature_encode import BeerStyleCode

bs = BeerStyleCode()
df_raw["beer_style"]=bs.transform(df_raw["beer_style"])

In [87]:
from src.data.sets import split_sets_random, save_sets
X_train, Y_train, X_valid, Y_valid, X_test, Y_test = split_sets_random(df_raw, target_col="beer_style")

In [88]:
# I need to train a model based on a pipeline - so I need X and Y values
# Y values do not have to be encoded once the model is functional, as they will only be decoded
# so to train the model, I'll provided X values that require encoding and Y values that are already encoded
# I will dump the model once trained. and figure out Y value decoding later on

In [89]:
pipeline_model = rfc_pipeline.fit(X_train, Y_train)

In [90]:
Y_test_preds = pipeline_model.predict(X_test)

In [91]:
import joblib

joblib.dump(pipeline_model, "../models/pipeline_model_rfc.joblib")

['../models/pipeline_model_rfc.joblib']