# Feature Engineering

### Import Libraries

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#feature engineering
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer

#Model Creation
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from warnings import simplefilter
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

### Import data

In [2]:
# Load the prepared dataset; the first CSV column is used as the row index.
final_df = pd.read_csv('data/final_df.csv', index_col=0)

In [3]:
# Show the dtype of every column in the loaded frame.
final_df.dtypes

Season                     int64
Week_Number                int64
Week_Name                 object
Baker                     object
Gender                    object
Age                        int64
Signature_Handshake        int64
Technical_Rank             int64
Showstopper_Handshake      int64
Favorite                 float64
Least_Favorite             int64
Star_Baker                 int64
Eliminated                 int64
Winner                     int64
signature                 object
showstopper               object
dtype: object

### Create a table of winners for future reference

In [4]:
# Keep only the rows that belong to a season winner (Winner flag set to 1).
Table_of_Winners = final_df[final_df['Winner'] == 1]

In [5]:
# Reduce to unique (Season, Baker) pairs.
Table_of_Winners = Table_of_Winners.loc[:, ['Season', 'Baker']].drop_duplicates()

In [6]:
# Persist the winners table to disk for later reference.
Table_of_Winners.to_csv('data/Table_of_Winners.csv')

After understanding my data better:
- It seems the winner is decided by whoever wins the final Star Baker. Therefore, I will remove `Winner` from my dataset and set the target feature to `Star_Baker`.
- `Week_Name` is redundant with `Week_Number`, so I'll remove `Week_Number`.
- We don't need the names of the bakers, so `Baker` is removed.
- Lastly, we will remove the `Season` column, because older seasons won't show up in future datasets.

### Drop Week_Number, Baker, Winner, Season

In [7]:
# Drop the columns that are not used for modelling.
df = final_df.drop(
    columns=[
        'Week_Number',
        'Baker',
        'Winner',
        'Season',
    ]
)

In [8]:
# Cast Week_Name and Gender to pandas categoricals so they can later be
# selected with select_dtypes(include='category').
df = df.astype(dict.fromkeys(['Week_Name', 'Gender'], 'category'))
df.dtypes

Week_Name                category
Gender                   category
Age                         int64
Signature_Handshake         int64
Technical_Rank              int64
Showstopper_Handshake       int64
Favorite                  float64
Least_Favorite              int64
Star_Baker                  int64
Eliminated                  int64
signature                  object
showstopper                object
dtype: object

In [101]:
# Preview the first rows of the trimmed, re-typed frame.
df.head()

Unnamed: 0,Week_Name,Gender,Age,Signature_Handshake,Technical_Rank,Showstopper_Handshake,Favorite,Least_Favorite,Star_Baker,Eliminated,signature,showstopper
0,Cake,M,31,0,2,0,0.0,0,0,0,Rhubarb and Custard CupcakesAfter Dinner Cupcakes,Vanilla Sponge with White Chocolate and Raspbe...
1,Cake,F,31,0,1,0,0.0,0,1,0,Cherry Bakewell Inspired CupcakesGinger Cupcak...,Novelty Farm Cake
2,Cake,M,40,0,10,0,0.0,0,0,0,Apple and Cinnamon Cupcakes,Unknown
3,Cake,F,63,0,8,0,0.0,0,0,0,Raspberry and Cream Cupcakes,Chocolate Marble Cakewith Dark and White Choco...
4,Cake,M,19,0,6,0,0.0,0,0,0,Lemon Meringue CupcakesApple and Cinnamon Cupc...,Unknown


### Get my split ready

In [10]:
# Target: the Star_Baker flag as a NumPy array.
y = df['Star_Baker'].to_numpy()
# Predictors: every remaining column.
X = df.drop(columns=['Star_Baker'])

In [11]:
# Check how balanced the target classes are.
# The positive class is a small minority, so SMOTE is used to rebalance
# the training data.
df['Star_Baker'].value_counts()

0    576
1     88
Name: Star_Baker, dtype: int64

In [100]:
# Hold out 30% of the rows for evaluation; stratify on y so both splits keep
# the same Star_Baker class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0, stratify=y
)

# Group the columns by the preprocessing each group needs.
num_cols = list(X_train.select_dtypes(include='number').columns)
cat_cols = list(X_train.select_dtypes(include='category').columns)
# The text columns are passed as plain strings (not lists) so that each
# TfidfVectorizer receives a 1-D column of documents.
sig_text = 'signature'
show_text = 'showstopper'

# One transformer per column group.
preprocessor = ColumnTransformer(
    transformers=[
        # standardise the numeric features
        ('num', StandardScaler(), num_cols),
        # one-hot encode the categoricals; categories unseen at fit time are ignored
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        # TF-IDF weights over at most 50 terms of the signature-bake text
        ('text_sig', TfidfVectorizer(max_features=50), sig_text),
        # TF-IDF weights over at most 50 terms of the showstopper text
        ('text_show', TfidfVectorizer(max_features=50), show_text),
    ],
    remainder='passthrough',
)

# Oversampler for the minority class.
smt = SMOTE(random_state=0)

# imblearn's Pipeline is required here because it accepts a sampler step;
# resampling happens while fitting only, never at predict time.
pipeline = Pipeline(
    steps=[
        ('preprocess', preprocessor),
        ('smote', smt),
        (
            'RandomForestClassifier',
            RandomForestClassifier(n_estimators=300, random_state=1, n_jobs=-1),
        ),
    ]
)

# NOTE: fit() returns the fitted pipeline itself, so despite its name
# X_train_trans is the pipeline object, not a transformed feature matrix.
X_train_trans = pipeline.fit(X_train, y_train)
ypred = pipeline.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_test, ypred))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94        31
           1       1.00      0.33      0.50         6

    accuracy                           0.89        37
   macro avg       0.94      0.67      0.72        37
weighted avg       0.90      0.89      0.87        37

