# Machine Learning Utils - Classification example

###### Titanic dataset (Kaggle)  https://www.kaggle.com/hesh97/titanicdataset-traincsv

### 0 - Setup

In [None]:
from ml_utils import *
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the Titanic training set from a CSV file located relative to the working directory.
dtf = pd.read_csv('data_titanic.csv')
# Preview the first rows to sanity-check the load.
dtf.head()

In [None]:
# Inspect missing values in the dataset (ml_utils helper; plot="map" presumably draws a missingness map — confirm).
check_Nas(dtf, plot="map")

### 1 - Data Analysis

###### Group variables by info
- pk: PassengerId, Name
- y: Survived
- who: Sex, Age, Embarked (which port C=Cherbourg, Q=Queenstown, S=Southampton)
- wealth: Pclass, Ticket, Fare
- where: Cabin
- how many: SibSp (with siblings/spouse), Parch (with parent/children)

In [None]:
# Running list of column names judged predictive during the analysis below.
features = list()

###### Target Variable

In [None]:
# Rename the target column to "Y"; from this point on the frame no longer has a "Survived" column,
# so every later cell must refer to the target as "Y".
dtf = dtf.rename(columns={"Survived":"Y"})

# Frequency distribution of the target (ml_utils plotting helper).
freqdist_plot(dtf, "Y", figsize=(5,3))

In [None]:
#-> Population mean: 38% of the passengers survived

###### Who? Sex, Age, Embarked

In [None]:
#--- Sex ----#
# Plot the target against Sex (ml_utils plotting helper).
bivariate_plot(dtf, x="Sex", y="Y", figsize=(10,5))

In [None]:
#-> Sex is predictive: the survival rate of females is higher.
features.append("Sex")  # keep Sex as a candidate feature

In [None]:
#--- Age ---#
# Look at how missing Age values relate to the target before imputing them
# (ml_utils helper; the meaning of max_cat is not visible here — confirm against ml_utils).
nan_analysis(dtf, na_x="Age", y="Y", max_cat=20, figsize=(10,5))

In [None]:
# Distribution of Age (ml_utils plotting helper; box_logscale presumably log-scales the box plot — confirm).
freqdist_plot(dtf, "Age", box_logscale=True, figsize=(10,5))

In [None]:
# Impute missing ages with the mean age computed over the whole dataset;
# known ages are kept untouched.
dtf["Age"] = dtf["Age"].where(dtf["Age"].notna(), dtf["Age"].mean())

In [None]:
# Plot the target against Age after imputation (ml_utils plotting helper).
bivariate_plot(dtf, x="Age", y="Y", figsize=(15,5))

In [None]:
#-> Age is predictive: the survival rate is higher for younger passengers. There is a spike in the
# left tail of the Y=1 distribution, and the first Age bin (0-16) contains the highest share of survivors.
features.append("Age")  # keep Age as a candidate feature

In [None]:
#--- Embarked ---#
# Plot the target against the port of embarkation (ml_utils plotting helper).
bivariate_plot(dtf, x="Embarked", y="Y", figsize=(10,5))

In [None]:
# Test the association between Embarked and the target; the helper returns a coefficient and a p-value
# (the notes in this notebook describe it as Cramer's V for categorical vs categorical — confirm in ml_utils).
coeff, p = test_corr(dtf, x="Embarked", y="Y")

In [None]:
#-> Embarked is predictive: people who boarded at port C tend to survive more often (possibly because
# they stayed in a fortunate area of the ship, or just because they're smarter). Since there aren't many
# observations, I tested the significance of the correlation (Cramer, categorical vs categorical): it passed.
features.append("Embarked")  # keep Embarked as a candidate feature

###### Wealth? Pclass, Ticket, Fare

In [None]:
#--- Pclass ---#
# Plot the target against the passenger class (ml_utils plotting helper).
bivariate_plot(dtf, x="Pclass", y="Y", figsize=(10,5))

In [None]:
#-> Pclass is predictive: the richer the passenger, the higher the probability of surviving.
features.append("Pclass")  # keep Pclass as a candidate feature

In [None]:
#--- Ticket ---#
# Frequency of ticket values (ml_utils plotting helper; top=10 presumably limits the plot to the 10 most frequent — confirm).
freqdist_plot(dtf, "Ticket", top=10, figsize=(5,3))

In [None]:
#-> Ticket is Useless

In [None]:
#--- Fare ---#
# Plot the target against the ticket fare (ml_utils plotting helper).
bivariate_plot(dtf, x="Fare", y="Y", figsize=(15,5))

In [None]:
#-> Fare is predictive, but it carries much the same information as Pclass.
# Plot Fare against Pclass to check how strongly the two overlap.
bivariate_plot(dtf, x="Fare", y="Pclass", figsize=(15,5))

In [None]:
# Joint view of Pclass and Fare against the target (ml_utils plotting helper).
# The target column was renamed from "Survived" to "Y" earlier in the notebook,
# so it must be referenced as "Y" here; "Survived" no longer exists in dtf.
cross_distributions(dtf, x1="Pclass", x2="Fare", y="Y", figsize=(10,5))

In [None]:
## It looks like there is extra information within the first class: those who paid a higher price survived more often.
## Keep Fare for now and exclude one of the two (Fare / Pclass) in the Features Selection section.
features.append("Fare")  # keep Fare as a candidate feature

###### Where? Cabin

In [None]:
# Cabin
# Frequency of raw cabin values (ml_utils plotting helper; top=10 presumably limits the plot to the 10 most frequent — confirm).
freqdist_plot(dtf, "Cabin", top=10, figsize=(5,3))

In [None]:
## The raw Cabin value is useless as it is; check whether it can be grouped by the first letter of the cabin.
# str(x)[0] takes the first character of the string form of each value, so a missing cabin
# (NaN, whose string form is "nan") ends up in the section "n".
dtf["Cabin_section"] = dtf["Cabin"].apply(lambda x: str(x)[0])
freqdist_plot(dtf, "Cabin_section", top=10, figsize=(5,3))

In [None]:
# Joint view of Cabin_section and Pclass against the target (ml_utils plotting helper).
# The target column was renamed from "Survived" to "Y" earlier in the notebook,
# so it must be referenced as "Y" here; "Survived" no longer exists in dtf.
cross_distributions(dtf, x1="Cabin_section", x2="Pclass", y="Y", figsize=(10,5))

###### How many? SibSp, Parch

In [None]:
# Plot the target against SibSp, the number of siblings/spouses aboard (ml_utils plotting helper).
# The target column is "Y" (renamed from "Survived" earlier), so "Survived" would not be found in dtf.
bivariate_plot(dtf, x="SibSp", y="Y", figsize=(15,5))

In [None]:
features.append("SibSp")  # keep SibSp as a candidate feature

In [None]:
# Plot the target against Parch, the number of parents/children aboard (ml_utils plotting helper).
# The target column is "Y" (renamed from "Survived" earlier), so "Survived" would not be found in dtf.
bivariate_plot(dtf, x="Parch", y="Y", figsize=(15,5))

In [None]:
features.append("Parch")  # keep Parch as a candidate feature

###### Summary

In [None]:
# Age vs Fare, coloured by the target and with the marker style set by Sex.
# hue must name an existing column: the target is "Y" (renamed from "Survived" earlier),
# and seaborn raises an error for a hue name that is not a column of `data`.
sns.scatterplot(data=dtf, x="Age", y="Fare", hue="Y", style="Sex")

In [None]:
# Age vs Fare in a grid of panels: one column per Sex, one row per Pclass, coloured by the target.
# hue must name an existing column: the target is "Y" (renamed from "Survived" earlier).
g = sns.FacetGrid(dtf, col="Sex",  row="Pclass", hue="Y")
# Draw each panel and attach the colour legend.
g.map(plt.plot, "Age", "Fare", marker=".").add_legend()

In [None]:
# Keep only the primary key, the features collected during the analysis, and the target.
dtf = dtf[["PassengerId", *features, "Y"]]
dtf.head()

### 2 - Preprocessing

In [None]:
# Dry run of the preprocessing helper with every processing step disabled (all options None).
# The target column is "Y" (renamed from "Survived" earlier and selected as "Y" in the previous cell),
# so y must be "Y"; dtf has no "Survived" column at this point.
check = data_preprocessing(dtf, pk="PassengerId", y="Y", task="classification",
                           processNas=None, processCategorical=None, split=None, scale=None)

###### NAs

In [None]:
# Discard the rows whose port of embarkation is missing, then report the remaining shape.
dtf = dtf.dropna(subset=["Embarked"])
print(dtf.shape)

###### Categorical

In [None]:
# Encode Embarked as dummy columns (ml_utils helper; dropx=True presumably removes the original column,
# dummy_na=False presumably adds no indicator for missing values — confirm).
dtf = add_dummies(dtf, x="Embarked", dropx=True, dummy_na=False)

In [None]:
# Encode Sex as dummy columns (ml_utils helper; dropx=True presumably removes the original column,
# dummy_na=False presumably adds no indicator for missing values — confirm).
dtf = add_dummies(dtf, x="Sex", dropx=True, dummy_na=False)

###### Partitioning + Scaling

In [None]:
# Partition and scale the data through the ml_utils preprocessing helper
# (split=0.3 is presumably the test-set fraction and scale="standard" standard scaling — confirm).
# The target column is "Y" (renamed from "Survived" earlier), so y must be "Y";
# dtf has no "Survived" column at this point.
dic_data = data_preprocessing(dtf, pk="PassengerId", y="Y", task="classification",
                              processNas=None, processCategorical=None, split=0.3, scale="standard")

# Unpack the pieces of the returned dictionary used by the modelling cells.
X_names = dic_data["X_names"]
X_train, X_test = dic_data["X"]
Y_train, Y_test = dic_data["Y"]
scaler, _ = dic_data["scaler"]  # second element of the "scaler" entry is not used here

### 3 - Baseline (gradient boosting)

###### Features Selection

###### Train

In [None]:
# Baseline classifier with default hyper-parameters
# (`ensemble` comes from the ml_utils star import; presumably sklearn.ensemble — confirm).
baseline = ensemble.GradientBoostingClassifier()

In [None]:
# Fit the baseline via the ml_utils helper, passing both data splits
# (Y_threshold=0.5 is presumably the probability cut-off for the predicted class — confirm).
dic_model = fit_classif_model(baseline, X_train, Y_train, X_test, Y_test, Y_threshold=0.5)

# Pull out the prediction outputs needed by the evaluation cell.
predicted_prob = dic_model["predicted_prob"]
predicted = dic_model["predicted"]

###### Evaluate

In [None]:
# Evaluate the baseline predictions against the test labels (ml_utils helper).
evaluate_model(Y_test, predicted, predicted_prob, figsize=(20,5))

### 4 - Model Design & Testing (tuned gradient boosting)

###### Features Selection

In [None]:
# Run the ml_utils feature-selection helper (top=5 presumably keeps the 5 best features per method — confirm).
# The target column is "Y" (renamed from "Survived" earlier), so y must be "Y";
# dtf has no "Survived" column at this point.
dic_feat_sel = features_selection(dtf, y="Y", top=5, figsize=(10,5))

In [None]:
# Display the feature-selection result (the "join" entry is used in the next cell).
dic_feat_sel

In [None]:
# Keep only the primary key, the selected features, and the target.
# The target column is "Y" (renamed from "Survived" earlier); selecting "Survived" would raise a KeyError.
dtf = dtf[["PassengerId"]+dic_feat_sel["join"]+["Y"]]

In [None]:
# Partition and scale the reduced dataset through the ml_utils preprocessing helper
# (split=0.3 is presumably the test-set fraction and scale="standard" standard scaling — confirm).
# The target column is "Y" (renamed from "Survived" earlier), so y must be "Y";
# dtf has no "Survived" column at this point.
dic_data = data_preprocessing(dtf, pk="PassengerId", y="Y", task="classification",
                              processNas=None, processCategorical=None, split=0.3, scale="standard")

# Unpack the pieces of the returned dictionary used by the modelling cells.
X_names = dic_data["X_names"]
X_train, X_test = dic_data["X"]
Y_train, Y_test = dic_data["Y"]
scaler, _ = dic_data["scaler"]  # second element of the "scaler" entry is not used here

###### Train

In [None]:
# Hyper-parameter search space for tuning the gradient boosting classifier.
# NOTE(review): max_features goes up to 7; confirm the selected feature set has at least that many columns.
param_dic = dict(
    # weighting factor for the corrections made by new trees when added to the model
    learning_rate=[0.15, 0.1, 0.05, 0.01, 0.005, 0.001],
    # number of trees added to the model
    n_estimators=[100, 250, 500, 750, 1000, 1250, 1500, 1750],
    # maximum depth of each tree
    max_depth=[2, 3, 4, 5, 6, 7],
    # minimum number of samples required to split a node
    min_samples_split=[2, 4, 6, 8, 10, 20, 40, 60, 100],
    # minimum number of samples required to form a leaf
    min_samples_leaf=[1, 3, 5, 7, 9],
    # the square root of the number of features is usually a good starting point
    max_features=[2, 3, 4, 5, 6, 7],
    # fraction of samples used to fit each base learner; values below 1
    # generally reduce variance and increase bias
    subsample=[0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1],
)

In [None]:
# Tune the baseline estimator over param_dic with the ml_utils tuning helper, scoring by accuracy.
# NOTE(review): if n_iter and cv are forwarded to a randomized cross-validated search, n_iter=1000
# with cv=10 means up to 10,000 model fits — confirm the runtime is acceptable.
model = model_tuning(X_train, Y_train, baseline, param_dic, scoring="accuracy", 
                     searchtype="RandomSearch", n_iter=1000, cv=10, figsize=(10,5))

In [None]:
# Fit the tuned model via the ml_utils helper, passing both data splits
# (Y_threshold=0.5 is presumably the probability cut-off for the predicted class — confirm).
dic_model = fit_classif_model(model, X_train, Y_train, X_test, Y_test, Y_threshold=0.5)

# Pull out the fitted model and the prediction outputs needed by the evaluation cell.
model = dic_model["model"]
predicted_prob = dic_model["predicted_prob"]
predicted = dic_model["predicted"]

###### Evaluate

In [None]:
# Evaluate the tuned model's predictions against the test labels (ml_utils helper).
evaluate_model(Y_test, predicted, predicted_prob, figsize=(20,5))