In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.decomposition import PCA

import gender_guesser.detector as gender

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import precision_recall_fscore_support

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# READ DATA

In [None]:
# Load the training set and raise the display limit so head() shows every column.
df = pd.read_csv('data/train.csv')
pd.set_option('display.max_columns', df.shape[1])
df.head()

In [None]:
# Report the row count of the frame as loaded, before any rows are dropped.
print(f"Total number of observations is {len(df)}")

# MISSING DATA

In [None]:
# Visual overview of where values are missing across the frame.
msno.matrix(df)

In [None]:
# Per-column missing-value summary: absolute count and share of rows,
# each sorted from most to least missing.
null_counts = df.isnull().sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / len(df)).sort_values(ascending=False)
missing_data = pd.concat({'Total': total, 'Percent': percent}, axis=1)
missing_data.head(10)

In [None]:
# Keep only complete rows: a record with at least one missing field is discarded.
df = df.dropna(axis=0, how='any')
print(f"New number of observations is {len(df)}")

# DATA CLEANING

In [None]:
# Encode the target as integers so it is treated as a numeric column from here on.
df = df.astype({'Transported': int})

In [None]:
def get_num_cat_var(df):
    """Split the column names of ``df`` into numeric and non-numeric groups.

    A column counts as numeric when its dtype is integer, float, complex or
    boolean; everything else is reported as categorical.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified. It is not modified.

    Returns
    -------
    tuple[list, list]
        ``(num_var, cat_var)``. Both lists follow the column order of ``df``,
        so anything iterating over them (e.g. plotting loops) is reproducible
        from run to run.
    """
    # Public dtype check instead of the private DataFrame._get_numeric_data().
    num_var = [
        col for col, dtype in df.dtypes.items()
        if pd.api.types.is_numeric_dtype(dtype)
    ]

    # A set is used only for membership tests; the result is built by walking
    # df.columns so the order stays deterministic (a plain set difference
    # yields an arbitrary order for string labels).
    numeric = set(num_var)
    cat_var = [col for col in df.columns if col not in numeric]

    return num_var, cat_var

# Initial split on the cleaned frame; it is recomputed after each feature-engineering cell below.
num_var, cat_var = get_num_cat_var(df)

In [None]:
# Number of distinct values per numeric column, highest first.
df[num_var].nunique().sort_values(ascending=False)

In [None]:
# Number of distinct values per non-numeric column, highest first.
df[cat_var].nunique().sort_values(ascending=False)

In [None]:
# Keep the integer that follows the '_' in PassengerId as 'Number', then drop the raw id.
df['Number'] = df['PassengerId'].map(lambda pid: int(pid.split('_')[1]))
df = df.drop(columns=['PassengerId'])
num_var, cat_var = get_num_cat_var(df)

In [None]:
# Re-check non-numeric cardinalities now that PassengerId has been replaced by 'Number'.
df[cat_var].nunique().sort_values(ascending=False)

In [None]:
# Derive a 'Gender' label from the first whitespace-separated token of each name,
# then drop the raw name column.
d = gender.Detector()
df['Gender'] = df['Name'].map(lambda full_name: d.get_gender(full_name.split()[0]))
df = df.drop(columns=['Name'])
num_var, cat_var = get_num_cat_var(df)

In [None]:
# Re-check non-numeric cardinalities now that Name has been replaced by 'Gender'.
df[cat_var].nunique().sort_values(ascending=False)

In [None]:
# Cabin is a '/'-separated string: the first field is stored as 'Cabin deck' and
# the third as 'Cabin side'; the raw column is then dropped.
cabin_parts = df['Cabin'].map(lambda cabin: cabin.split('/'))
df['Cabin deck'] = cabin_parts.map(lambda parts: parts[0])
df['Cabin side'] = cabin_parts.map(lambda parts: parts[2])
df = df.drop(columns=['Cabin'])
num_var, cat_var = get_num_cat_var(df)

In [None]:
# Non-numeric cardinalities after Cabin has been split into deck and side.
df[cat_var].nunique().sort_values(ascending=False)

In [None]:
# Numeric cardinalities after feature engineering (now includes 'Number').
df[num_var].nunique().sort_values(ascending=False)

# CORRELATION

In [None]:
# Correlate the numeric columns only. The frame still holds string columns at
# this point, and from pandas 2.0 DataFrame.corr() no longer drops them
# silently (numeric_only defaults to False) but raises instead. Selecting
# num_var explicitly works on every pandas version.
corr_mat = df[num_var].corr()

f, ax = plt.subplots(figsize=(12, 9))
# Fix the colour scale to the full [-1, 1] range so colours are comparable across runs.
sns.heatmap(corr_mat, vmax=1, vmin=-1, ax=ax)
ax.set_title('Correlation between numeric features')

# DATA EXPLORATION

In [None]:
# Class balance of the target.
df['Transported'].value_counts()

In [None]:
# Bar chart of the class balance, smallest class first.
df['Transported'].value_counts().sort_values().plot.bar()

In [None]:
# One histogram (with KDE overlay) per numeric feature, coloured by the target.
for var in num_var:
    if var == 'Transported':
        # The target is only used for colouring; it gets no plot of its own.
        continue
    f, ax = plt.subplots(figsize=(20, 5))
    fig = sns.histplot(data=df, x=var, bins=25, kde=True, hue='Transported', ax=ax)

In [None]:
# One count plot per non-numeric feature, coloured by the target.
for var in cat_var:
    if var == 'Transported':
        # Defensive guard: skip the target should it ever be listed here.
        continue
    f, ax = plt.subplots(figsize=(20, 5))
    fig = sns.countplot(data=df, x=var, hue='Transported', ax=ax)

# DATA PREPARATION

In [None]:
# Re-apply the display limit (the column set changed during cleaning) and preview the frame.
pd.set_option('display.max_columns', df.shape[1])
df.head()

In [None]:
# One-hot encode with pandas' default get_dummies settings, then preview the widened frame.
df = pd.get_dummies(df)
pd.set_option('display.max_columns', df.shape[1])
df.head()

In [None]:
# Target as a one-column frame; every other column is a feature.
y = df.loc[:, ['Transported']]
X = df.drop(columns='Transported')

In [None]:
# Hold out 20% of the rows for testing, stratified on the target so both
# splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y['Transported'],
    random_state=42,
)

In [None]:
# Standardise the features with statistics estimated on the training split
# only, so nothing about the test split leaks into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)

# scaler.transform returns a bare array. Rebuild the frames with the original
# row labels: a fresh 0..n-1 index would leave X_train / X_test out of step
# with y_train / y_test for any label-aligned operation (joins, concat,
# boolean masks).
X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

X_train.head()

In [None]:
# Project the scaled training features onto their first two principal
# components to get a 2-D view of how separable the classes are.
pca = PCA(n_components=2)
res = pca.fit_transform(X_train)

f, ax = plt.subplots(figsize=(12, 9))
# x and y must be passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so scatterplot(res[:, 0], res[:, 1], ...) raises a TypeError.
sns.scatterplot(x=res[:, 0], y=res[:, 1], hue=y_train['Transported'], ax=ax)
ax.set(xlabel='Principal component 1', ylabel='Principal component 2')

# MACHINE LEARNING

## Random Forest

In [None]:
rfr = RandomForestClassifier(random_state=42)

param_grid = {
    # Every integer depth from 2 to 10. np.linspace(2, 10, 10, dtype=int)
    # truncates to [2, 2, 3, ..., 10], which lists depth 2 twice and so makes
    # it twice as likely to be sampled as any other depth.
    'max_depth': list(range(2, 11)),
    'n_estimators': list(np.linspace(100, 500, 10, dtype=int)),
}

# Seed the search as well as the estimator; otherwise the 5 sampled
# candidates, and therefore best_params_, change on every run.
cv_rfc = RandomizedSearchCV(rfr, param_grid, n_iter=5, random_state=42, verbose=1)
# The estimator expects a 1-D label array, hence the ravel of the one-column frame.
cv_rfc.fit(X_train, y_train.values.ravel())

cv_rfc.best_params_

In [None]:
# Score the tuned forest on the held-out split (binary averaging, i.e. metrics
# are reported for the positive label).
y_pred = cv_rfc.predict(X_test)
scores_rfc = precision_recall_fscore_support(y_test, y_pred, average='binary')
precision_rfc, recall_rfc, f1_score_rfc = scores_rfc[:3]

print(f'F1 score: {f1_score_rfc:.3f} | Precision: {precision_rfc:.3f} | Recall: {recall_rfc:.3f}')

In [None]:
plt.rcParams["figure.figsize"] = (20, 5)

# Pair each feature name with its importance in the best forest found by the
# search, then plot the ten largest.
fi_rfc = pd.DataFrame({
    'Feature': X.columns,
    'Importance': cv_rfc.best_estimator_.feature_importances_,
})
sorted_fi_rfc = fi_rfc.sort_values(by='Importance', ascending=False).iloc[:10]

sorted_fi_rfc.plot.bar(x='Feature', y='Importance', rot=0)

In [None]:
# --- XGBoost ---
# Same tuning procedure as for the random forest, over XGBoost's hyper-parameters.
xgbc = XGBClassifier(random_state=42)

param_grid = {
    'max_depth': list(np.linspace(3, 11, 5, dtype=int)),
    'n_estimators': list(np.linspace(100, 500, 5, dtype=int)),
    'learning_rate': list(np.linspace(0.01, 0.3, 5)),
    'colsample_bytree': list(np.linspace(0.5, 1, 5)),
    'subsample': list(np.linspace(0.6, 1, 5)),
}

# Seed the search as well as the estimator; otherwise the 5 sampled
# candidates, and therefore best_params_, change on every run.
cv_xgbc = RandomizedSearchCV(xgbc, param_grid, n_iter=5, random_state=42, verbose=1)
# The estimator expects a 1-D label array, hence the ravel of the one-column frame.
cv_xgbc.fit(X_train, y_train.values.ravel())

cv_xgbc.best_params_

In [None]:
# Score the tuned XGBoost model on the held-out split (binary averaging, i.e.
# metrics are reported for the positive label).
y_pred = cv_xgbc.predict(X_test)
precision_xgbc, recall_xgbc, f1_score_xgbc, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

# Each label is paired with its own metric, in the same order as the
# random-forest report. Previously the three values were printed under the
# wrong labels (precision under "F1 score", recall under "Precision", F1
# under "Recall").
print(f'F1 score: {f1_score_xgbc:.3f} | Precision: {precision_xgbc:.3f} | Recall: {recall_xgbc:.3f}')

In [None]:
plt.rcParams["figure.figsize"] = (20, 5)

# Pair each feature name with its importance in the best XGBoost model found
# by the search, then plot the ten largest.
fi_xgbc = pd.DataFrame({
    'Feature': X.columns,
    'Importance': cv_xgbc.best_estimator_.feature_importances_,
})
sorted_fi_xgbc = fi_xgbc.sort_values(by='Importance', ascending=False).iloc[:10]

sorted_fi_xgbc.plot.bar(x='Feature', y='Importance', rot=0)