# Kaggle: Titanic Dataset (Classification Problem)

Install the `kaggle` CLI from PyPI, upload `kaggle.json` (your Kaggle API key) and create the `~/.kaggle` directory that the CLI reads it from.

In [None]:
!pip install -q kaggle
from google.colab import files, drive
files.upload()
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c titanic
!mkdir titanic
!unzip titanic.zip -d titanic


# Download dataset and pip install necessary packages

In [None]:
# Third-party packages used further down: autoviz, BorutaShap and featurewiz are
# imported in the imports cell.
# NOTE(review): auto-sklearn and lazypredict are installed here but never imported
# in this notebook - confirm whether they are still needed.
!pip install autoviz
!pip install BorutaShap
!pip install featurewiz
!pip install auto-sklearn
!pip install lazypredict

In [None]:
# scikit-optimize provides the `skopt` module (BayesSearchCV and the search-space types).
!pip install scikit-optimize


# Import packages and set matplotlib styling

In [None]:
import pandas as pd
from pandas import Index
import numpy as np
import random
from sklearn.preprocessing import Normalizer, LabelEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, learning_curve, cross_val_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from sklearn.covariance import EllipticEnvelope
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from typing import List, Tuple
from autoviz import data_cleaning_suggestions, AutoViz_Class
from xgboost import XGBClassifier
from BorutaShap import BorutaShap
from featurewiz import FeatureWiz
from skopt.space import Real, Categorical, Integer
from skopt import BayesSearchCV

%matplotlib inline

import matplotlib.pylab as pylab
plt.style.use('seaborn-notebook')
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)
sns.set_style('dark')
pylab.rcParams['figure.figsize'] = 12, 8


# Set our random seed and read our csvs

In [None]:
### SET DEFAULTS ###
# One seed shared by numpy, the stdlib RNG and every estimator that reads `seed`.
seed = 42
np.random.seed(seed)
random.seed(seed)

### READ DATA ###
# Directory the competition archive was unzipped into; change this one value when
# running outside Colab.
DATA_DIR = '/content/titanic'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
# Kaggle's example submission; reused later as the skeleton of our own submission file.
submission_template = pd.read_csv(f'{DATA_DIR}/gender_submission.csv')

# Get data cleaning suggestions

Using autoviz package, we can easily get data cleaning suggestions

In [None]:
### GET DATA CLEANING SUGGESTIONS ###

# Last expression of the cell: the per-column suggestion table is rendered as the cell output.
data_cleaning_suggestions(train)


Data cleaning improvement suggestions. Complete them before proceeding to ML modeling.


Unnamed: 0,Nuniques,dtype,Nulls,Nullpercent,NuniquePercent,Value counts Min,Data cleaning improvement suggestions
PassengerId,891,int64,0,0.0,100.0,0,possible ID column: drop
Name,891,object,0,0.0,100.0,1,"combine rare categories, possible ID column: drop"
Ticket,681,object,0,0.0,76.43,1,combine rare categories
Fare,248,float64,0,0.0,27.83,0,skewed: cap or drop outliers
Cabin,147,object,687,77.1,16.5,1,"combine rare categories, fill missing, fix mixed data types"
Age,88,float64,177,19.87,9.88,0,fill missing
SibSp,7,int64,0,0.0,0.79,0,
Parch,7,int64,0,0.0,0.79,0,
Pclass,3,int64,0,0.0,0.34,0,
Embarked,3,object,2,0.22,0.34,77,"fill missing, fix mixed data types"


# Run AutoViz

In [None]:
### AUTOVIZ OUR DATA TO GET IDEAS ###

%matplotlib inline

# The training frame is handed over in memory through `dfte`, so `filename` is left
# empty. `depVar` names the target column.
# NOTE(review): `dft` is presumably the frame AutoViz worked on; it is not used again
# in this notebook.
dft = AutoViz_Class().AutoViz(
    filename="",
    depVar="Survived",
    dfte=train,
    verbose=0,
)


# Data wrangling funcs

- Drop irrelevant columns (thanks autoviz!)
- fill missing data:
  - fill categorical features with the mode
  - fill numerical features with a prediction made by iterative imputer
- Eliminate anomalies
  - EllipticEnvelope to detect outliers
  - KNNImputer to fill outliers

- Wrangle!
  - Return the wrangled dataframe and the target variable column if handling the train df
  

In [None]:
def drop_cols(df: pd.DataFrame, cols_to_drop: List[str]) -> pd.DataFrame:
  """Return a copy of ``df`` without the columns in ``cols_to_drop``.

  The input frame is left untouched. pandas raises ``KeyError`` if any of the
  requested columns is missing.
  """
  print(f'Feature: dropping {cols_to_drop}')
  trimmed = df.drop(columns=cols_to_drop)
  print(f'Feature: {cols_to_drop} were dropped')
  return trimmed

def fill_missing_data(df: pd.DataFrame, cat_features: List[str], num_features: List[str], fill: List[str]) -> pd.DataFrame:
  """Return a copy of ``df`` with the missing values of the ``fill`` columns imputed.

  Args:
    df: frame to impute; it is not modified.
    cat_features: names of categorical columns; those listed in ``fill`` get the
      column mode.
    num_features: names of numeric columns; those listed in ``fill`` are imputed
      with an ``IterativeImputer`` driven by a small random forest.
    fill: columns whose missing values should be filled.

  Returns:
    The imputed copy. Columns not listed in ``fill`` are returned unchanged.

  Note:
    The imputers are fitted on whatever frame is passed in, so train and test
    frames are imputed from their own statistics.
  """
  raw = df.copy(deep=True)

  # IterativeImputer predicts one feature from the *others*. Given a single column
  # it has nothing to regress on and simply returns its initial (mean) imputation,
  # so the numeric features are imputed jointly, once, before the per-column loop.
  imputed_nums = None
  if any(col in num_features for col in fill):
    num_present = [col for col in num_features if col in raw.columns]
    estimator = RandomForestRegressor(n_estimators=4, max_depth=10,
                                      bootstrap=True, max_samples=0.5,
                                      n_jobs=2, random_state=seed)
    imputer = IterativeImputer(random_state=seed, estimator=estimator,
                               max_iter=25, tol=1e-1)
    imputed_nums = pd.DataFrame(imputer.fit_transform(raw[num_present]),
                                columns=num_present, index=raw.index)

  for col in fill:
    if col in cat_features:
        data = raw[col].values.reshape(-1, 1)
        # ravel(): fit_transform returns an (n, 1) array; assign a flat column instead.
        raw[col] = SimpleImputer(strategy='most_frequent').fit_transform(data).ravel()
        print(f'Feature: {col} filled with mode')
    if col in num_features:
        # Only the requested column is written back; observed values are unchanged.
        raw[col] = imputed_nums[col]
        print(f'Feature: {col} filled with iterative imputer')
  return raw

def handle_anomalies(df: pd.DataFrame, anomalous_cols: List[str]) -> pd.DataFrame:
  """Return a copy of ``df`` where outliers in ``anomalous_cols`` are replaced by imputed values.

  For each column an ``EllipticEnvelope`` is fitted on that column alone, and every
  value it labels ``-1`` (outlier) is blanked out and then re-filled by a
  ``KNNImputer`` fitted on the same single column.

  NOTE(review): with its default ``contamination`` (0.1) EllipticEnvelope flags
  roughly 10% of the rows whatever the data looks like, and it assumes
  Gaussian-shaped data. A KNNImputer that only sees one column has no other
  feature to measure neighbour distance with, so it effectively falls back to the
  column mean - confirm this is the intended treatment.

  Args:
    df: frame to clean; it is not modified.
    anomalous_cols: numeric columns to screen for outliers.
  """
  raw = df.copy(deep=True)
  for col in anomalous_cols:
    data = raw[col].values.reshape(-1, 1)
    outlier_mask = EllipticEnvelope().fit(data).predict(data) == -1
    # Boolean mask instead of the former `raw[col][positions] = nan`: chained
    # assignment may write to a temporary copy, and the positions were being used
    # as index labels, which is only right for a default RangeIndex.
    raw[col] = raw[col].mask(outlier_mask)
    raw[col] = KNNImputer().fit_transform(raw[[col]].values).ravel()
    print(f'Feature: Outliers in {col} eliminated')
  return raw

def wrangle(df: pd.DataFrame, drops: List[str], fills: List[str], cat_feats: List[str], 
            num_feats: List[str], anomalies: List[str], target: str, 
            mode: str='train') -> Tuple[pd.DataFrame, pd.Series]:
  """Run the cleaning steps (drop -> impute -> outlier handling) on a copy of ``df``.

  Args:
    df: raw frame; it is not modified.
    drops: columns to remove.
    fills: columns whose missing values are imputed.
    cat_feats: categorical column names (mode imputation).
    num_feats: numeric column names (iterative imputation).
    anomalies: columns screened for outliers.
    target: name of the label column.
    mode: ``'train'`` splits the label column off the frame; any other value
      leaves the frame as is.

  Returns:
    ``(features, labels)``. ``labels`` is the popped ``target`` column in train
    mode and ``None`` otherwise.
  """
  # drop_cols works on a copy and logs the dropped columns itself, so neither an
  # extra copy nor a second "were dropped" message is needed here.
  df_dropped = drop_cols(df, drops)
  df_no_na = fill_missing_data(df_dropped, cat_feats, num_feats, fills)
  # Report the columns that were actually imputed, not every known feature.
  print(f'Feature: Filled missing data in {fills}')
  df_no_anomalies = handle_anomalies(df_no_na, anomalies)
  print(f'Feature: Eliminated outliers in {anomalies}')
  # Separate name so the `target` argument (a column name) is not overwritten by a Series.
  labels = df_no_anomalies.pop(target) if mode == 'train' else None
  print(f'\n========== {mode} cleaning complete ==========\n')
  return df_no_anomalies, labels


# Wrangling

- **Drop**: ID-type data
- **Fill**: Age, Embarked, Fare
  - These columns have potentially relevant data that we want to impute
- **Anomalies**: 
  - Based on AutoViz, we can see that Fare data has anomalies and appears to be normally distributed, thus we use EllipticEnvelope to detect and fix outliers. Fare data is potentially highly relevant to our prediction
- cat_features: categorical features
- num_features: discrete and continuous features
- target: Survived

- wrangle both our X_train and X_test 


In [None]:
# ID-type columns removed before modelling.
to_drop = ['PassengerId', 'Ticket']

# Columns whose missing values get imputed.
to_fill = ['Age', 'Embarked', 'Fare']

# Columns screened for outliers.
anomalies = ['Fare']

# Feature typing drives which imputer a column goes through.
cat_features = ['Embarked', 'Sex']
num_features = ['Fare', 'Age', 'SibSp', 'Parch', 'Pclass']

target = 'Survived'

# Train mode splits the label off; the test frame has no label column, so the
# second element of its result is discarded.
X_train, y_train = wrangle(
    train, to_drop, to_fill, cat_features, num_features, anomalies, target
)
X_test, _ = wrangle(
    test, to_drop, to_fill, cat_features, num_features, anomalies, target, mode='test'
)


Feature: dropping ['PassengerId', 'Ticket']
Feature: ['PassengerId', 'Ticket'] were dropped
Feature: ['PassengerId', 'Ticket'] were dropped
Feature: Age filled with iterative imputer
Feature: Embarked filled with mode
Feature: Fare filled with iterative imputer
Feature: Filled missing data in ['Embarked', 'Sex'] and ['Fare', 'Age', 'SibSp', 'Parch', 'Pclass']
Feature: Outliers in Fare eliminated
Feature: Eliminated outliers in ['Fare']


Feature: dropping ['PassengerId', 'Ticket']
Feature: ['PassengerId', 'Ticket'] were dropped
Feature: ['PassengerId', 'Ticket'] were dropped
Feature: Age filled with iterative imputer
Feature: Embarked filled with mode
Feature: Fare filled with iterative imputer
Feature: Filled missing data in ['Embarked', 'Sex'] and ['Fare', 'Age', 'SibSp', 'Parch', 'Pclass']
Feature: Outliers in Fare eliminated
Feature: Eliminated outliers in ['Fare']




# Feature Engineering

## Hypothesis: Wealth -> survival

### map_cabin:
  - Does this passenger have a cabin? 

### get_family_features:
  - How big is this person's family?
  - Is this passenger alone?

### get_title_features:
  - using regex, we extract passenger titles
  - Group upper class to 'noble' (we also have a ticket class, but this way we can further separate classes, as some upper class passengers may be much wealthier than others)
  - Further group similar titles (i.e. Miss -> Ms)
  - Encode these groups of titles

### get_bands: 
- Based on a set of bins, our goal is to create a bunch of integer labels from our continuous variables
- create labels using FunctionTransformer on pd.cut, and encode them to int64 type.
- Note: Labels are not applied in order (i.e. the bin of 15 < Age <= 30 is mapped to 0) probably due to our transformation. Might make more sense in the future to just use .map()
- Age:
  - Age is a continuous variable, so we want to turn it into a discrete variable.
- Fare:
  - Fare is continuous as well

### get_gender:
- Create a gender dummy variable

### map_embarked:
  - Map the port this passenger embarked from using a label encoder

### rename_cols:
  - rename remaining columns to fit PEP-style naming conventions

### build_features:
  - Run all of these functions to build our features
  - Normalize if chosen to do so


In [None]:
### Feature Engineering ###

def map_cabin(df: pd.DataFrame):
  """Add a 0/1 ``has_cabin`` column and drop ``Cabin``. Mutates ``df`` in place.

  ``has_cabin`` is 1 when a cabin value is present and 0 when it is missing.
  """
  # notna() treats every missing marker (NaN and None) alike; the former
  # `type(x) == float` test counted None as "has a cabin".
  df['has_cabin'] = df['Cabin'].notna().astype(int)
  df.drop(columns=['Cabin'], inplace=True)
  print('Feature:  has_cabin dummy variable created, Cabin col dropped')

def get_family_features(df: pd.DataFrame):
  """Add ``family_size`` and ``is_alone`` columns. Mutates ``df`` in place.

  ``family_size`` counts the passenger plus siblings/spouses (``SibSp``) and
  parents/children (``Parch``); ``is_alone`` is 1 when that count is exactly 1.
  """
  df['family_size'] = df['SibSp'] + df['Parch'] + 1
  # family_size includes the passenger, so it is never below 1. The previous
  # `family_size > 0` test therefore cleared the flag for every row.
  df['is_alone'] = (df['family_size'] == 1).astype(int)
  print('Feature:  family_size and is_alone dummy created')

def get_title_features(df: pd.DataFrame):
  """Derive an integer ``title`` column from ``Name`` and drop ``Name``. Mutates ``df`` in place.

  The title is the first run of letters followed by a period. Rare titles are
  pooled into ``'noble'``, spelling variants are merged (Mlle/Miss -> Ms,
  Mme -> Mrs), and the result is encoded with a fixed table:
  Master=0, Mr=1, Mrs=2, Ms=3, noble=4. Names without a recognisable title, and
  titles outside the table, also land in the ``noble`` bucket.
  """
  rare_titles = ['Don', 'Dona', 'Rev', 'Dr', 'Major', 'Lady', 'Sir', 'Col',
                 'Capt', 'Countess', 'Jonkheer']
  canonical = {title: 'noble' for title in rare_titles}
  canonical.update({'Mlle': 'Ms', 'Miss': 'Ms', 'Mme': 'Mrs'})
  # Fixed codes instead of a LabelEncoder fitted on each frame: a refit encoder
  # numbers whatever titles that frame happens to contain, so one extra title in
  # the test set would shift every code relative to the training set. These codes
  # match what a sorted encoding of the five groups produces.
  title_codes = {'Master': 0, 'Mr': 1, 'Mrs': 2, 'Ms': 3, 'noble': 4}

  # Raw string: '\.' is not a valid escape sequence in a normal string literal.
  titles = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
  titles = titles.replace(canonical)
  df['title'] = titles.map(title_codes).fillna(title_codes['noble']).astype(int)
  df.drop(columns=['Name'], inplace=True)
  print('Feature:  title feature created and encoded using a fixed title mapping')

def get_bands(df: pd.DataFrame, col: str, bins: List[float], labels: List[int]):
  """Replace continuous column ``col`` by an integer band column named ``col.lower()``. Mutates ``df`` in place.

  Args:
    df: frame holding ``col``.
    col: continuous column to discretise.
    bins: ascending bin edges; ``len(bins) - 1`` bands are produced.
    labels: integer label for each band, in bin order.

  Raises:
    AttributeError: if the number of labels does not match the number of bands.
    ValueError: if ``col`` still contains missing values.
  """
  if len(bins) - 1 != len(labels):
    raise AttributeError('The len of your bins is not equal to the len of your labels')
  if df[col].isna().any():
    raise ValueError(f'{col} contains missing values; impute them before banding')

  # Values outside the outer edges (e.g. a test fare above the training maximum)
  # are pulled into the first/last band instead of becoming NaN.
  clipped = df[col].clip(lower=bins[0], upper=bins[-1])
  # include_lowest=True: pd.cut intervals are open on the left by default, so a
  # value equal to bins[0] (the minimum, when bins come from percentiles) would
  # otherwise fall outside every band.
  banded = pd.cut(clipped, bins=bins, labels=labels, include_lowest=True, retbins=False)
  # The labels are used directly. Re-encoding them with a LabelEncoder fitted on
  # the observed values renumbers the bands whenever one is empty and turned NaN
  # into an extra, highest code.
  df[col.lower()] = banded.astype(int)
  if col.lower() != col:
    # Guard: for an already lower-case name the new column *is* `col`.
    df.drop(columns=[col], inplace=True)
  print(f'Feature:  {col.lower()} feature was created using bands, {col} dropped')

def get_gender(df: pd.DataFrame):
  """Add a ``gender`` dummy (1 for ``'male'``, 0 for anything else) and remove ``Sex``. Mutates ``df`` in place."""
  is_male = df['Sex'].eq('male')
  df['gender'] = is_male.astype(int)
  del df['Sex']
  print('Feature:  gender dummy created')

def map_embarked(df: pd.DataFrame):
  """Encode ``Embarked`` as an integer ``embarked`` column and drop ``Embarked``. Mutates ``df`` in place.

  Ports are encoded with a fixed table (C=0, Q=1, S=2); any other value,
  including a missing one, becomes -1.
  """
  # Fixed category order rather than a LabelEncoder fitted on this frame: a refit
  # encoder numbers only the ports present, so a frame lacking one port would get
  # different codes. On data with all three ports the codes are the same as before.
  ports = ['C', 'Q', 'S']
  df['embarked'] = pd.Categorical(df['Embarked'], categories=ports).codes.astype(int)
  df.drop(columns=['Embarked'], inplace=True)
  print('Feature:  embarked feature created using a fixed port mapping')

def rename_cols(df: pd.DataFrame):
  """Rename the remaining original columns to snake_case names. Mutates ``df`` in place."""
  snake_case_names = {
      'Pclass': 'class',
      'SibSp': 'sibs_sps',
      'Parch': 'par_ch',
  }
  df.rename(columns=snake_case_names, inplace=True)

def build_features(df: pd.DataFrame, age_bins: List[float], 
                   age_labels: List[int], fare_bins: List[float],
                   fare_labels: List[int], norm: bool=False) -> pd.DataFrame:
  """Run every feature-engineering step on ``df`` and return the resulting frame.

  The individual steps modify ``df`` in place, so the caller's frame is changed
  as well. Because the source columns are dropped along the way, the function
  cannot be run twice on the same frame.

  Args:
    df: wrangled frame still holding Cabin, SibSp, Parch, Name, Age, Fare, Sex,
      Embarked and Pclass.
    age_bins, age_labels: bin edges and integer labels for ``Age``.
    fare_bins, fare_labels: bin edges and integer labels for ``Fare``.
    norm: when True, return a standardised copy (zero mean, unit variance per column).

  Returns:
    The engineered frame: ``df`` itself when ``norm`` is False, a new
    standardised frame otherwise.

  Note:
    With ``norm=True`` a new ``StandardScaler`` is fitted on every call, so each
    frame is scaled by its own statistics.
  """
  map_cabin(df)
  get_family_features(df)
  get_title_features(df)
  get_bands(df, 'Age', age_bins, age_labels)
  get_bands(df, 'Fare', fare_bins, fare_labels)
  get_gender(df)
  map_embarked(df)
  rename_cols(df)
  if norm:
    # Keep the original index: rebuilding the frame from a bare array would reset
    # it to 0..n-1 and break alignment with the labels if the index is not default.
    df = pd.DataFrame(StandardScaler().fit_transform(df),
                      columns=df.columns, index=df.index)
  print('\n\t========== Feature Engineering Completed ==========\n')
  return df

# Build our new DF with features
- Age bins:
  - Age is heavily distributed around age 30, which is roughly the mean
  - Age also has a short left tail and long right tail

- Fare bins:
  - Grouped by percentile. Will probably use pd.qcut in the future for this.

In [None]:
age_bins = [0, 15, 30, 45, 60, np.inf]
age_labels = list(range(len(age_bins) - 1))

# Fare bands are the quartiles of the *training* fares, reused as-is for the test set.
fare_bins = np.percentile(X_train.Fare, [0, 25, 50, 75, 100])
fare_labels = list(range(len(fare_bins) - 1))

norm = True

# Build the raw features first and normalise afterwards: letting build_features
# normalise would fit a separate scaler on each frame, so the test set would be
# standardised with its own mean/std instead of the training statistics.
X_train = build_features(X_train, age_bins, age_labels, fare_bins, fare_labels, norm=False)
X_test = build_features(X_test, age_bins, age_labels, fare_bins, fare_labels, norm=False)
assert list(X_train.columns) == list(X_test.columns), 'train/test feature columns differ'

if norm:
  scaler = StandardScaler().fit(X_train)
  X_train = pd.DataFrame(scaler.transform(X_train),
                         columns=X_train.columns, index=X_train.index)
  X_test = pd.DataFrame(scaler.transform(X_test),
                        columns=X_test.columns, index=X_test.index)

Feature:  has_cabin dummy variable created, Cabin col dropped
Feature:  family_size and is_alone dummy created
Feature:  title feature created and encoded using LabelEncoder
Feature:  age feature was created using bands, Age dropped
Feature:  fare feature was created using bands, Fare dropped
Feature:  gender dummy created
Feature:  embarked feature created using LabelEncoder


Feature:  has_cabin dummy variable created, Cabin col dropped
Feature:  family_size and is_alone dummy created
Feature:  title feature created and encoded using LabelEncoder
Feature:  age feature was created using bands, Age dropped
Feature:  fare feature was created using bands, Fare dropped
Feature:  gender dummy created
Feature:  embarked feature created using LabelEncoder




# Feature Selection



In [None]:
# bs_feature_selector = BorutaShap(model=XGBClassifier(),
#                               importance_measure='shap',
#                               classification=True)
# bs_feature_selector.fit(X=X_train, 
#                         y=y_train,
#                         n_trials=500, 
#                         random_state=seed, 
#                         verbose=True)
# X_train_selected = bs_feature_selector.Subset()
# X_test_selected = X_test[bs_feature_selector.accepted]
                              
# Feature selection with featurewiz. The empty `feature_engg` / `category_encoders`
# arguments make it skip feature engineering and category encoding (see the log
# printed below), leaving only the selection step with a 0.7 correlation limit.
features = FeatureWiz(corr_limit=0.7, 
                      feature_engg='', 
                      category_encoders='',
                      dask_xgboost_flag=True,
                      nrows=None,
                      verbose=0)
# Fit the selector on the training data only, then apply the same selection to the test set.
X_train_selected = features.fit_transform(X_train, y_train)
X_test_selected = features.transform(X_test)



wiz = FeatureWiz(verbose=1)
        X_train_selected = wiz.fit_transform(X_train, y_train)
        X_test_selected = wiz.transform(X_test)
        wiz.features  ### provides a list of selected features ###            
        
############################################################################################
############       F A S T   F E A T U R E  E N G G    A N D    S E L E C T I O N ! ########
# Be judicious with featurewiz. Don't use it to create too many un-interpretable features! #
############################################################################################
Correlation Limit = 0.7
Skipping feature engineering since no feature_engg input...
Skipping category encoding since no category encoders specified in input...
    Since dask_xgboost_flag is True, reducing memory size and loading into dask
    Loaded train data. Shape = (891, 12)
#### Single_Label Binary_Classification problem ####
No test data filename given...
####################################

# SKOPT Search Space

We set the search space for our `BayesSearchCV`.

In [None]:
# Ranges explored by BayesSearchCV; the keys are RandomForestClassifier
# constructor arguments.
search_space = dict(
    n_estimators=Integer(100, 1200),
    max_depth=Integer(5, 30),
    min_samples_split=Integer(2, 100),
    min_samples_leaf=Integer(1, 10),
    bootstrap=Categorical([True, False]),
    max_features=Categorical(['log2', 'sqrt']),
)

In [None]:
# skf = StratifiedKFold(n_splits=int(np.floor(1 + np.log2(len(X_train)))))

# Hand-picked hyper-parameters for the model used for the submission.
rf = RandomForestClassifier(n_estimators=433,
                            max_depth=5,
                            random_state=seed,
                            min_samples_leaf=2,
                            min_samples_split=18
                            )

# `X_` was never defined (NameError on a fresh kernel); the model is trained on
# the selected features, which is also what it is scored on further down.
rf.fit(X_train_selected, y_train)

# Seeded so the search is repeatable, and run on the same selected features so
# its cross-validated score is comparable with the model above.
# NOTE(review): the search result is only printed here; `rf` itself is presumably
# left as fitted above - confirm whether `opt.best_estimator_` should be used instead.
opt = BayesSearchCV(rf,
                    search_space,
                    random_state=seed)

opt.fit(X_train_selected, y_train)

print(opt.best_score_)

0.8305065595380077


In [None]:
# `pred` was never assigned anywhere in the notebook; predict the test set with the
# fitted forest, using the same selected features it was trained on.
pred = rf.predict(X_test_selected)
submission_template['Survived'] = pred
# Accuracy on the training data itself (not a held-out estimate).
rf.score(X_train_selected, y_train)

0.8338945005611672

In [None]:
# index=False: write only the template's own columns, without the frame's row index.
submission_template.to_csv('submission.csv', index=False)

In [None]:
# Upload submission.csv to the Titanic competition with the Kaggle CLI (-m sets the submission message).
!kaggle competitions submit -c titanic -f submission.csv -m "First attempt"

# We got a score of ~0.75

100% 4.30k/4.30k [00:02<00:00, 1.68kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster