## Imports

In [24]:
# pip install --upgrade scikit-learn

In [40]:
import time
from IPython.display import clear_output
import numpy    as np
import pandas   as pd
import seaborn  as sns
import matplotlib.pyplot as plt
import sklearn  as skl

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline      
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

set_config(display='diagram') # Useful for display the pipeline

print("Pandas  ", pd.__version__)
print("Sklearn ", skl.__version__) # Try to use 0.24

Pandas   1.1.3
Sklearn  0.24.2


## Get the dataset
- **CLOUD = True**: Download dataset from Kaggle. Necesary for cloud enviroments like COLAB. **Specify your [kaggle credentials](https://www.kaggle.com/docs/api)**.
- **CLOUD = False**: Get the dataset from your local machine. **Specify the data path**.

In [26]:
DATA_PATH = "../../Datasets/Tabular/titanic/"

## Load data

In [41]:
df      = pd.read_csv("train.csv", index_col='PassengerId')
df_test = pd.read_csv("test.csv",  index_col='PassengerId')

print("Train DataFrame:", df.shape)
print("Test DataFrame: ", df_test.shape)

Train DataFrame: (891, 11)
Test DataFrame:  (418, 10)


## Check missings

In [42]:
df.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [43]:
df_test.isnull().sum()

Pclass        0
Name          0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64

# Exercise 1 (2pts):
Extract the title (Mr, Mrs, ... ) from the "Name" column.

Tips:
- split(',')[1] to get the 2nd part, and remove the surnamename
- split('.')[0] to get the 1str part, and remove the name

In [44]:
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [45]:
# CODE HERE get_Title_from_Name funtion
# Create this function using lambda (not def)

get_Title_from_Name = lambda name: name.split(',')[1].split('.')[0].strip()

df['Title']      = df['Name'].map(get_Title_from_Name)
df_test['Title'] = df_test['Name'].map(get_Title_from_Name)


In [46]:
df['Title'].values[0] 
df['Title'].values[1] 
df['Title'].values[2] 
df_test['Title'].values[0] 
df_test['Title'].values[1] 
df_test['Title'].values[414]

'Dona'

# Exercise 2 (1pts):
Apply the title_dictionary to get a better information about the title. You have to overwrite the Title variable.

In [47]:
title_dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

In [48]:
# Use map to apply the prevous dict

df["Title"] =  df.Title.map(title_dictionary)
df_test["Title"] = df_test.Title.map(title_dictionary)



In [49]:
print(df['Title'].values[886])
print(df_test['Title'].values[417])

Officer
Master


# Exercise OPTINAL (0pts):
Try to extract some information from the feature **Ticket**. Search on Internet if that colum has some kind of information.

In [50]:
df_ticket = df
df_ticket = df_ticket.sort_values(by='Ticket')
#print(df_ticket['Ticket'].head(20))
print(df_ticket['Ticket'].tail(20))

PassengerId
730    STON/O2. 3101271
143    STON/O2. 3101279
404    STON/O2. 3101279
3      STON/O2. 3101282
217    STON/O2. 3101283
817    STON/O2. 3101290
227           SW/PP 751
527         W./C. 14258
243         W./C. 14263
784          W./C. 6607
889          W./C. 6607
148          W./C. 6608
437          W./C. 6608
87           W./C. 6608
737          W./C. 6608
236          W./C. 6609
93          W.E.P. 5734
220           W/C 14208
541           WE/P 5735
746           WE/P 5735
Name: Ticket, dtype: object


# Exercise OPTIONAL (0pts):
Try to extract some information from the feature **Cabin**. Search on Internet if that colum has some kind of information.

PassengerId
892      NaN
893      NaN
894      NaN
895      NaN
896      NaN
        ... 
1305     NaN
1306    C105
1307     NaN
1308     NaN
1309     NaN
Name: Cabin, Length: 418, dtype: object

# Preprocessing
For X data, notice that...
- We drop Survived because is the target variable
- We drop Name because we have extracted the Title: Mr, Mrs, ...
- We drop Ticket because it has no information -> see df.Ticket.nunique()
- We drop Cabin because it has a lot of missings (77% are missings)

Then, we identify **numerical** variables and **categorical** variables,

In [51]:
x = df.drop(columns=["Survived", 'Name', 'Ticket', 'Cabin']) # X DATA (WILL BE TRAIN+VALID DATA)
y = df["Survived"] # 0 = No, 1 = Yes

print(len(x), len(y))

x_test = df_test.drop(columns=['Name', 'Ticket', 'Cabin']) # # X_TEST DATA (NEW DATA)

891 891


In [52]:
cat_vars  = ['Sex', 'Embarked', 'Title']         # x.select_dtypes(include=[object]).columns.values.tolist()
num_vars  = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Age'] # x.select_dtypes(exclude=[object]).columns.values.tolist()

print("\nNumerical features:\n", num_vars)
print("\nCategorical features:\n", cat_vars)


Numerical features:
 ['Pclass', 'SibSp', 'Parch', 'Fare', 'Age']

Categorical features:
 ['Sex', 'Embarked', 'Title']


# Exercise 3 (2pts):
Create a **ColumnTransformer for Tree Models**. You need to create 2 pipelines (one for numerical and other for categories). Remember:
- Categorical pipeline: Some SimpleImputer -> Some Encoder
- Numerical pipeline: Some SimpleImputer -> NO Encoder

In [53]:
"""
num_preprocessing = pipeline.Pipeline(steps=[
  # Some SimpleImputer here
])

cat_preporcessing = pipeline.Pipeline(steps=[
  # Some SimpleImputer here
  # Some Encoder here. Remember to handle_unknown
])

tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_preprocessing, num_vars),
    ('cat', cat_preporcessing, cat_vars),
], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

tree_prepro
""";

### BEGIN SOLUTION
num_preprocessing = Pipeline([('imputer', SimpleImputer())])
cat_preporcessing = Pipeline([('imputer',SimpleImputer(strategy='constant', fill_value='missing')),
  ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value = -1))
])

tree_prepro = ColumnTransformer([('num', num_preprocessing, num_vars),('cat', cat_preporcessing, cat_vars)])
tree_prepro

In [40]:
assert type(tree_prepro)      is compose._column_transformer.ColumnTransformer
assert type(num_4_treeModels) is pipeline.Pipeline
assert type(cat_4_treeModels) is pipeline.Pipeline
assert len(num_4_treeModels) == 1
assert len(cat_4_treeModels) == 2

# Exercise 4 (1pts):
1. Complete the diccionary with some Tree Models.
2. Then we put each model in a Pipeline where:
   - first is the prepocessing with the column Transformer
   - Then is the Tree model
3. Display the fullpipeline of the LGBMClassifier

In [54]:
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting # Necesary for HistGradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier

In [55]:

tree_classifiers = {
  "Decision Tree": DecisionTreeClassifier(),
  "Extra Trees": ExtraTreesClassifier(n_estimators=100),
  "Random Forest": RandomForestClassifier(n_estimators=100),
  "AdaBoost": AdaBoostClassifier(n_estimators=100),
  "Skl GBM": GradientBoostingClassifier(n_estimators=100),
  "Skl HistGBM": HistGradientBoostingClassifier(max_iter=100),
  "XGBoost": XGBClassifier(n_estimators=100),
  "LightGBM": LGBMClassifier(n_estimators=100),
  "CatBoost":  CatBoostClassifier(n_estimators=100),
}
tree_classifiers = {name: make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}
tree_classifiers["LightGBM"]




In [20]:
for pipe in tree_classifiers.values():
    assert type(pipe) is Pipeline

# Exercise 5 (3pts):
Define a simple split validation strategy with:
- 80% for train
- 20% for validation
- With stratification
- random_state=0

And train all the models in a for loop

In [56]:

x_train, x_val, y_train, y_val = train_test_split(x,y, test_size=0.2, random_state=0, stratify=y)

results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

for model_name, model in tree_classifiers.items():
    start_time = time.time()
    
    # FOR EVERY PIPELINE (PREPRO + MODEL) -> TRAIN WITH TRAIN DATA (x_train)
    model.fit(x_train, y_train)
    
    # GET PREDICTIONS USING x_val
    pred = model.predict(x_val)

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                              "Accuracy": accuracy_score(y_val, pred)*100,
                              "Bal Acc.": balanced_accuracy_score(y_val, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)
                              
                              
results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')



Learning rate set to 0.073611
0:	learn: 0.6697764	total: 186ms	remaining: 18.4s
1:	learn: 0.6443246	total: 188ms	remaining: 9.22s
2:	learn: 0.6218304	total: 190ms	remaining: 6.14s
3:	learn: 0.6043138	total: 191ms	remaining: 4.58s
4:	learn: 0.5863634	total: 192ms	remaining: 3.65s
5:	learn: 0.5673675	total: 193ms	remaining: 3.03s
6:	learn: 0.5502252	total: 195ms	remaining: 2.59s
7:	learn: 0.5385529	total: 196ms	remaining: 2.25s
8:	learn: 0.5278454	total: 197ms	remaining: 1.99s
9:	learn: 0.5156959	total: 200ms	remaining: 1.8s
10:	learn: 0.5069669	total: 202ms	remaining: 1.63s
11:	learn: 0.4964747	total: 222ms	remaining: 1.63s
12:	learn: 0.4879102	total: 223ms	remaining: 1.49s
13:	learn: 0.4788633	total: 224ms	remaining: 1.38s
14:	learn: 0.4714145	total: 225ms	remaining: 1.28s
15:	learn: 0.4639455	total: 227ms	remaining: 1.19s
16:	learn: 0.4575363	total: 228ms	remaining: 1.11s
17:	learn: 0.4512419	total: 229ms	remaining: 1.04s
18:	learn: 0.4466799	total: 231ms	remaining: 985ms
19:	learn: 0

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Skl GBM,84.916201,82.59552,0.564237
2,XGBoost,81.564246,78.787879,1.328303
3,Skl HistGBM,81.005587,78.603426,1.192094
4,LightGBM,81.005587,78.333333,0.80264
5,CatBoost,79.888268,76.613966,1.645221
6,Decision Tree,78.77095,77.055336,2.524633
7,Random Forest,78.77095,76.245059,0.387233
8,AdaBoost,78.77095,76.785244,0.335064
9,Extra Trees,75.977654,73.432148,0.681845


In [45]:
assert results_ord["Accuracy"].min() > 75
assert results_ord["Bal Acc."].min() > 75
assert len(results_ord) == 9

# Exercise 6 (3pts):
Define a 10 Fold cross validation strategy with:
- With stratification
- shuffle=True
- random_state=0

And train all the models in a for loop.

Tip you can use **[cross_val_predict](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html)** for both training and predict with 

In [57]:

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

for model_name, model in tree_classifiers.items():
    start_time = time.time()
        
    # TRAIN AND GET PREDICTIONS USING cross_val_predict() and x,y
    pred = cross_val_predict(model, x,y, cv=skf)

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                              "Accuracy": accuracy_score(y, pred)*100,
                              "Bal Acc.": balanced_accuracy_score(y, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)
                              
                              
results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')































Learning rate set to 0.077408
0:	learn: 0.6700140	total: 403us	remaining: 39.9ms
1:	learn: 0.6454774	total: 1.71ms	remaining: 83.7ms
2:	learn: 0.6279862	total: 2.18ms	remaining: 70.5ms
3:	learn: 0.6056591	total: 3.33ms	remaining: 80ms
4:	learn: 0.5893181	total: 4.93ms	remaining: 93.7ms
5:	learn: 0.5711032	total: 5.99ms	remaining: 93.8ms
6:	learn: 0.5558844	total: 7.15ms	remaining: 95ms
7:	learn: 0.5416680	total: 8.28ms	remaining: 95.2ms
8:	learn: 0.5288758	total: 9.43ms	remaining: 95.4ms
9:	learn: 0.5181279	total: 10.8ms	remaining: 96.9ms
10:	learn: 0.5084073	total: 12.1ms	remaining: 98.1ms
11:	learn: 0.4978706	total: 13.4ms	remaining: 98ms
12:	learn: 0.4889472	total: 14.6ms	remaining: 97.8ms
13:	learn: 0.4839765	total: 15.3ms	remaining: 94.2ms
14:	learn: 0.4763502	total: 17.7ms	remaining: 100ms
15:	learn: 0.4693592	total: 19.6ms	remaining: 103ms
16:	learn: 0.4624800	total: 20.9ms	remaining: 102ms
17:	learn: 0.4575181	total: 22.1ms	remaining: 101ms
18:	learn: 0.4530218	total: 23.4ms	re

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Skl GBM,83.277217,81.027706,1.053006
2,CatBoost,82.716049,80.296712,4.83948
3,Skl HistGBM,82.491582,80.831176,8.354669
4,LightGBM,82.491582,80.8863,1.409075
5,AdaBoost,81.930415,80.927044,1.667023
6,XGBoost,81.930415,80.430927,2.336044
7,Random Forest,81.257015,79.829355,1.891082
8,Extra Trees,80.47138,79.191832,1.712929
9,Decision Tree,79.79798,78.369763,0.250333


In [47]:
assert results_ord["Accuracy"].min() > 75
assert results_ord["Bal Acc."].min() > 75
assert len(results_ord) == 9

# Exercise 7.1
Train with all data the best model

In [63]:
best_model = tree_classifiers["Skl GBM"]

best_model.fit(x,y)

best_model


# Exercise 7.2 (2pts)
With your best model, generate the predicitions for test data (x_test)

In [60]:
x_test.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,3,male,34.5,0,0,7.8292,Q,Mr
893,3,female,47.0,1,0,7.0,S,Mrs
894,2,male,62.0,0,0,9.6875,Q,Mr
895,3,male,27.0,0,0,8.6625,S,Mr
896,3,female,22.0,1,1,12.2875,S,Mrs


In [64]:
# test_pred = # Get the predictions for x_test
test_pred = best_model.predict(x_test)
print(len(test_pred))
np.unique(test_pred).tolist()



418


[0, 1]

In [50]:
assert len(test_pred) == 418
assert np.unique(test_pred).tolist() == [0,1]

# Exercise 7.3

Submit to kaggle.

- You can use the kaggle command line app. Check https://github.com/Kaggle/kaggle-api

In [65]:
sub = pd.DataFrame(test_pred, index=x_test.index, columns=["Survived"])
sub.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1


In [66]:
sub.to_csv("sub.csv")

In [72]:
!kaggle competitions submit -c titanic -f sub.csv -m "My submission message"

'kaggle' is not recognized as an internal or external command,
operable program or batch file.
