# Titanic, pour rouler sur Arnaud

## Init Random Seed

In [None]:
import random

# Fixed seed: every split, model and grid search below reads RANDOM_SEED, so a
# constant value makes "Restart & Run All" reproduce the same numbers.
# Change the value here to try a different draw.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
print(RANDOM_SEED)

## Load Data

In [None]:
import pandas as pd

In [None]:
raw_data=pd.read_csv("Data/train.csv")

In [None]:
raw_data.head()

## Make Train & Test Set

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the labelled rows as the test set; stratifying on the target
# keeps the Survived ratio the same in both parts.
train_set, test_set = train_test_split(
    raw_data,
    stratify=raw_data["Survived"],
    test_size=0.2,
    random_state=RANDOM_SEED,
)

## Make Train and Validation Set

In [None]:
# Carve a validation set (20% of the remaining rows) out of the training data.
# Stratify on the labels of the frame actually being split: the labels of
# `raw_data` have a different length and cannot be used here.
# NOTE: this cell rebinds `train_set`, so re-running it on its own shrinks the
# training set again; re-run the train/test split cell first.
train_set, validator_set = train_test_split(
    train_set,
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=train_set["Survived"],
)

## Preprocessing and Feature Engineering

### Function Definition

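PassengerId and Ticket are dropped. Name and Cabin are not fed to the models directly either: they only contribute the derived title indicator columns and the `Has_Cabin` flag.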

In [None]:
def datapreprocess(data):
    """Turn raw Titanic rows into a numeric feature frame and a target series.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows holding the columns PassengerId, Survived, Name, Sex, Age, SibSp,
        Parch, Cabin and Embarked (plus any other columns, e.g. Pclass, Ticket,
        Fare). The caller's frame is not modified.

    Returns
    -------
    X : pandas.DataFrame
        Float features indexed like ``data``. For the standard Kaggle column
        layout: Pclass, Sex, Age, Fare, Embarked, Master, Miss, Mr, Mrs, Rare,
        FamilySize, IsAlone, Has_Cabin. The five title columns are always
        present, even when a title does not occur in ``data``.
    Y : pandas.Series
        The ``Survived`` column, indexed like ``data``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If Sex or Embarked holds a value outside the mappings used below.
    """
    # Fixed title vocabulary so every split yields identical indicator columns;
    # a title outside this list produces an all-zero row.
    title_levels = ['Master', 'Miss', 'Mr', 'Mrs', 'Rare']
    rare_titles = ['Lady', 'the Countess', 'Capt', 'Col', 'Don', 'Dr',
                   'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

    # Work on a copy and convert columns that hold numbers stored as text.
    data = data.copy()
    for column in data.columns:
        try:
            data[column] = pd.to_numeric(data[column])
        except (ValueError, TypeError):
            pass  # genuinely textual column: keep it unchanged

    # Y and X (PassengerId carries no signal)
    Y = data["Survived"]
    X = data.drop(["Survived", "PassengerId"], axis=1)

    # Work on Title: the word between the comma and the first dot of Name
    titles = X["Name"].str.extract(r',\s*([^\.]*)\s*\.', expand=False)
    titles = titles.replace(rare_titles, 'Rare').replace(title_aliases)
    titles = pd.Series(pd.Categorical(titles, categories=title_levels),
                       index=X.index)
    X = pd.concat([X, pd.get_dummies(titles)], axis=1)

    # Work on Sex
    X['Sex'] = X['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Work on Age: -9 flags a missing age to the algorithm
    X['Age'] = X['Age'].fillna(-9)

    # Work on Embarked: missing ports are treated as 'S'
    X['Embarked'] = X['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

    # Work on family size
    X["FamilySize"] = X["SibSp"] + X["Parch"] + 1
    X['IsAlone'] = (X['FamilySize'] == 1).astype(int)
    X = X.drop(["SibSp", "Parch"], axis=1)

    # Work on cabin: 1 when a cabin is recorded, 0 when it is missing
    X['Has_Cabin'] = X["Cabin"].notna().astype(int)

    # Keep numeric/boolean columns only (drops Name, Ticket, Cabin)
    X = X.select_dtypes(include=['number', 'bool']).astype(float)

    # Work on fare (and any other remaining gap): fill with the median of the
    # same column, computed on the rows passed in.
    X = X.fillna(X.median())

    return X, Y

### Train Set Preprocessing

In [None]:
X_train,Y_train=datapreprocess(train_set)

### Validation Set Preprocessing

In [None]:
X_valid,Y_valid=datapreprocess(validator_set)

### Test Set Preprocessing

In [None]:
X_test, Y_test=datapreprocess(test_set)

## Correlation Print

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Annotated heatmap of the pairwise Pearson correlations between the
# engineered training features.
colormap = plt.cm.viridis
heatmap_options = dict(linewidths=0.1,
                       vmax=1.0,
                       square=True,
                       cmap=colormap,
                       linecolor='white',
                       annot=True)
plt.figure(figsize=(12, 12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(X_train.astype(float).corr(), **heatmap_options)

## Models!

Imports

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

## Logistic Regression

### Logistic Regression Solo

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Baseline L2-regularised logistic regression; every argument not listed is
# left at its library default.
# - random_state is seeded like every other model in the notebook (liblinear
#   shuffles the data).
# - n_jobs is not passed: it has no effect with the liblinear solver.
# - multi_class is not passed: the target is binary and the parameter is
#   deprecated in recent scikit-learn releases.
lg_clf = LogisticRegression(penalty='l2',
                            C=1.0,
                            solver='liblinear',
                            max_iter=100,
                            random_state=RANDOM_SEED)

In [None]:
lg_clf.fit(X_train, Y_train)

In [None]:
y_pred_lg = lg_clf.predict(X_test)

In [None]:
# Accuracy is computed on the hard predictions. ROC AUC needs a continuous
# score, so it receives the predicted probability of class 1 instead of the
# 0/1 labels.
print("Accuracy score : {} \nROC score : {}".format(
    accuracy_score(Y_test, y_pred_lg),
    roc_auc_score(Y_test, lg_clf.predict_proba(X_test)[:, 1])
))

### Logistic Regression Grid Search

In [None]:
# Search space for the logistic regression.
# - The solver is pinned to liblinear because the dual formulation toggled
#   below is only implemented for that solver (with the l2 penalty).
# - C must be numeric; it is the inverse regularisation strength.
params = {'penalty': ['l2'],
          'solver': ['liblinear'],
          'C': [1.0, 10.0, 100.0],
          'dual': [True, False],
          'fit_intercept': [True, False],
          'max_iter': [50, 100, 200, 400]
         }

In [None]:
# Accuracy-driven grid search over `params`.
# liblinear is pinned because the grid toggles `dual`, which only that solver
# supports. The estimator gets no n_jobs: liblinear ignores it, and the search
# itself already runs its candidates in parallel.
lg_gs_cv = GridSearchCV(LogisticRegression(solver='liblinear',
                                           random_state=RANDOM_SEED),
                        params,
                        scoring='accuracy',
                        n_jobs=-1,
                        verbose=1)

In [None]:
lg_gs_cv.fit(X_train, Y_train)

In [None]:
lg_gs_cv.best_estimator_

In [None]:
y_pred_cv_lg = lg_gs_cv.predict(X_test)

In [None]:
# Accuracy on the hard predictions; ROC AUC on the class-1 probabilities of
# the best estimator found by the search.
print("Accuracy score : {} \nROC score : {}".format(
    accuracy_score(Y_test, y_pred_cv_lg),
    roc_auc_score(Y_test, lg_gs_cv.predict_proba(X_test)[:, 1])
))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

### Random Forest Solo

In [None]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=RANDOM_SEED)

In [None]:
rnd_clf.fit(X_train, Y_train)

In [None]:
y_pred_rf = rnd_clf.predict(X_test)

In [None]:
# Accuracy on the hard predictions; ROC AUC on the class-1 probabilities
# rather than on the 0/1 labels.
print("Accuracy score : {} \nROC score : {}".format(
    accuracy_score(Y_test, y_pred_rf),
    roc_auc_score(Y_test, rnd_clf.predict_proba(X_test)[:, 1])
))

### Random Forest Grid Search

In [None]:
# Search space for the random forest: 5 x 3 x 5 x 2 = 150 candidates.
params = {
    'max_leaf_nodes': [4, 8, 12, 16, 18],
    'n_estimators': [100, 250, 500],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 3],
}

In [None]:
# Accuracy-driven grid search over `params` for a seeded random forest,
# using every available core.
rf_gs_cv = GridSearchCV(
    RandomForestClassifier(random_state=RANDOM_SEED),
    params,
    verbose=1,
    n_jobs=-1,
    scoring='accuracy',
)

In [None]:
rf_gs_cv.fit(X_train, Y_train)

In [None]:
rf_gs_cv.best_estimator_

In [None]:
y_pred_cv_rf = rf_gs_cv.predict(X_test)

In [None]:
# Accuracy on the hard predictions; ROC AUC on the class-1 probabilities of
# the best estimator found by the search.
print("Accuracy score : {} \nROC score : {}".format(
    accuracy_score(Y_test, y_pred_cv_rf),
    roc_auc_score(Y_test, rf_gs_cv.predict_proba(X_test)[:, 1])
))

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

### Gradient Boosting solo

In [None]:
# Hyper-parameters for the stand-alone gradient boosting model.
# NOTE(review): the constructor cell that follows passes these same values as
# literals instead of reading this dict, and `params` is reassigned by the
# gradient boosting grid-search cell further down - confirm whether this dict
# is still needed.
params = {'max_depth': 2, 
          'n_estimators': 3,
          'learning_rate': 1.0,
          'min_samples_leaf':1,
          'min_samples_split':2,
            'random_state':RANDOM_SEED
         }

In [None]:
# Deliberately tiny boosted ensemble: three depth-2 trees, unshrunk
# (learning_rate=1.0), seeded for repeatability.
gb_clf = GradientBoostingClassifier(
    n_estimators=3,
    max_depth=2,
    learning_rate=1.0,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=RANDOM_SEED,
)

In [None]:
gb_clf.fit(X_train, Y_train)

In [None]:
y_pred_gb = gb_clf.predict(X_test)

In [None]:
# Accuracy on the hard predictions; ROC AUC on the class-1 probabilities
# rather than on the 0/1 labels.
print("Accuracy score : {} \nROC score : {}".format(
    accuracy_score(Y_test, y_pred_gb),
    roc_auc_score(Y_test, gb_clf.predict_proba(X_test)[:, 1])
))

### Gradient Boosting Grid Search

In [None]:
# Search space for gradient boosting: 5 x 7 x 8 x 2 x 2 = 1120 candidates.
params = {
    'max_depth': [1, 2, 3, 4, 5],
    'n_estimators': [1, 2, 3, 4, 5, 6, 7],
    'learning_rate': [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0],
    'min_samples_leaf': [1, 2],
    'min_samples_split': [2, 3],
}

In [None]:
# Accuracy-driven grid search over `params` for a seeded gradient boosting
# classifier, using every available core.
gb_gs_cv = GridSearchCV(
    GradientBoostingClassifier(random_state=RANDOM_SEED),
    params,
    verbose=1,
    n_jobs=-1,
    scoring='accuracy',
)

In [None]:
gb_gs_cv.fit(X_train, Y_train)

In [None]:
gb_gs_cv.best_estimator_

In [None]:
y_pred_cv_gb = gb_gs_cv.predict(X_test)

In [None]:
# Accuracy on the hard predictions; ROC AUC on the class-1 probabilities of
# the best estimator found by the search.
print("Accuracy score : {} \nROC score : {}".format(
    accuracy_score(Y_test, y_pred_cv_gb),
    roc_auc_score(Y_test, gb_gs_cv.predict_proba(X_test)[:, 1])
))

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

### AdaBoost solo

In [None]:
# AdaBoost with 200 rounds and a 0.5 learning rate.
# `algorithm` is left at the library default: "SAMME.R" was already the
# default in the scikit-learn releases that accepted it, and recent releases
# reject that value.
ada_clf = AdaBoostClassifier(n_estimators=200,
                             learning_rate=0.5,
                             random_state=RANDOM_SEED)

In [None]:
ada_clf.fit(X_train, Y_train)

In [None]:
y_pred_ada = ada_clf.predict(X_test)

In [None]:
# Accuracy on the hard predictions; ROC AUC on the class-1 probabilities
# rather than on the 0/1 labels.
print("Accuracy score : {} \nROC score : {}".format(
    accuracy_score(Y_test, y_pred_ada),
    roc_auc_score(Y_test, ada_clf.predict_proba(X_test)[:, 1])
))

### AdaBoost Grid Search

In [None]:
# Search space for AdaBoost: 6 x 5 = 30 candidates.
params = {
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'learning_rate': [0.5, 1.0, 1.5, 2.0, 2.5],
}

In [None]:
# Accuracy-driven grid search over `params` for a seeded AdaBoost classifier.
# `algorithm` is left at the library default: "SAMME.R" was already the
# default in the scikit-learn releases that accepted it, and recent releases
# reject that value.
ada_gs_cv = GridSearchCV(AdaBoostClassifier(random_state=RANDOM_SEED),
                         params,
                         scoring='accuracy',
                         n_jobs=-1,
                         verbose=1)

In [None]:
ada_gs_cv.fit(X_train, Y_train)

In [None]:
ada_gs_cv.best_estimator_

In [None]:
y_pred_cv_ada = ada_gs_cv.predict(X_test)

In [None]:
# Accuracy on the hard predictions; ROC AUC on the class-1 probabilities of
# the best estimator found by the search.
print("Accuracy score : {} \nROC score : {}".format(
    accuracy_score(Y_test, y_pred_cv_ada),
    roc_auc_score(Y_test, ada_gs_cv.predict_proba(X_test)[:, 1])
))

## Voting

In [None]:
from sklearn.ensemble import VotingClassifier

### Voting Solo

In [None]:
# Equal-weight soft-voting ensemble of the four tuned models.
# Each label names the model it is attached to ('gb' = gradient boosting,
# 'ada' = AdaBoost). The estimators stay in the order lg, rf, gb, ada, so each
# position in `weights` keeps addressing the same model.
voting_clf = VotingClassifier(
    estimators=[('lg', lg_gs_cv.best_estimator_),
                ('rf', rf_gs_cv.best_estimator_),
                ('gb', gb_gs_cv.best_estimator_),
                ('ada', ada_gs_cv.best_estimator_)
               ],
    weights=[1, 1, 1, 1],
    voting='soft')

In [None]:
voting_clf.fit(X_train, Y_train)

In [None]:
y_pred_vot = voting_clf.predict(X_test)

In [None]:
# Accuracy on the hard predictions; ROC AUC on the class-1 probabilities
# (available because this ensemble uses soft voting).
print("Accuracy score : {} \nROC score : {}".format(
    accuracy_score(Y_test, y_pred_vot),
    roc_auc_score(Y_test, voting_clf.predict_proba(X_test)[:, 1])
))

### Voting Grid Search

In [None]:
# Bounds defined here but not read by anything in this cell.
min_vot = -2
max_vot = 5

# Candidate weightings (four weights each, one per ensemble member), each
# tried with hard and soft voting: 9 x 2 = 18 candidates.
vote_weights = [
    [1, 1, 1, 1],
    [2, 1, 1, 1],
    [1, 2, 1, 1],
    [1, 1, 2, 1],
    [1, 1, 1, 2],
    [2, 2, 1, 1],
    [2, 2, 2, 1],
    [1, 2, 2, 1],
    [1, 1, 2, 2],
]
params = {'weights': vote_weights, 'voting': ['hard', 'soft']}

In [None]:
# Accuracy-driven grid search over the voting mode and the per-model weights.
# Each label names the model it is attached to ('gb' = gradient boosting,
# 'ada' = AdaBoost). The estimators stay in the order lg, rf, gb, ada, so each
# position of the candidate weight vectors keeps addressing the same model.
vot_gs_cv = GridSearchCV(
    VotingClassifier(
        estimators=[('lg', lg_gs_cv.best_estimator_),
                    ('rf', rf_gs_cv.best_estimator_),
                    ('gb', gb_gs_cv.best_estimator_),
                    ('ada', ada_gs_cv.best_estimator_)
                   ]),
    params,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1)

In [None]:
vot_gs_cv.fit(X_train, Y_train)

In [None]:
vot_gs_cv.best_estimator_

In [None]:
y_pred_cv_vot = vot_gs_cv.predict(X_test)

In [None]:
# ROC AUC needs a continuous score. The winning ensemble only exposes
# probabilities when it uses soft voting; with hard voting the 0/1 predictions
# are the only scores available, so they are used as a fallback.
if hasattr(vot_gs_cv, "predict_proba"):
    roc_scores_vot = vot_gs_cv.predict_proba(X_test)[:, 1]
else:
    roc_scores_vot = y_pred_cv_vot
print("Accuracy score : {} \nROC score : {}".format(
    accuracy_score(Y_test, y_pred_cv_vot),
    roc_auc_score(Y_test, roc_scores_vot)
))

## Summary

In [None]:
# Refit each tuned model on the training data and print its test accuracy,
# one "<ClassName> : <accuracy>" line per model.
tuned_models = (
    lg_gs_cv.best_estimator_,
    rf_gs_cv.best_estimator_,
    ada_gs_cv.best_estimator_,
    gb_gs_cv.best_estimator_,
    vot_gs_cv.best_estimator_,
)
for clf in tuned_models:
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)
    print("{} : {}".format(clf.__class__.__name__, accuracy_score(Y_test, y_pred)))

## Stacking

In [None]:
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

In [None]:
# One column of predictions per tuned model, side by side.
# NOTE(review): despite the "_train" suffix, the y_pred_cv_* arrays appear to
# come from predict(X_test) - confirm the intended name.
base_predictions_train = pd.DataFrame({
    'LogisticRegression': y_pred_cv_lg,
    'RandomForest': y_pred_cv_rf,
    'GradientBoost': y_pred_cv_gb,
    'AdaBoost': y_pred_cv_ada,
    'Voting': y_pred_cv_vot,
})
base_predictions_train.head()

In [None]:
# Interactive heatmap of how strongly the models' predictions correlate with
# one another.
model_names = base_predictions_train.columns.values
data = [
    go.Heatmap(
        z=base_predictions_train.astype(float).corr().values,
        x=model_names,
        y=model_names,
        colorscale='Portland',
        showscale=True,
        reversescale=True,
    )
]
py.iplot(data, filename='labelled-heatmap')


In [None]:
Y_valid.head()

## Submission File

In [None]:
raw_data_submission=pd.read_csv("Data/test.csv")

In [None]:
raw_data_submission.head()

In [None]:
def datapreprocess(data):
    """Turn raw, unlabelled Titanic rows into a numeric feature frame.

    NOTE: this shares its name with the training-time helper defined earlier
    in the notebook, which returns ``(X, Y)``. Once this cell has run, the
    earlier preprocessing cells will not work until that helper is re-defined.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows holding the columns PassengerId, Name, Sex, Age, SibSp, Parch,
        Cabin and Embarked (plus any other columns, e.g. Pclass, Ticket, Fare).
        The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Float features indexed like ``data``. For the standard Kaggle column
        layout: Pclass, Sex, Age, Fare, Embarked, Master, Miss, Mr, Mrs, Rare,
        FamilySize, IsAlone, Has_Cabin. The five title columns are always
        present, even when a title does not occur in ``data``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If Sex or Embarked holds a value outside the mappings used below.
    """
    # Fixed title vocabulary so the submission features carry the same
    # indicator columns whichever titles occur; a title outside this list
    # produces an all-zero row.
    title_levels = ['Master', 'Miss', 'Mr', 'Mrs', 'Rare']
    rare_titles = ['Lady', 'the Countess', 'Capt', 'Col', 'Don', 'Dr',
                   'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

    # Work on a copy and convert columns that hold numbers stored as text.
    X = data.copy()
    for column in X.columns:
        try:
            X[column] = pd.to_numeric(X[column])
        except (ValueError, TypeError):
            pass  # genuinely textual column: keep it unchanged

    # Drop Passenger ID
    X = X.drop("PassengerId", axis=1)

    # Work on Title: the word between the comma and the first dot of Name
    titles = X["Name"].str.extract(r',\s*([^\.]*)\s*\.', expand=False)
    titles = titles.replace(rare_titles, 'Rare').replace(title_aliases)
    titles = pd.Series(pd.Categorical(titles, categories=title_levels),
                       index=X.index)
    X = pd.concat([X, pd.get_dummies(titles)], axis=1)

    # Work on Sex
    X['Sex'] = X['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Work on Age: -9 flags a missing age to the algorithm
    X['Age'] = X['Age'].fillna(-9)

    # Work on Embarked: missing ports are treated as 'S'
    X['Embarked'] = X['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

    # Work on family size
    X["FamilySize"] = X["SibSp"] + X["Parch"] + 1
    X['IsAlone'] = (X['FamilySize'] == 1).astype(int)
    X = X.drop(["SibSp", "Parch"], axis=1)

    # Work on cabin: 1 when a cabin is recorded, 0 when it is missing
    X['Has_Cabin'] = X["Cabin"].notna().astype(int)

    # Keep numeric/boolean columns only (drops Name, Ticket, Cabin)
    X = X.select_dtypes(include=['number', 'bool']).astype(float)

    # Work on fare (and any other remaining gap): fill with the median of the
    # same column, computed on the rows passed in.
    X = X.fillna(X.median())

    return X

In [None]:
clean_data_submission=datapreprocess(raw_data_submission)

In [None]:
clean_data_submission.head()

In [None]:
# Feed the model exactly the columns it was trained on, in the same order.
# A training column absent from the submission features becomes an all-zero
# column, and any column the model never saw is dropped.
y_sub = rf_gs_cv.predict(
    clean_data_submission.reindex(columns=X_train.columns, fill_value=0)
)

In [None]:
# Two-column submission table: the passenger ids from the raw submission file
# alongside the predictions in `y_sub`.
submission = pd.DataFrame({
    'PassengerId': raw_data_submission["PassengerId"],
    'Survived': y_sub,
})

In [None]:
submission.to_csv("RandomForestSubmission.csv", index=False)