## Titanic
This notebook proposes a solution for predicting passenger survival on the Titanic voyage.
It was executed in a Kaggle kernel; the data can be downloaded from Kaggle.

In [217]:


import numpy as np 
import pandas as pd 
import os
print(os.listdir("../input"))


In [218]:
# Labelled data: contains the 'Survived' column used as the target further down.
data= pd.read_csv('../input/train.csv')
# Second file shipped with the dataset; it is read again later to build the submission.
data_val= pd.read_csv('../input/test.csv')

In [219]:
data_val.info()

In [220]:
data.head()

In [221]:
data.describe()

In [222]:
data['Survived'].value_counts()

In [223]:
import matplotlib.pyplot as plt
data.hist(bins=50, figsize=(10,10))
plt.show()

In [224]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

# Hold out 30% of the labelled rows, stratified on 'Survived' so both subsets
# keep the same class balance. random_state is pinned so that a fresh
# "Restart & Run All" reproduces the same split (and therefore the same scores).
split= StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
for train_index, test_index in split.split(data, data['Survived']):
    # split() yields positional indices, so select by position rather than by label.
    train_data=data.iloc[train_index]
    test_data=data.iloc[test_index]

In [225]:
train_data['Survived'].value_counts()

In [226]:
# Correlate numeric columns only: the frame still holds string columns
# (Name, Sex, Ticket, ...) and recent pandas raises on them instead of
# silently dropping them.
corr_matrix= train_data.select_dtypes(include='number').corr()
# Rank every numeric feature by its linear correlation with the target.
corr_matrix['Survived'].sort_values(ascending=False)

In [227]:
train_data.info()

In [228]:
# Separate features from the target for the training split.
X_train=train_data.drop('Survived', axis=1)
y_train=train_data['Survived'].copy()

In [229]:
train_data.head()
#X_train.describe()

In [230]:
# Same feature/target separation for the held-out split.
X_test=test_data.drop('Survived', axis=1)
y_test=test_data['Survived'].copy()

In [231]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that selects columns from a DataFrame and returns their values.

    Parameters
    ----------
    attribut : column label or list of column labels
        Used as the key in ``X[attribut]`` inside :meth:`transform`.
    """
    def __init__(self, attribut):
        self.attribut=attribut
    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self
    def transform(self, X):
        """Return ``X[self.attribut].values``."""
        return X[self.attribut].values

In [232]:
# Derive a 'Title' feature from Name: take the second ', '-separated piece,
# then keep whatever precedes its first '.'.
# NOTE(review): presumably names look like "Surname, Title. Given names" - confirm against the data.
X_train['Title']=X_train['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]

In [233]:
# Boolean Series indexed by title: True for titles seen fewer than 10 times
# in the training split.
title_names = X_train['Title'].value_counts().lt(10)
# Fold every rare title into a single 'Misc' bucket.
X_train['Title'] = [
    'Misc' if title_names.loc[title] else title
    for title in X_train['Title']
]
print(X_train['Title'].value_counts())

In [234]:
X_train.info()

In [235]:
from sklearn.preprocessing import Imputer, LabelEncoder, LabelBinarizer, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion

# Column groups used by the preprocessing below.
num_attribute=['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']   # numeric, median-imputed later
cat_attribute=['Sex', 'Embarked', 'Title']                  # categorical
oth_attribute=['PassengerId','Name', 'Ticket', 'Cabin']     # dropped before modelling
# Fill missing ports with the most frequent one. Assign the result back
# instead of calling fillna(inplace=True) on a column selection: that chained
# in-place form is deprecated in pandas and has no effect under copy-on-write.
X_train['Embarked']=X_train['Embarked'].fillna(X_train['Embarked'].mode()[0])
# Not the last expression of the cell, so this result is computed but not displayed.
X_train.isnull().sum()
X_train.head()


In [236]:
X_train["Title"].value_counts()

In [237]:
class MyLabelBinarizer(TransformerMixin):
    """Wrapper around ``LabelBinarizer`` whose ``fit``/``transform`` accept an extra ``y`` argument.

    All constructor arguments are forwarded to ``LabelBinarizer``.
    Not referenced anywhere else in the visible part of this notebook.
    """
    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)
    def fit(self, x, y=0):
        """Fit the wrapped encoder on ``x`` (``y`` is ignored) and return ``self``."""
        self.encoder.fit(x)
        return self
    def transform(self, x, y=0):
        """Return the wrapped encoder's transform of ``x`` (``y`` is ignored)."""
        return self.encoder.transform(x)

class CustomLabelBinarizer(BaseEstimator, TransformerMixin):
    """``LabelBinarizer`` wrapper exposing the ``(X, y=None)`` transformer signature.

    The encoder is learned in :meth:`fit` and reused in :meth:`transform`, so
    data transformed later is encoded with the classes seen during fitting
    instead of being re-fitted on every call.

    Parameters
    ----------
    sparse_output : bool, default False
        Forwarded to ``LabelBinarizer``.
    """
    def __init__(self, sparse_output=False):
        self.sparse_output = sparse_output
    def fit(self, X, y=None):
        """Learn the label classes from ``X`` (``y`` is ignored) and return ``self``."""
        self.encoder_ = LabelBinarizer(sparse_output=self.sparse_output)
        self.encoder_.fit(X)
        return self
    def transform(self, X, y=None):
        """Binarize ``X`` with the encoder learned in :meth:`fit`.

        If :meth:`fit` has not been called, fall back to fitting a throwaway
        encoder on ``X`` itself, so existing callers that only ever call
        ``transform`` keep working.
        """
        encoder = getattr(self, 'encoder_', None)
        if encoder is None:
            return LabelBinarizer(sparse_output=self.sparse_output).fit_transform(X)
        return encoder.transform(X)

# Median imputer for the numeric columns. It is fitted on the training split
# here and reused (transform only) on the hold-out and submission frames later.
# NOTE(review): `Imputer` no longer exists in recent scikit-learn releases
# (SimpleImputer is the replacement) - confirm the target environment.
imputer=Imputer(strategy="median")
# The next three objects are created but not used anywhere in the visible notebook.
scaler=StandardScaler()
label= LabelEncoder()
hotEncode= OneHotEncoder()
# Actual preprocessing applied to the training features:
# impute numerics, drop the unused columns, one-hot encode what remains.
X_train[num_attribute]=imputer.fit_transform(X_train[num_attribute])
X_train=X_train.drop(columns=oth_attribute)
X_train=pd.get_dummies(X_train)
# Pipeline-based alternative to the manual steps above. It is only defined
# here; its sole use (last line of the cell) is commented out.
num_pipeline= Pipeline([
    ('selector', DataFrameSelector(num_attribute)),
    ('imputer', Imputer(strategy="median")),
    ('scaler', StandardScaler())
])

cat_pipeline= Pipeline([
    ('selector', DataFrameSelector(cat_attribute)),
    ('labelencod', CustomLabelBinarizer())
])

full_pipelie=FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline)
    ])

# Disabled: X_train has already been transformed manually above.
#X_train=full_pipelie.fit_transform(X_train)

In [238]:
X_train.head(10)

In [239]:
X_train.shape

In [240]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
# Linear baseline. random_state pins SGD's data shuffling so the
# cross-validation scores below are reproducible across runs.
sgd_clf=SGDClassifier(random_state=0)
sgd_clf.fit(X_train, y_train)
# 3-fold cross-validated accuracy on the training split.
cross_val_score(sgd_clf, X_train, y_train, cv=3,  scoring="accuracy")

In [241]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
y_pred=cross_val_predict(sgd_clf, X_train, y_train, cv=3)
confusion_matrix(y_train, y_pred)

In [242]:
# Report F1, precision, recall and accuracy (in that order) for the
# cross-validated SGD predictions.
for metric in (f1_score, precision_score, recall_score, accuracy_score):
    print(metric(y_train, y_pred))

In [243]:
y_score=cross_val_predict(sgd_clf, X_train, y_train, cv=3, method="decision_function")

In [244]:
from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds= precision_recall_curve(y_train, y_score)
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Draw precision and recall as functions of the decision threshold.

    The last element of ``precisions`` and ``recalls`` is dropped so both
    curves have the same length as ``thresholds``. Drawing happens on the
    current matplotlib axes; the caller is responsible for ``plt.show()``.
    """
    axes = plt.gca()
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_style, legend_name in curves:
        axes.plot(thresholds, values[:-1], line_style, label=legend_name)
    axes.set_xlabel("Threshold")
    axes.legend(loc="upper left")
    axes.set_ylim([0, 1])
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()

In [245]:
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_train, y_score)
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve plus the dashed diagonal on the current axes.

    ``label`` is forwarded to the curve's ``plot`` call. The caller is
    responsible for ``plt.show()``.
    """
    axes = plt.gca()
    axes.plot(fpr, tpr, label=label)
    # Diagonal reference line from (0, 0) to (1, 1).
    axes.plot([0,1],[0,1], 'k--')
    axes.axis([0,1,0,1])
    axes.set_xlabel("False Positive Rate")
    axes.set_ylabel("True Positive Rate")
    axes.set_title("ROC CURVE")
plot_roc_curve(fpr, tpr)
plt.show()

In [246]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_train, y_score)

In [247]:
from sklearn.ensemble import RandomForestClassifier

forest_clf=RandomForestClassifier(random_state=0)
forest_clf.fit(X_train, y_train)
cross_val_score(forest_clf, X_train, y_train, cv=3, scoring="accuracy")

In [248]:
# Out-of-fold class predictions for the random forest (3-fold CV on the training split).
y_forest_pred=cross_val_predict(forest_clf, X_train, y_train, cv=3)
print("confusion_matrix is : ", confusion_matrix(y_train, y_forest_pred))
print("f1 score is : ", f1_score(y_train, y_forest_pred))
print("precision score is : ", precision_score(y_train, y_forest_pred))
print("recall score is : ", recall_score(y_train, y_forest_pred))
print("accuracy score is : ", accuracy_score(y_train, y_forest_pred))
# The forest is scored through predict_proba; column 1 is used as the score
# for the precision/recall and ROC curves below.
y_forest_proba=cross_val_predict(forest_clf, X_train, y_train, cv=3, method="predict_proba")
y_forest_score=y_forest_proba[:,1]
# Note: this overwrites the precisions/recalls/thresholds computed earlier for the SGD model.
precisions, recalls, thresholds= precision_recall_curve(y_train, y_forest_score)
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()
fpr, tpr, thresholds = roc_curve(y_train, y_forest_score)
plot_roc_curve(fpr, tpr)
plt.show()
print("AUC is : ", roc_auc_score(y_train, y_forest_score))

In [249]:
# Accuracy when predicting "survived" only for scores above a custom cut-off.
# NOTE(review): 0.67 is a magic number, presumably read off the curves above - confirm and name it.
y_pred_1=(y_forest_score>0.67)
accuracy_score(y_train, y_pred_1)

In [250]:
from sklearn.kernel_approximation import RBFSampler

# Approximate an RBF kernel feature map, then train a linear SGD classifier on it.
rbfsampler=RBFSampler(gamma=1, random_state=0)
X_feature= rbfsampler.fit_transform(X_train)
# random_state pins SGD's shuffling so the scores are reproducible.
sgd_clf2= SGDClassifier(random_state=0)
sgd_clf2.fit(X_feature, y_train)
# Score the model this cell is about: the original cross-validated forest_clf
# here, so the RBF+SGD classifier was never actually evaluated.
cross_val_score(sgd_clf2, X_feature, y_train, cv=3, scoring="accuracy")

In [251]:
from xgboost import XGBClassifier

xgb= XGBClassifier()
xgb.fit(X_train, y_train)
cross_val_score(xgb, X_train, y_train, cv=3, scoring="accuracy")

In [252]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: the first uses the estimator's default bootstrap setting and
# varies min_samples_split, the second sets bootstrap=False explicitly.
param_grid = [
{'n_estimators': [3, 10, 15, 30, 60, 120], 'max_features': [2, 4, 6, 8], 'min_samples_split':[2, 6, 18, 50] },
{'bootstrap': [False], 'n_estimators': [3, 10, 30, 60, 120], 'max_features': [2, 3, 4, 6, 8, 10, 12]},
]
# Seed the forest (as in the earlier forest cell) so that best_params_ and
# best_score_ are reproducible; otherwise the winner can change between runs.
forest_clf=RandomForestClassifier(random_state=0)
gridSearch=GridSearchCV(forest_clf, param_grid, cv=3, scoring="accuracy")
gridSearch.fit(X_train, y_train)

In [253]:
gridSearch.best_params_

In [254]:
gridSearch.best_score_

In [255]:
best_model=gridSearch.best_estimator_ 

In [256]:
# Same title extraction as for the training split, applied to the hold-out split,
# followed by a look at the resulting title frequencies.
X_test['Title']=X_test['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
X_test["Title"].value_counts()

In [257]:
# Prepare the hold-out split using only what was learned on the training split,
# so the model sees exactly the feature columns it was fitted on.
X_test['Title']=X_test['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
# Titles known to the model are the Title_* dummy columns of the encoded
# training frame. Anything else becomes 'Misc', instead of re-deriving
# "rare" titles from the hold-out data with a different cut-off.
train_titles=[col[len('Title_'):] for col in X_train.columns if col.startswith('Title_')]
X_test['Title']=X_test['Title'].where(X_test['Title'].isin(train_titles), 'Misc')
# Fill missing ports with the training split's most frequent port; assign the
# result back rather than using the deprecated chained fillna(inplace=True).
X_test['Embarked']=X_test['Embarked'].fillna(train_data['Embarked'].mode()[0])
X_test[num_attribute]=imputer.transform(X_test[num_attribute])
X_test=X_test.drop(columns=oth_attribute)
# Align dummy columns (set and order) with the training matrix: categories
# missing here are added as all-zero columns, unseen ones are dropped.
X_test=pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)
X_test.head(10)

In [258]:
# Hold-out accuracy of the tuned random forest (best grid-search estimator).
y_forest_predict=best_model.predict(X_test)
print("accuracy score of random forest is : ", accuracy_score(y_test, y_forest_predict))
# Hold-out accuracy of the XGBoost model fitted earlier.
y_xgboost_predict=xgb.predict(X_test)
print("accuracy score of xgboost is : ", accuracy_score(y_test, y_xgboost_predict))

In [259]:
# Re-read the submission file so this cell is re-runnable from a clean frame.
data_val=pd.read_csv('../input/test.csv')
# Keep the ids before the column is dropped; they are needed for the submission.
PassengerId=data_val["PassengerId"]
data_val['Title']=data_val['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
# Titles known to the model are the Title_* dummy columns of the encoded
# training frame. Anything else becomes 'Misc', instead of re-deriving
# "rare" titles from the submission data itself.
train_titles=[col[len('Title_'):] for col in X_train.columns if col.startswith('Title_')]
data_val['Title']=data_val['Title'].where(data_val['Title'].isin(train_titles), 'Misc')
# Fill missing ports with the training split's most frequent port; assign the
# result back rather than using the deprecated chained fillna(inplace=True).
data_val['Embarked']=data_val['Embarked'].fillna(train_data['Embarked'].mode()[0])
data_val[num_attribute]=imputer.transform(data_val[num_attribute])
data_val=data_val.drop(columns=oth_attribute)
# Align dummy columns (set and order) with the training matrix so predict()
# receives exactly the features the model was fitted on.
data_val=pd.get_dummies(data_val).reindex(columns=X_train.columns, fill_value=0)
y_val=best_model.predict(data_val)

In [260]:
# Two-column submission frame: passenger id and predicted survival label.
my_submission= pd.DataFrame({'PassengerId': PassengerId, 'Survived':y_val})
# Written without the index so the CSV contains only those two columns.
my_submission.to_csv('submission.csv', index=False)

In [261]:
my_submission.head()