# Titanic Kaggle dataset

### Import libs

In [None]:
from sklearn import metrics, cross_validation, grid_search, linear_model

import warnings

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

# Suppress every warning for the rest of the session; note that this also
# hides deprecation notices emitted by the imported libraries.
warnings.filterwarnings('ignore')

In [None]:
# IPython magic: render matplotlib figures inline in the notebook. It also
# star-imports numpy and matplotlib's pylab names into the interactive namespace.
%pylab inline

## Import data

In [None]:
# Load the training set; the first CSV row carries the column names.
train_csv_path = "../input/train.csv"
data = pd.read_csv(train_csv_path, sep=',', header=0)

## Inspect data

In [None]:
# Column names, dtypes and non-null counts.
data.info()

In [None]:
# First five rows.
data.head()

In [None]:
# Number of missing values per column.
data.isnull().sum()

In [None]:
# (rows, columns) of the training frame.
data.shape

In [None]:
# Summary statistics of the numeric columns.
data.describe()

## Visualization

In [None]:
sns.set(font_scale=1)
# `display.mpl_style` only exists in old pandas releases. Where it has been
# removed the assignment raises OptionError (a subclass of both AttributeError
# and KeyError), so the style is applied on a best-effort basis instead of
# aborting the whole cell.
try:
    pd.options.display.mpl_style = 'default'
except (AttributeError, KeyError):
    pass
# Histograms of the remaining numeric columns; the identifier, the target and
# the passenger class are excluded.
data.drop(['PassengerId', 'Survived', 'Pclass'], axis=1).hist(figsize=(10, 7), grid=False)
plt.show()

In [None]:
# Side-by-side bar charts of the value counts of the two categorical columns.
# The figure is sized once here rather than via `figsize` on every plot call,
# and each plot is drawn on an explicitly named axes.
plt.figure(figsize=(10, 10))

class_ax = plt.subplot(221)
data.Pclass.value_counts().plot(kind='bar', ax=class_ax)
class_ax.set_xlabel("Passenger class")
class_ax.set_ylabel("Count")
class_ax.set_title("Passenger class distribution")

embarked_ax = plt.subplot(222)
data.Embarked.value_counts().plot(kind='bar', ax=embarked_ax)
embarked_ax.set_xlabel("Embarked")
embarked_ax.set_ylabel("Count")
embarked_ax.set_title("Embarked distribution")
plt.show()

In [None]:
# Survival rate per passenger class and per port of embarkation.
# A single sized figure is created; the previous figure(1) + subplots(1, 1)
# pair left an empty extra figure and a throwaway full-size axes behind.
plt.figure(figsize=(10, 10))

plt.subplot(221)
sns.barplot(y='Survived', x='Pclass', data=data)
plt.title("Survived by passenger class")

plt.subplot(222)
sns.barplot(y='Survived', x='Embarked', data=data)
plt.title("Survived by Embarked")
plt.show()

In [None]:
# Bar plot of the Survived column grouped by sex; labels are set on the axes
# object that seaborn returns.
sex_ax = sns.barplot(x="Sex", y='Survived', data=data)
sex_ax.set_title("Male/female survived distribution")
sex_ax.set_ylabel("Survived")
plt.show()

In [None]:
# Age histograms split by outcome (left) and the overall age density (right).
# One sized figure is created up front; the previous figure(1) + subplots(1, 1)
# pair produced an empty stray figure.
plt.figure(figsize=(10, 10))

plt.subplot(221)
data[data.Survived == 1].Age.plot(kind='hist', alpha=0.5)
data[data.Survived == 0].Age.plot(kind='hist', alpha=0.5)
plt.title("Age distribution")
plt.xlabel("Age")
# Legend entries follow the plotting order above: survivors first.
plt.legend(("survived", "not survived"), loc='best')

plt.subplot(222)
data.Age.plot(kind='kde', grid=False)
plt.title("Age distribution")
plt.xlabel("Age")
plt.xlim((0, 80))
plt.show()

In [None]:
# Pairwise correlations of the numeric columns only. Text columns cannot be
# correlated: old pandas dropped them silently, newer pandas raises, so they
# are excluded explicitly.
corr = data.select_dtypes(include=[np.number]).corr()

plt.figure(figsize=(10, 8))

sns.heatmap(corr, square=True)
plt.title("Feature correlations")
plt.show()

# Preprocessing

In [None]:
t_data = data.drop(['Cabin', 'Ticket', 'PassengerId', 'Survived'], axis=1)
t_labels = data['Survived']

In [None]:
t_data.head()

## Name inspection/processing

In [None]:
t_data['Name_pred'] = data.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

In [None]:
pd.crosstab(t_data['Name_pred'], t_data['Sex'])

In [None]:
t_data['Name_pred'] = t_data['Name_pred'].replace("Mlle", "Miss")
t_data['Name_pred'] = t_data['Name_pred'].replace("Ms", "Miss")
t_data['Name_pred'] = t_data['Name_pred'].replace("Mme", "Mrs")

In [None]:
t_data['Name_pred'] = t_data['Name_pred'].replace(['Capt', 'Col', 'Countess', 'Don', 'Dr', 'Jonkheer',\
                                                  'Lady', 'Major', 'Rev', 'Sir'], 'Other')

In [None]:
preds = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Other': 5}

t_data['Name_pred'] = t_data['Name_pred'].map(preds)

In [None]:
t_data = t_data.drop('Name', axis=1)

In [None]:
t_data.head()

## Some categorical transformations
(Not really necessary)

In [None]:
t_data['Sex'] = t_data['Sex'].apply(lambda x: int(x == 'male'))

In [None]:
t_data.Embarked = t_data.Embarked.fillna(value='S')

In [None]:
emb = { 'S': 1, 'C': 2, 'Q': 3}

In [None]:
t_data.Embarked = t_data.Embarked.map(emb)

In [None]:
# zeros as first try
t_data.Age = t_data.Age.fillna(value=0)

In [None]:
t_data.head()

## Dividing by feature type

In [None]:
# Real-valued features; these are the columns passed to the scaler.
real_cols = ['Age', 'SibSp', 'Parch', 'Fare']
# Every remaining column is treated as categorical.
# NOTE(review): the list comes from a set difference, so its column order is
# arbitrary; anything that must match the one-hot layout should index with
# this same list.
cat_cols = list(set(t_data.columns.values.tolist()) - set(real_cols))

In [None]:
# Split the feature frame column-wise by feature type.
X_real = t_data[real_cols]
X_cat = t_data[cat_cols]

## Categorical features encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer

In [None]:
encoder = OneHotEncoder(categorical_features='all', sparse=True, n_values='auto')

In [None]:
X_cat.head()

In [None]:
X_cat_oh = encoder.fit_transform(X_cat).toarray()

## Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit the scaler on the real-valued columns, then standardise them with it.
scaler = StandardScaler()
scaler.fit(X_real)

X_real_scaled = scaler.transform(X_real)

## Stacking

In [None]:
X = np.hstack((X_real_scaled, X_cat_oh))

In [None]:
(X_train, X_test, y_train, y_test) = cross_validation.train_test_split(X, t_labels,
                                                                      test_size=0.3,
                                                                      stratify=t_labels)

## First fit: SGDClassifier

In [None]:
clf = linear_model.SGDClassifier(class_weight='balanced')

In [None]:
clf.fit(X_train, y_train)

In [None]:
print(metrics.roc_auc_score(y_test, clf.predict(X_test)))

In [None]:
param_grid = {
    'loss': ['hinge', 'log', 'squared_hinge', 'squared_loss'],
    'penalty': ['l1', 'l2'],
    'n_iter': list(range(3, 10)),
    'alpha': np.linspace(0.0001, 0.01, num=10)
}

In [None]:
grid_cv = grid_search.GridSearchCV(clf, param_grid, scoring='accuracy', cv=3)

In [None]:
grid_cv.fit(X_train, y_train)

In [None]:
print(grid_cv.best_params_)

In [None]:
print(metrics.roc_auc_score(y_test, grid_cv.best_estimator_.predict(X_test)))

## Decision tree

In [None]:
from sklearn import tree

In [None]:
clf = tree.DecisionTreeClassifier(max_depth=3, class_weight='balanced')

In [None]:
clf.get_params().keys()

In [None]:
params_grid = {
    'max_depth': list(range(1, 10)),
    'min_samples_leaf': list(range(2, 10))
}
grid_cv = grid_search.GridSearchCV(clf, params_grid, scoring='accuracy', cv=4)

In [None]:
grid_cv.fit(X_train, y_train)

In [None]:
print(grid_cv.best_params_)

In [None]:
print(metrics.roc_auc_score(y_test, grid_cv.best_estimator_.predict_proba(X_test)[:,1]))

## RandomForest

In [None]:
from sklearn import ensemble

In [None]:
rf_clf = ensemble.RandomForestClassifier()

In [None]:
rf_clf.get_params().keys()

In [None]:
rf_clf.fit(X_train, y_train)

In [None]:
print(metrics.roc_auc_score(y_test, rf_clf.predict_proba(X_test)[:,1]))

In [None]:
params_grid = {
    'min_samples_leaf': list(range(1, 10)),
    'n_estimators': [10, 50, 100, 250, 500, 1000],
    'max_depth': list(range(1, 10))
}

rand_cv = grid_search.RandomizedSearchCV(rf_clf, params_grid, scoring='accuracy', cv=4, n_iter=40)

rand_cv.fit(X_train, y_train)

In [None]:
print(metrics.roc_auc_score(y_test, rand_cv.predict_proba(X_test)[:,1]))

## First test-set prediction

In [None]:
# Load the test CSV; its first row carries the column names.
test = pd.read_csv("../input/test.csv", header=0, sep=',')

In [None]:
# First five rows.
test.head()

In [None]:
# Number of missing values per column.
test.isnull().sum()

In [None]:
test_data = test.drop(['Cabin', 'Ticket', 'PassengerId'], axis=1)

In [None]:
test_data['Name_pred'] = test.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

In [None]:
test_data['Name_pred'] = test_data['Name_pred'].replace("Mlle", "Miss")
test_data['Name_pred'] = test_data['Name_pred'].replace("Ms", "Miss")
test_data['Name_pred'] = test_data['Name_pred'].replace("Mme", "Mrs")

In [None]:
test_data['Name_pred'] = test_data['Name_pred'].replace(['Capt', 'Col', 'Countess', 'Don', 'Dr', 'Jonkheer',\
                                              'Lady', 'Major', 'Rev', 'Sir'], 'Other')

In [None]:
preds = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Other': 5}
test_data['Name_pred'] = test_data['Name_pred'].map(preds)
test_data = test_data.drop('Name', axis=1)

In [None]:
test_data.Name_pred = test_data.Name_pred.fillna(value=5)

In [None]:
test_data.Name_pred = test_data.Name_pred.apply(int)

In [None]:
test_data['Sex'] = test_data['Sex'].apply(lambda x: int(x == 'male'))

In [None]:
test_data.Embarked = test_data.Embarked.fillna(value='S')
emb = { 'S': 1, 'C': 2, 'Q': 3}
test_data.Embarked = test_data.Embarked.map(emb)

In [None]:
test_data.Age = test_data.Age.fillna(value=0)

In [None]:
real_cols = ['Age', 'SibSp', 'Parch', 'Fare']
cat_cols = list(set(test_data.columns.values.tolist()) - set(real_cols))

In [None]:
Test_real = test_data[real_cols]
Test_cat = test_data[cat_cols]

In [None]:
encoder = OneHotEncoder(categorical_features='all', sparse=True, n_values='auto')
Test_cat_oh = encoder.fit_transform(Test_cat).toarray()

In [None]:
Test_real.Fare = Test_real.Fare.fillna(value=0)

In [None]:
scaler = StandardScaler()
X_real_scaled = scaler.fit_transform(Test_real)

In [None]:
X = np.hstack((Test_real, Test_cat_oh))

In [None]:
predict = rand_cv.predict(X)

In [None]:
submission = pd.DataFrame({
        "PassengerId": test.PassengerId,
        "Survived": predict
    })
submission.to_csv("predict.csv", index=False)

In [None]:
rand_cv.score(X_train, y_train)

In [None]:
print(rand_cv.best_estimator_)