In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn import cross_validation, linear_model, metrics, pipeline, preprocessing
import math
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV

In [None]:
# Read the training and the test set from the input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

**Data analysis**

In [None]:
# Peek at the first two rows to see which columns are available.
train_data.head(2)

In [None]:
# Check for missing data: info() lists the non-null count and dtype of every column.
train_data.info()

In [None]:
# Overview of the feature distributions on a 3x3 grid of axes.
fig, axes = plt.subplots(3, 3, figsize=(9, 10))

# Top row: counts of the categorical columns.
for column_name, target_ax in zip(('Survived', 'Pclass', 'Sex'), axes[0]):
    sns.countplot(x=column_name, data=train_data, ax=target_ax)

# Middle row: histograms of the numeric columns.
train_data[["Age", "SibSp", "Parch"]].hist(ax=axes[1,:])

# Bottom row: fare histogram and port-of-embarkation counts (the last axis stays empty).
train_data[["Fare"]].hist(ax=axes[2,0])
sns.countplot(x='Embarked', data=train_data, ax=axes[2,1])

In [None]:
# Left chart: survival counts split by sex (the author reads it as women having
# had a higher chance to survive).
# Right chart: survival counts split by cabin class (a higher class also seems
# to go with a higher chance to survive).
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2, figsize=(9, 4))
sns.countplot(data=train_data, x='Survived', hue="Sex", ax=axis1)
sns.countplot(data=train_data, x='Survived', hue="Pclass", ax=axis2)

In [None]:
# Integer codes for the two categorical columns so they can enter the correlation matrix.
sex_mapping = dict(female=0, male=1)
embarked_mapping = dict(S=0, Q=1, C=2)

# Numeric columns first, then the encoded Sex and Embarked columns
# (missing Embarked values are treated as 'S').
select = (
    train_data[["Survived", "Pclass", "Age", "SibSp", "Parch", "Fare"]]
    .assign(Sex=[sex_mapping[value] for value in train_data["Sex"]])
    .assign(Embarked=[embarked_mapping[value]
                      for value in train_data["Embarked"].fillna('S')])
)

In [None]:
# Heatmap of the absolute pairwise correlations. What we see here:
# 1) the already mentioned Sex/Survived and Class/Survived correlations,
# 2) Fare/Class, as expected,
# 3) Age/Class (probably older passengers could afford the more expensive classes).
sns.heatmap(select.corr().abs(), square=True, annot=True, cmap="OrRd")

**Missing data**

In [None]:
# There are only 2 missing points in the Embarked column; fill them with 'S'.
train_data["Embarked"] = train_data["Embarked"].fillna('S')

In [None]:
# Almost 20% of the Age values are missing, so this column needs a bit more care.
train_data["Age"].describe()
# We could fill them with the mean per Sex, but let's try another approach.

In [None]:
sns.boxplot(x="Pclass", y="Age", data=train_data);
# According to this chart we could fill the missing ages with the mean of the
# passenger's cabin class, which would be better than the previous idea.

In [None]:
# But let's try another approach and look at the Name column. We can get the
# passenger's title from it: the first run of letters that follows a space and
# is terminated by a period.
# The pattern is a raw string because "\." inside a normal string literal is an
# invalid escape sequence.
title_pattern = re.compile(r' ([A-Za-z]+)\.')
train_data["Title"] = train_data["Name"].map(lambda x: title_pattern.search(x).group(1))
sns.countplot(y='Title', data=train_data, orient="v")

In [None]:
# Which titles do the passengers with a missing age carry?
age_is_missing = train_data["Age"].isnull()
train_data.loc[age_is_missing, "Title"].value_counts()

In [None]:
# Fill each missing age with the mean age of the passengers sharing the same
# Title and Pclass.
ages_by_title_and_class = train_data.groupby(['Title', 'Pclass'])['Age']
train_data["Age"] = ages_by_title_and_class.transform(lambda ages: ages.fillna(ages.mean()))

In [None]:
# A few rows also have a ticket price of zero; show the first three.
has_zero_fare = train_data["Fare"] == 0
train_data[has_zero_fare].head(3)

In [None]:
# Replace the zero fares with the mean fare of the passenger's class.
# The zeros are left out of the average: they are the placeholders being
# replaced, and counting them would pull the class mean down.
def _impute_zero_fare(fares):
    """Return ``fares`` with zeros replaced by the mean of the non-zero values.

    A group without any non-zero fare is returned unchanged.
    """
    nonzero_mean = fares[fares != 0].mean()
    if pd.isnull(nonzero_mean):
        return fares
    return fares.replace(0.0, nonzero_mean)

train_data["Fare"] = train_data.groupby(['Pclass'])['Fare'].transform(_impute_zero_fare)

In [None]:
# Age distribution for every extracted title, shown in a hand-picked order.
sns.boxplot(x="Title", 
            y="Age", 
            order=["Capt","Col","Major","Sir","Lady","Rev",
                   "Dr","Don","Jonkheer","Countess","Mrs", 
                    "Ms","Mr","Mme","Mlle","Miss","Master"], 
            data=train_data);

In [None]:
#Let's keep only 5 groups: Aristocratic, Mr, Mrs, Miss and Master
def change_title(title, fltr, new_title):
    """Map ``title`` to ``new_title`` when it is listed in ``fltr``.

    Args:
        title: the title to check.
        fltr: container of titles that should be replaced.
        new_title: replacement returned for titles found in ``fltr``.

    Returns:
        ``new_title`` if ``title`` is in ``fltr``, otherwise ``title`` unchanged.
    """
    return new_title if title in fltr else title

# Apply the title merges one rule at a time: (titles to replace, replacement).
title_merge_rules = [
    (["Capt", "Col", "Don", "Dr", "Jonkheer", "Lady", "Major", "Rev", "Sir", "Countess"],
     "Aristocratic"),
    (["Ms"], "Mrs"),
    (["Mlle", "Mme"], "Miss"),
]
for old_titles, merged_title in title_merge_rules:
    train_data["Title"] = train_data["Title"].map(
        lambda x: change_title(x, old_titles, merged_title))

In [None]:
# Looks nice! Age distribution for the five merged title groups.
sns.boxplot(x="Title", 
            y="Age",
            order=["Aristocratic", "Mrs", "Mr", "Miss", "Master"],
            data=train_data);

In [None]:
# Share of survivors per title: the mean of the 0/1 Survived flag in each group.
title_and_outcome = train_data[["Title", "Survived"]]
title_perc = title_and_outcome.groupby(['Title'], as_index=False).mean()
bar_palette = sns.color_palette("hls", 8)
sns.barplot(x='Title', y='Survived', data=title_perc, palette=bar_palette)

In [None]:
# Age against class, coloured by survival: women on the left, men on the right.
# The author's reading: survival was most likely for women of the 1st and 2nd
# class, and for men mostly in the 1st class.
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(9, 6))
women = train_data[train_data["Sex"] == "female"]
men = train_data[train_data["Sex"] == "male"]
sns.stripplot(x="Pclass", y="Age", hue="Survived", data=women, jitter=True, ax=axis1)
sns.stripplot(x="Pclass", y="Age", hue="Survived", data=men, jitter=True, ax=axis2)

In [None]:
# What about relatives on board? Family = siblings/spouses + parents/children.
train_data["Family"] = train_data["SibSp"].add(train_data["Parch"])
# Survival counts per family size; note that the chart is restricted to male passengers.
male_passengers = train_data[train_data["Sex"] == "male"]
sns.countplot(x='Family', hue="Survived", data=male_passengers)

In [None]:
# Binary column Single: 1 when the passenger has no family on board, else 0.
train_data["Single"] = train_data["Family"].eq(0).astype(int)

In [None]:
# CABIN. Only few cabin numbers are available (about 20% according to the author),
# so the numbers themselves are not analysed. Instead the column is turned into
# an availability flag: 1 when a cabin is recorded, 0 when it is missing.
train_data["Cabin"] = train_data["Cabin"].notnull().astype(int)

In [None]:
# Survival counts by cabin availability: all passengers (left), men only (right).
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2, figsize=(9, 6))
men_only = train_data[train_data["Sex"] == "male"]
sns.countplot(data=train_data, x='Cabin', hue="Survived", ax=axis1)
sns.countplot(data=men_only, x='Cabin', hue="Survived", ax=axis2)

# So the flag may be an important feature, especially for men.

In [None]:
# Collect all features in one frame and look at the correlations between them.
# .copy() gives X its own data, so the column assignments below do not trigger
# pandas' SettingWithCopyWarning.
X = train_data[["Survived", "Age", "Family", "Fare", "Single", "Cabin"]].copy()

# Complementary family-size flags: more than 3 relatives vs. 3 or fewer.
X["BigFamily"] = (train_data["Family"] > 3).astype(int)
X["SmallFamily"] = (train_data["Family"] < 4).astype(int)

#Sex as Male
X = X.assign(Male=[{'female' : 0, 'male' : 1}[item] for item in train_data["Sex"]])

#Titles
# get_dummies already names every indicator after its title and returns them in
# sorted order (Aristocratic, Master, Miss, Mr, Mrs). Overwriting the labels
# with a list in a different order attached the wrong name to the Master, Miss
# and Mr indicators, so the labels produced by get_dummies are kept.
# Any frame that is scored by a model trained on X must label its title
# indicators the same way.
title_dummies_titanic  = pd.get_dummies(train_data['Title'])
X = X.join(title_dummies_titanic)

#Pclass
# The classes 1, 2, 3 come out in that order, so the positional rename is safe.
class_dummies_titanic  = pd.get_dummies(train_data['Pclass'])
class_dummies_titanic.columns = ['Class1','Class2','Class3']
X = X.join(class_dummies_titanic)

#Embarked
# The ports come out as C, Q, S, so 'EmbarkedP' is the indicator of port 'C'.
embarked_dummies_titanic  = pd.get_dummies(train_data['Embarked'])
embarked_dummies_titanic.columns = ['EmbarkedP','EmbarkedQ','EmbarkedS']
X = X.join(embarked_dummies_titanic)

sns.heatmap(X.corr(), square=True, annot=True, annot_kws={'fontsize' :8}, cmap="OrRd")

In [None]:
# All columns of the feature frame:
X.columns

In [None]:
def run_model(regressor, data, X_features, y_features):
    """Cross-validate an estimator and print its mean accuracy and ROC AUC.

    The estimator is afterwards fitted on all rows of ``data``. When the fitted
    estimator exposes linear coefficients, one ``(coefficient, feature)`` pair is
    printed per feature; estimators without ``coef_`` simply skip that part.

    Args:
        regressor: scikit-learn estimator (a classifier, despite the name).
        data: frame holding both the feature and the target columns.
        X_features: list of feature column names.
        y_features: name of the target column.

    Uses the notebook-level ``accuracy_scorer``, ``roc_auc_scorer`` and
    ``cv_strategy``.
    """
    X_ = data[X_features]
    y_ = data[y_features]

    accuracy_scoring = cross_validation.cross_val_score(regressor, X_, y_, scoring = accuracy_scorer, cv = cv_strategy)
    roc_auc_scoring = cross_validation.cross_val_score(regressor, X_, y_, scoring = roc_auc_scorer, cv = cv_strategy)

    print("Accuracy mean:", accuracy_scoring.mean())
    print("ROC AUC mean:", roc_auc_scoring.mean())

    fitted = regressor.fit(X_, y_)
    try:
        coefficients = fitted.coef_[0]
    except (AttributeError, ValueError):
        # No linear coefficients available for this estimator (e.g. tree
        # ensembles), so there is nothing more to report.
        return

    for w in zip(['%.2f' % elem for elem in coefficients], X_.columns):
        print(w)
    
#lets create 2 scores to score our models
accuracy_scorer = metrics.make_scorer(metrics.accuracy_score)
# ROC AUC ranks continuous scores; computed on hard 0/1 predictions it collapses
# to a single operating point. needs_threshold=True makes the scorer feed the
# estimator's decision_function / predict_proba output to roc_auc_score.
roc_auc_scorer = metrics.make_scorer(metrics.roc_auc_score, needs_threshold=True)

# 20 stratified random 80/20 splits with a fixed seed.
cv_strategy = cross_validation.StratifiedShuffleSplit(X["Survived"], n_iter = 20 , 
                                                      test_size = 0.2, 
                                                      random_state = 2)

# Baseline: logistic regression on the full feature set.
lr = linear_model.LogisticRegression(C=0.1)
run_model(lr, X, ['Age', 'Family', 'Fare', 'Single', 'Cabin', 'BigFamily',
           'Male', 'Mr', 'Master', 'Miss', 'Mrs', 'Class2', 'Class3', 'EmbarkedQ',
           'EmbarkedS'], "Survived")

In [None]:
# Let's remove Fare (not statistically reasonable), Sex (the Title already
# contains it) and EmbarkedQ, then score the logistic regression again.

run_model(lr, X, ['Age', 'Family', 'Single', 'Cabin', 'BigFamily',
           'Mr', 'Master', 'Miss', 'Mrs', 'Class2', 'Class3',
           'EmbarkedS'], "Survived")

In [None]:
# Let's remove Single (we have Family) and add a separate feature that is 1
# only when both the Class3 and the Mr indicators are 1.
in_third_class = X["Class3"] == 1
has_mr_title = X["Mr"] == 1
X["Mr_Class3"] = (in_third_class & has_mr_title).astype(int)

run_model(lr, X,
          ['Age', 'Family', 'Cabin', 'Mr_Class3', 'Master', 'Miss', 'Mrs',
           'Class2', 'Class3', 'EmbarkedS'],
          "Survived")

In [None]:
# Ok, last chance. Let's do feature scaling for Age (zero mean, unit standard deviation).
age_mean, age_std = X["Age"].mean(), X["Age"].std()
X["Age"] = (X["Age"] - age_mean) / age_std
run_model(lr, X,
          ['Age', 'Family', 'Cabin', 'Mr_Class3', 'Master', 'Miss', 'Mrs',
           'Class2', 'Class3', 'EmbarkedS'],
          "Survived")

In [None]:
#Now lets try with Random forest
def run_model(regressor, data, X_features, y_features):
    """Cross-validate an estimator and print its mean accuracy and ROC AUC.

    This definition replaces the earlier ``run_model`` and leaves out the
    coefficient printout.

    Args:
        regressor: scikit-learn estimator to evaluate.
        data: frame holding both the feature and the target columns.
        X_features: list of feature column names.
        y_features: name of the target column.

    Uses the notebook-level ``accuracy_scorer``, ``roc_auc_scorer`` and
    ``cv_strategy``.
    """
    features, target = data[X_features], data[y_features]

    # Run both cross-validations first (accuracy, then ROC AUC), print afterwards.
    accuracy_mean, roc_auc_mean = [
        cross_validation.cross_val_score(
            regressor, features, target, scoring=scorer, cv=cv_strategy
        ).mean()
        for scorer in (accuracy_scorer, roc_auc_scorer)
    ]

    print("Accuracy mean:", accuracy_mean)
    print("ROC AUC mean:", roc_auc_mean)

# Random forest on the three feature sets tried so far.
# random_state makes the forest, and therefore the printed scores, reproducible.
rfc = RandomForestClassifier(n_estimators=200, random_state=2)

rf_feature_sets = [
    # all features
    ['Age', 'Family', 'Fare', 'Single', 'Cabin', 'BigFamily',
     'Male', 'Mr', 'Master', 'Miss', 'Mrs', 'Class2', 'Class3', 'EmbarkedQ',
     'EmbarkedS'],
    # without Fare, Male and EmbarkedQ
    ['Age', 'Family', 'Single', 'Cabin', 'BigFamily',
     'Mr', 'Master', 'Miss', 'Mrs', 'Class2', 'Class3',
     'EmbarkedS'],
    # with Mr_Class3 in place of Single, BigFamily and Mr
    ['Age', 'Family', 'Cabin', 'Mr_Class3',
     'Master', 'Miss', 'Mrs', 'Class2', 'Class3',
     'EmbarkedS'],
]
for rf_features in rf_feature_sets:
    run_model(rfc, X, rf_features, "Survived")

In [None]:
# The random forest score did not improve, so let's try GradientBoostingClassifier.
# random_state makes the boosted model, and therefore the printed scores, reproducible.
cls = GradientBoostingClassifier(random_state=2)

gb_feature_sets = [
    # all features
    ['Age', 'Family', 'Fare', 'Single', 'Cabin', 'BigFamily',
     'Male', 'Mr', 'Master', 'Miss', 'Mrs', 'Class2', 'Class3', 'EmbarkedQ',
     'EmbarkedS'],
    # without Fare, Male and EmbarkedQ
    ['Age', 'Family', 'Single', 'Cabin', 'BigFamily',
     'Mr', 'Master', 'Miss', 'Mrs', 'Class2', 'Class3',
     'EmbarkedS'],
    # with Mr_Class3 in place of Single, BigFamily and Mr
    ['Age', 'Family', 'Cabin', 'Mr_Class3',
     'Master', 'Miss', 'Mrs', 'Class2', 'Class3',
     'EmbarkedS'],
]
for gb_features in gb_feature_sets:
    run_model(cls, X, gb_features, "Survived")

In [None]:
# That was a little better. Let's try an SVM:
from sklearn.svm import SVC

# SVC is very sensitive to unscaled data, so Fare is standardised as well.
fare_mean, fare_std = X["Fare"].mean(), X["Fare"].std()
X["Fare"] = (X["Fare"] - fare_mean) / fare_std

# No kernel argument is passed, so SVC's default kernel is used despite the
# "lin_" prefix of the variable name.
lin_svm = SVC(C=0.5)

svm_feature_sets = (
    ['Age', 'Family', 'Fare', 'Single', 'Cabin', 'BigFamily',
     'Male', 'Mr', 'Master', 'Miss', 'Mrs', 'Class2', 'Class3', 'EmbarkedQ',
     'EmbarkedS'],
    ['Age', 'Family', 'Single', 'Cabin', 'BigFamily',
     'Mr', 'Master', 'Miss', 'Mrs', 'Class2', 'Class3',
     'EmbarkedS'],
    ['Age', 'Family', 'Cabin', 'Mr_Class3',
     'Master', 'Miss', 'Mrs', 'Class2', 'Class3',
     'EmbarkedS'],
)
for svm_features in svm_feature_sets:
    run_model(lin_svm, X, svm_features, "Survived")

In [None]:
# SVC predicted better than the other methods, so let's use it to predict
# survival for the test data. First check which test columns have missing values.
test_data.info()

In [None]:
# Fill the missing AGE values the same way as for the training data.
# Extract the title from the name. The pattern is a raw string because "\."
# inside a normal string literal is an invalid escape sequence.
test_title_pattern = re.compile(r' ([A-Za-z]+)\.')
test_data["Title"] = test_data["Name"].map(lambda x: test_title_pattern.search(x).group(1))

# Merge the rare titles into the five groups ("Dona" is handled here as well).
test_data["Title"] = test_data["Title"].map(lambda x: change_title(x, ["Capt", "Col", "Don", "Dona", 
                                                                         "Dr", "Jonkheer", "Lady", 
                                                                         "Major", "Rev", "Sir", "Countess"], "Aristocratic"))
test_data["Title"] = test_data["Title"].map(lambda x: change_title(x, ["Ms"], "Mrs"))
test_data["Title"] = test_data["Title"].map(lambda x: change_title(x, ["Mlle", "Mme"], "Miss"))

# Missing ages get the mean age of the passengers with the same Title and Pclass.
test_data["Age"] = test_data.groupby(['Title', 'Pclass'])['Age'].transform(lambda x: x.fillna(x.mean()))

In [None]:
#missing Fare
# A fare counts as missing when it is NaN or zero. Both are imputed in one pass
# with the class mean of the remaining fares, so the zero placeholders no longer
# pull that mean down.
def _impute_missing_fare(fares):
    """Return ``fares`` with NaN and zero entries replaced by the mean of the other values.

    A group without any valid fare is returned unchanged.
    """
    masked = fares.replace(0.0, np.nan)
    valid_mean = masked.mean()
    if pd.isnull(valid_mean):
        return fares
    return masked.fillna(valid_mean)

test_data["Fare"] = test_data.groupby(['Pclass'])['Fare'].transform(_impute_missing_fare)

In [None]:
# Missing cabin: keep only an availability flag (1 = cabin recorded, 0 = missing).
test_data["Cabin"] = test_data["Cabin"].notnull().astype(int)

In [None]:
# Family size and the "travelling alone" flag, computed as for the training data.
test_data["Family"] = test_data["SibSp"] + test_data["Parch"]
test_data["Single"] = (test_data["Family"] == 0).astype(int)
# Name and Ticket are not read again below. Both are dropped in one non-inplace
# call; errors="ignore" keeps the cell re-runnable once the columns are gone.
test_data = test_data.drop(["Name", "Ticket"], axis=1, errors="ignore")

In [None]:
# Build the test feature frame with the same columns as the training frame X.
# .copy() gives T its own data, so the column assignments below do not trigger
# pandas' SettingWithCopyWarning.
T = test_data[["Age", "Fare", "Single", "Cabin"]].copy()

T["BigFamily"] = (test_data["Family"] > 3).astype(int)
T["SmallFamily"] = (test_data["Family"] < 4).astype(int)

#Sex as Male
T = T.assign(Male=[{'female' : 0, 'male' : 1}[item] for item in test_data["Sex"]])

#Titles
# One indicator per title seen in training, in the sorted order get_dummies
# uses; a title that is absent from the test set becomes an all-zero column
# instead of shifting the remaining ones.
title_dummies_test  = pd.get_dummies(test_data['Title']).reindex(
    columns=sorted(train_data['Title'].unique()), fill_value=0)
# The labels are taken from the training dummies (same column order) rather than
# from a second hard-coded list, so a column name always refers to the same
# title in X and in T.
title_dummies_test.columns = list(title_dummies_titanic.columns)
T = T.join(title_dummies_test)

#Pclass
class_dummies_test  = pd.get_dummies(test_data['Pclass'])
class_dummies_test.columns = ['Class1','Class2','Class3']
T = T.join(class_dummies_test)

#Embarked
embarked_dummies_test  = pd.get_dummies(test_data['Embarked'])
embarked_dummies_test.columns = ['EmbarkedP','EmbarkedQ','EmbarkedS']
T = T.join(embarked_dummies_test)

In [None]:
# Scaling for the test data as well. The TRAINING mean and standard deviation
# are used, because the model has only seen features standardised with those
# statistics; scaling with the test set's own statistics would shift the inputs.
# train_data still holds the unscaled Age and Fare that X was standardised from.
T["Fare"] = (T["Fare"] - train_data["Fare"].mean())/train_data["Fare"].std()
T["Age"] = (T["Age"] - train_data["Age"].mean())/train_data["Age"].std()

In [None]:
# Look at the first rows of the final training features.
X.head()

In [None]:
# Candidate features whose importance is ranked in the next cell.
cols = ['Age', 'Fare', 'Single', 'Cabin', 'BigFamily', 'SmallFamily',
           'Male', 'Mr', 'Master', 'Miss', 'Mrs', 'Aristocratic', 'Class1', 'Class2', 'Class3']

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# Rank the candidate features by the feature_importances_ of an extra-trees
# ensemble; random_state keeps the ranking stable between runs.
clf = ExtraTreesClassifier(n_estimators=200, random_state=2)
clf = clf.fit(X[cols], X["Survived"])

features = pd.DataFrame()
features['feature'] = cols
features['importance'] = clf.feature_importances_
# DataFrame.sort was removed from pandas; sort_values is its replacement.
features.sort_values(['importance'], ascending=False)

In [None]:
# Feature subset used for the final models below.
cols = ['Age', 'Fare', 'Single', 'Cabin', 'BigFamily',
           'Male', 'Master', 'Miss', 'Mrs', 'Class2', 'Class3']

In [None]:
#and finally let's predict survivals!!!
# random_state makes the fitted forest, and the accuracy printed below, reproducible.
rfc = RandomForestClassifier(max_features='sqrt', n_estimators=210, criterion='gini',
                             max_depth=4, random_state=2)

rfc.fit(X[cols], X["Survived"])
# accuracy_score takes (y_true, y_pred). This is the accuracy on the rows the
# model was fitted on, not a held-out estimate.
print(metrics.accuracy_score(X["Survived"], rfc.predict(X[cols])))

In [None]:
# Fit the SVC on the selected features and report its accuracy on the rows it
# was fitted on.
svm = SVC(C=0.5)
svm.fit(X[cols], X["Survived"])
train_predictions = svm.predict(X[cols])
print(metrics.accuracy_score(train_predictions, X["Survived"]))

# Predictions for the test passengers; they go into the submission file.
predictions = svm.predict(T[cols])

In [None]:
# One row per test passenger: the id and the predicted Survived label.
submission_columns = {
    "PassengerId": test_data["PassengerId"],
    "Survived": predictions,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('titanic.csv', index=False)