In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import re

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Hypothesis generation:

Breaking the problem down: how did the survivors manage to survive, and what does that imply about the characteristics survivors are likely to share?
* Securing a place on a lifeboat
    * Being close to the lifeboats
        * --> Having a cabin on a high deck
    * Being prioritised in the queue
        * --> Being a woman (due to 'women and children first' policy)
        * --> Being a child (due to 'women and children first' policy)
        * --> Having children (parents are likely to have put their children ahead of themselves)
        * --> Travelling with parents (children are likely to have been prioritised over their parents)
        * --> Working on the ship (crew are likely to have stayed behind to help others)

* Choosing to stay
    * --> Being a man
    * --> Being a married woman (may have chosen to stay with husband)    
    
* Being rescued from the water
    * Being able to swim
    * Being strong / physically fit enough to tolerate the cold water  
    * --> Being a young adult    

# **Import data:**

In [None]:
# Load the Kaggle Titanic train/test splits.
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')

# Tag each row with its origin so the combined frame can be split again later.
train_data['train_test'] = 1
test_data['train_test'] = 0
# The test split has no labels; use np.nan (the `np.NaN` alias was removed in NumPy 2.0).
test_data['Survived'] = np.nan

# Stack both splits so feature engineering is applied to them identically.
# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating the row labels of each split.
all_data = pd.concat([train_data, test_data], ignore_index=True)

In [None]:
# Preview the first rows of the combined train + test frame.
all_data.head()

# EDA:

In [None]:
# Preview the first rows of the training split.
train_data.head()

In [None]:
# Summary statistics for the training split.
train_data.describe()

In [None]:
# Survived aggregated per Sex (pivot_table's default aggregation is the mean).
train_data.pivot_table(values=['Survived'], columns=['Sex'])

In [None]:
# Bar plot of Survived against Pclass for the training split.
sns.barplot(x='Pclass', y='Survived', data=train_data)

In [None]:
# Age distribution per Pclass, with each violin split by Survived.
sns.violinplot(x="Pclass", y="Age", hue="Survived", data=train_data, split=True)

# Feature engineering:
* Impute missing values:
    * Use correlations with other features
    * Research missing data online
* Create new, useful features
* Normalise features
* Bin numeric features
* Group categorical features

In [None]:
# Number of missing values per column in the combined frame.
all_data.isnull().sum()

## Embarked:

In [None]:
# Fill missing Embarked values with 'S'.
# NOTE(review): presumably chosen as the most common port — confirm against the data.
all_data['Embarked'] = all_data['Embarked'].fillna('S')

## Fare:

In [None]:
# Median Fare for every (Pclass, Parch, SibSp) combination.
fare_medians = all_data.groupby(['Pclass', 'Parch', 'SibSp'])['Fare'].median()
# Fill the missing Fare with the median of 3rd-class passengers travelling alone
# (Pclass=3, Parch=0, SibSp=0).
med_fare = fare_medians.loc[(3, 0, 0)]
all_data['Fare'] = all_data['Fare'].fillna(med_fare)

In [None]:
# Replace the numeric Fare with 13 quantile-based bins.
# NOTE: this overwrites Fare in place, so re-running the cell on its own operates on already-binned values.
all_data['Fare'] = pd.qcut(all_data['Fare'], 13)

## **Age:**

In [None]:
# Absolute pairwise correlations, flattened into one row per feature pair.
# Only numeric columns are used: from pandas 2.0 DataFrame.corr() raises on
# string columns (Name, Sex, ...) instead of silently dropping them.
all_data_corr = (
    all_data.select_dtypes(include='number')
    .corr()
    .abs()
    .unstack()
    .sort_values(kind="quicksort", ascending=False)
    .reset_index()
    .rename(columns={"level_0": "Feature 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'})
)
# Show how strongly each numeric feature correlates with Age.
all_data_corr[all_data_corr['Feature 1'] == 'Age']

In [None]:
# Median Age per (Sex, Pclass) group. The Age column is selected before
# aggregating: calling .median() on the whole grouped frame raises on
# non-numeric columns from pandas 2.0 onwards.
age_by_pclass_sex = all_data.groupby(['Sex', 'Pclass'])['Age'].median()

print(age_by_pclass_sex)

# Fill each missing Age with the median of that passenger's (Sex, Pclass) group.
# transform('median') broadcasts the group median back onto every row.
group_median_age = all_data.groupby(['Sex', 'Pclass'])['Age'].transform('median')
all_data['Age'] = all_data['Age'].fillna(group_median_age)

In [None]:
# Replace the numeric Age with 10 quantile-based bins.
# NOTE: this overwrites Age in place, so re-running the cell on its own operates on already-binned values.
all_data['Age'] = pd.qcut(all_data['Age'], 10)

## Name:

In [None]:
def extract_title(name):
    """Return the first whitespace-delimited token in `name` that ends with a dot.

    Example: 'Braund, Mr. Owen Harris' -> 'Mr.' (the trailing dot is kept).

    Parameters
    ----------
    name : str
        Full passenger name.

    Returns
    -------
    str or None
        The matched title including its trailing dot, or None when the name
        contains no such token (previously this raised AttributeError).
    """
    # Raw string: '\S' in a normal string literal is an invalid escape sequence.
    match = re.search(r'\S+\.', name)
    return match.group(0) if match else None

In [None]:
# Derive a Title column by applying extract_title to every Name.
all_data['Title'] = all_data['Name'].apply(extract_title)

In [None]:
# Frequency of each extracted title before grouping.
all_data['Title'].value_counts()

In [None]:
# Collapse the extracted titles into two coarse groups; titles not listed are left unchanged.
female_titles = ['Miss.', 'Mrs.','Ms.', 'Mlle.', 'Lady.', 'Mme.', 'Countess.', 'Dona.']
rare_titles = ['Dr.', 'Col.', 'Major.', 'Jonkheer.', 'Capt.', 'Sir.', 'Don.', 'Rev.']
title_groups = {
    **dict.fromkeys(female_titles, 'Miss/Mrs/Ms'),
    **dict.fromkeys(rare_titles, 'Dr/Military/Noble/Clergy'),
}
all_data['Title'] = all_data['Title'].replace(title_groups)

In [None]:
# Frequency of each title after grouping.
all_data['Title'].value_counts()

In [None]:
# Bar plot of how many passengers fall into each title group (counts computed once).
title_counts = all_data['Title'].value_counts()
sns.barplot(x=title_counts.index, y=title_counts.values)

In [None]:
# Flag passengers whose title is 'Mrs.' (1) versus everyone else (0).
# The flag is derived from the raw Name column rather than from Title: extracted
# titles keep their trailing dot ('Mrs.') and have already been merged into
# 'Miss/Mrs/Ms' by this point, so comparing Title to 'Mrs' never matched and the
# column stayed all zeros. A single vectorised assignment also avoids the chained
# `df[col].loc[mask] = value` pattern, which may write to a temporary copy.
all_data['Is_Married'] = (all_data['Name'].apply(extract_title) == 'Mrs.').astype(int)

## Cabin:

In [None]:
# Frequency of each raw Cabin value.
all_data['Cabin'].value_counts()

In [None]:
def simplify_cabin(cabin):
    """Reduce a cabin string to its first character (e.g. 'C85' -> 'C').

    `cabin` must be a non-empty string; indexing an empty string raises IndexError.
    """
    return cabin[0]

In [None]:
# Fill missing Cabin values with the placeholder 'M' so every entry is a string
# before the first character is extracted.
# NOTE(review): 'M' presumably stands for "Missing" — confirm.
all_data['Cabin'] = all_data['Cabin'].fillna('M')

In [None]:
# Keep only the first character of each Cabin value in a new simp_Cabin column.
all_data['simp_Cabin'] = all_data['Cabin'].apply(simplify_cabin)

In [None]:
# Merge the single-letter cabin codes into three groups; any other code is left unchanged.
cabin_groups = {
    'A': 'ABC', 'B': 'ABC', 'C': 'ABC',
    'D': 'DE', 'E': 'DE',
    'F': 'FG', 'G': 'FG',
}
all_data['simp_Cabin'] = all_data['simp_Cabin'].replace(cabin_groups)

all_data['simp_Cabin'].value_counts()

In [None]:
# The raw Cabin column is no longer needed once simp_Cabin exists.
all_data = all_data.drop(columns=['Cabin'])

## Family size:

In [None]:
# Family size = SibSp + Parch + 1 (the +1 presumably counts the passenger themself).
all_data['Family_Size'] = all_data['SibSp'] + all_data['Parch'] + 1

In [None]:
# Map numeric family sizes onto four labelled buckets; sizes not listed are left unchanged.
family_size_labels = {
    1: 'Alone',
    2: 'Small', 3: 'Small', 4: 'Small',
    5: 'Medium', 6: 'Medium',
    7: 'Large', 8: 'Large', 11: 'Large',
}
all_data['Family_Size'] = all_data['Family_Size'].replace(family_size_labels)

In [None]:
# Bar plot of how many passengers fall into each family-size bucket (counts computed once).
family_size_counts = all_data['Family_Size'].value_counts()
sns.barplot(x=family_size_counts.index, y=family_size_counts.values)

In [None]:
# Passenger counts per family-size bucket, split by Survived.
sns.countplot(x='Family_Size', hue='Survived', data=all_data)

# Encode categorical values and select features:

In [None]:
# Cast Pclass to string so it is one-hot encoded as a category rather than kept as a number.
all_data['Pclass'] = all_data['Pclass'].astype(str)

In [None]:
# Columns fed to the models; train_test is carried along only so the rows can be split again.
feature_columns = [
    'Pclass', 'Sex', 'Age', 'Family_Size', 'Fare',
    'Embarked', 'simp_Cabin', 'Title', 'Is_Married', 'train_test',
]
# One-hot encode the selected columns.
all_dummies = pd.get_dummies(all_data[feature_columns])

# (Scale features:)

# Split training and test data:

In [None]:
# Split the encoded frame back into its original train / test rows via the train_test marker.
is_train_row = all_dummies['train_test'] == 1
is_test_row = all_dummies['train_test'] == 0
X_train = all_dummies.loc[is_train_row].drop(columns=['train_test'])
X_test = all_dummies.loc[is_test_row].drop(columns=['train_test'])

# Labels exist only for the training rows.
y_train = all_data.loc[all_data['train_test'] == 1, 'Survived']
y_train.shape

# Define and instantiate model:

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Candidate classifiers, numbered in the order they are reported below.
model_1 = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
model_2 = GaussianNB()
model_3 = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
model_4 = DecisionTreeClassifier(criterion='entropy', random_state=0)
model_5 = LogisticRegression(random_state=0)
model_6 = XGBClassifier()
model_7 = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)

models = [model_1, model_2, model_3, model_4, model_5, model_6, model_7]

# Validate models: print the mean 5-fold cross-validation score of each candidate.
for model_idx, model in enumerate(models, start=1):
    cv = cross_val_score(model, X_train, y_train, cv=5)
    print(f'model {model_idx}')
    print(cv.mean())

In [None]:
# Final model used by the fit and prediction cells below.
model = model_7

# Earlier experiment, kept for reference only (not executed):
#from sklearn.svm import SVC
# model = SVC(probability = True)
## score: 0.73684
# NOTE(review): the note above does not say whether this is a cross-validation or a leaderboard score.

# Fit model:

In [None]:
# Train the selected model on the full training set.
model.fit(X_train, y_train)

# Make prediction:

In [None]:
# Predict for the test rows and cast the predicted labels to int.
predictions = model.predict(X_test).astype(int)

# Model validation:

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the selected model: per-fold scores, then their mean.
cv = cross_val_score(model, X_train, y_train, cv=5)
print(cv, cv.mean(), sep='\n')

# Submit:

In [None]:
# Pair each test PassengerId with its predicted label and write the submission CSV.
submission_columns = {
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions,
}
output = pd.DataFrame(submission_columns)
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
# Display the submission frame.
output