In [None]:
# Import modules
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split

# Render matplotlib figures inline in the notebook output and
# activate seaborn's default plot styling for all later figures
%matplotlib inline
sns.set()

In [None]:
# Show which files are available in the input directory
input_dir = '../input'
os.listdir(input_dir)

In [None]:
# Read the training and test CSV files into separate frames
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [None]:
# Print (rows, columns) of the training frame, then display its first five rows
train_shape = df_train.shape
print(train_shape)
df_train.head(n=5)

In [None]:
# Print (rows, columns) of the test frame, then display its first five rows
test_shape = df_test.shape
print(test_shape)
df_test.head(n=5)

In [None]:
# Stack the training rows followed by the test rows into a single frame so the
# feature engineering below is applied to both at once.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0. As with append, each input keeps its own row labels,
# so labels repeat across the two parts; later cells slice by position (iloc).
df = pd.concat([df_train, df_test], sort=False)
df.info()

In [None]:
# Dealing with missing numerical variables
df['Age'] = df.Age.fillna(df.Age.median())
df['Fare'] = df.Fare.fillna(df.Fare.median())
df.info()

In [None]:
#df = pd.get_dummies(df, columns=['Sex'], drop_first=True)

In [None]:
# Surname is the part of the name before the first comma
name_parts = df['Name'].str.split(',')
df['Surname'] = name_parts.str.get(0)

In [None]:
# Title is the text after the first comma up to and including the first period,
# e.g. "Surname, Mr. Given Names" -> "Mr.".
# Capturing up to the period keeps multi-word titles whole ("the Countess.");
# taking only the first whitespace-separated token would return just "the".
# Single-word titles come out exactly as before. Names without a comma followed
# by a period yield NaN.
df['Title'] = df['Name'].str.extract(r',\s*([^.]+\.)', expand=False)

In [None]:
#df['Cabin Len'] = df.Cabin.str.split().str.len()

In [None]:
# First character of the cabin string (stays NaN where Cabin is missing)
df['Cabin Letter'] = df['Cabin'].str.get(0)

In [None]:
# Sum of the SibSp and Parch columns (the passenger is not counted)
df['Family_Size'] = df['SibSp'].add(df['Parch'])

In [None]:
# Fare divided by the family size plus one (the +1 adds the passenger)
party_size = df['Family_Size'] + 1
df['Fare Per Person'] = df['Fare'] / party_size

In [None]:
# For every row, the number of rows in the combined frame sharing its Ticket value
tickets_by_value = df.groupby('Ticket')['Ticket']
df['Number of Ticket Uses'] = tickets_by_value.transform('count')

In [None]:
# Fare split evenly across all rows that share the same ticket
df['Average Fare per Person'] = df['Fare'].div(df['Number of Ticket Uses'])

In [None]:
# Replace every object-dtype (text) column with its integer category codes.
# Missing values receive the code -1.
object_cols = [name for name in df.columns if df[name].dtype == 'object']
for name in object_cols:
    df[name] = df[name].astype('category').cat.codes

In [None]:
# For RandomForest/Decision Tree models it can be useful to replace NA with a value less than the minimum or greater than the maximum
#df.fillna(-1, inplace=True)

In [None]:
# Split the combined frame back into its two parts by position: the training
# rows come first because df was built as train followed by test.
# The boundary is taken from df_train rather than a hard-coded 891 so the split
# stays correct if the number of training rows changes.
n_train_rows = len(df_train)
data_train = df.iloc[:n_train_rows].copy()
data_test = df.iloc[n_train_rows:].copy()

In [None]:
# Hold out 20% of the training rows for validation; the fixed seed keeps the split repeatable
train, test = train_test_split(
    data_train,
    random_state=42,
    test_size=0.2,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyperparameters of the forest that is scored on the hold-out split
rf_params = {
    'n_estimators': 100,
    'min_samples_leaf': 2,
    'max_features': .5,
    'random_state': 42,
}
rf = RandomForestClassifier(**rf_params)

In [None]:
# Columns excluded from the model inputs (the target, the id, and raw columns)
remove = ['Survived', 'PassengerId', 'Name', 'Cabin', 'Embarked']
# Every remaining column, in its original order, is used as a feature
feats = df.columns[~df.columns.isin(remove)].tolist()

In [None]:
# Fit the forest on the training part of the split
X_train, y_train = train[feats], train['Survived']
rf.fit(X_train, y_train)

In [None]:
# Predictions on the rows the model was fitted on
train_features = train[feats]
preds_train = rf.predict(train_features)

In [None]:
# Predictions on the held-out rows
holdout_features = test[feats]
preds = rf.predict(holdout_features)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy on the rows used for fitting
accuracy_score(y_true=train['Survived'], y_pred=preds_train)

In [None]:
# Accuracy on the held-out rows
accuracy_score(y_true=test['Survived'], y_pred=preds)

In [None]:
# Fresh forest for the final fit.
# NOTE(review): min_samples_leaf is 3 here but 2 in the forest scored on the
# hold-out split earlier in the notebook - confirm the difference is intentional.
rf = RandomForestClassifier(
    n_estimators=100,
    min_samples_leaf=3,
    max_features=.5,
    random_state=42,
)

In [None]:
# Fit on all rows of data_train (both the training and hold-out parts of the earlier split)
X_full, y_full = data_train[feats], data_train['Survived']
rf.fit(X_full, y_full)

In [None]:
# Predictions for the test rows that go into the submission
kaggle_features = data_test[feats]
preds_kaggle = rf.predict(kaggle_features)

In [None]:
# Two-column submission (id + predicted label), both cast to int, written without the index
submission_columns = {
    'PassengerId': data_test['PassengerId'],
    'Survived': preds_kaggle,
}
submission = pd.DataFrame(submission_columns, dtype=int)
submission.to_csv("submission.csv", index=False)