In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Load the train and test CSVs. `combine` holds both frames so that the
# preprocessing loops further down can apply the same step to each of them.
train, test = (pd.read_csv(path) for path in ('../input/train.csv', '../input/test.csv'))
combine = [train, test]

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# List the column names available in the training frame.
train.columns

In [None]:
# Columns that are never fed to the models (identifiers, the label, free text).
excl = ['PassengerId', 'Survived', 'Ticket', 'Cabin', 'Name']
# Everything else that exists in `train` right now becomes a model input.
# NOTE: when cells run top to bottom this list is fixed before any engineered
# column (child, Title, Deck, ...) is created, so those are not part of `cols`.
cols = train.columns[~train.columns.isin(excl)].tolist()

In [None]:
# Display the raw passenger names (the titles are parsed out of these later).
train['Name']

In [None]:
# Preview only the columns selected as model inputs.
train[cols].head()

In [None]:
# Number of training rows with Pclass == 3.
third_class = train['Pclass'] == 3
train.loc[third_class, 'Pclass'].count()

In [None]:
# Bar chart of row counts per Pclass value.
# The Series is passed as `x=`: recent seaborn releases treat the first
# positional argument as `data`, which would not count per category.
sns.countplot(x=train['Pclass'])

In [None]:
# The `Deck` column is only created further down the notebook, so on a fresh
# top-to-bottom run it does not exist yet and train['Deck'] raises KeyError.
# Plot the deck letter taken from the first character of `Cabin` instead.
sns.countplot(x=train['Cabin'].str[0].rename('Deck'))

In [None]:
# Count missing values per column in the training frame.
train.isnull().sum()

In [None]:
# Bar chart of row counts per Embarked value.
# Passed as `x=` because recent seaborn releases read a positional Series as `data`.
sns.countplot(x=train['Embarked'])

In [None]:
# Distribution of the known (non-missing) fares.
# NOTE(review): distplot is deprecated in newer seaborn releases - consider
# histplot/displot if the environment is upgraded.
fare_values = train['Fare'].dropna()
sns.distplot(fare_values)

In [None]:
# Bar chart of row counts per Parch value (missing entries dropped first).
# Passed as `x=` because recent seaborn releases read a positional Series as `data`.
sns.countplot(x=train['Parch'].dropna())

In [None]:
# Bar chart of row counts per SibSp value.
# Passed as `x=` because recent seaborn releases read a positional Series as `data`.
sns.countplot(x=train['SibSp'])

In [None]:
# Distribution of the known (non-missing) ages.
# NOTE(review): distplot is deprecated in newer seaborn releases - consider
# histplot/displot if the environment is upgraded.
known_ages = train['Age'].dropna()
sns.distplot(known_ages)

In [None]:
# Bar chart of row counts per Sex value.
# Passed as `x=` because recent seaborn releases read a positional Series as `data`.
sns.countplot(x=train['Sex'])

In [None]:
# Bar chart of row counts per Survived value (class balance of the label).
# Passed as `x=` because recent seaborn releases read a positional Series as `data`.
sns.countplot(x=train['Survived'])

In [None]:
# Flag passengers younger than 18 as children (1) and everyone else with a
# known age as adults (0); rows with a missing Age keep NaN.
# `.loc` writes straight into the frame. The original chained form
# (df["child"][mask] = ...) assigns through a temporary object and is not
# guaranteed to update `df`.
for df in combine:
    df['child'] = float('NaN')
    df.loc[df['Age'] < 18, 'child'] = 1
    df.loc[df['Age'] >= 18, 'child'] = 0

In [None]:
# Share of each Survived value among rows flagged child == 1.
is_child = train["child"] == 1
train.loc[is_child, "Survived"].value_counts(normalize=True)

In [None]:
# Share of each Survived value among rows flagged child == 0.
is_adult = train["child"] == 0
train.loc[is_adult, "Survived"].value_counts(normalize=True)

In [None]:
# Encode Sex as integers (male -> 0, female -> 1).
# A single map replaces the original chained assignment
# (df["Sex"][mask] = ...), which writes through a temporary object and is
# not guaranteed to update the frame. Values other than "male"/"female"
# (including already-encoded 0/1 when the cell is re-run) pass through
# unchanged, so the cell stays safe to execute more than once.
sex_codes = {"male": 0, "female": 1}
for df in combine:
    df["Sex"] = df["Sex"].map(lambda value: sex_codes.get(value, value))

In [None]:
# Age histograms in a Pclass (rows) x Sex (columns) grid.
# `height` is the current name of the per-facet size argument; the older
# `size` keyword was renamed and is rejected by recent seaborn releases.
grid = sns.FacetGrid(train, row='Pclass', col='Sex', height=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', alpha=.5, bins=20)
grid.add_legend()

In [None]:
# Zero-initialised 2 x 3 table that the imputation cell fills with one age
# guess per (Sex code, Pclass - 1) pair.
guess_ages = np.zeros(shape=(2, 3))
guess_ages

In [None]:
# Fill missing ages with the median age of rows sharing the same
# (Sex, Pclass) pair, computed separately for each frame in `combine`.
for frame in combine:
    sex_codes = range(2)
    class_offsets = range(3)

    # Pass 1: per-group median of the known ages, rounded to the nearest 0.5.
    for sex in sex_codes:
        for offset in class_offsets:
            in_group = (frame['Sex'] == sex) & (frame['Pclass'] == offset + 1)
            known_ages = frame[in_group]['Age'].dropna()
            median_age = known_ages.median()
            guess_ages[sex, offset] = int(median_age / 0.5 + 0.5) * 0.5

    # Pass 2: write each group's guess into that group's missing ages.
    for sex in sex_codes:
        for offset in class_offsets:
            needs_age = (frame.Age.isnull()) & (frame.Sex == sex) & (frame.Pclass == offset + 1)
            frame.loc[needs_age, 'Age'] = guess_ages[sex, offset]

    # From here on ages are whole numbers (the fractional part is truncated).
    frame['Age'] = frame['Age'].astype(int)



In [None]:
#Helper for locating the first matching substring
def substrings_in_string(big_string, substrings):
    """Return the first entry of ``substrings`` that occurs in ``big_string``.

    Candidates are tried in the order given, so an earlier entry wins when
    several of them match. ``np.nan`` is returned when none is found.
    """
    matches = (candidate for candidate in substrings if candidate in big_string)
    return next(matches, np.nan)

In [None]:
#Map titles
# Titles this notebook recognises. 'Lady', 'Sir' and 'Dona' are listed because
# the rare-title cell that follows expects to find them in the Title column.
title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer', 'Lady', 'Sir', 'Dona']
for df in combine:
    # Take the word directly followed by the first period ("..., Mr. ...").
    # A plain substring search would match title text anywhere in the name
    # (for example 'Don' inside 'Dona', or 'Mr' inside a quoted alias) and
    # depends on the order of title_list.
    extracted = df['Name'].astype(str).str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Anything that is not a recognised title stays missing, as before.
    df['Title'] = extracted.where(extracted.isin(title_list))

In [None]:
#Collapse rare titles into one bucket and fold spelling variants together
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_variants = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
for frame in combine:
    frame['Title'] = (
        frame['Title']
        .replace(rare_titles, 'Rare')
        .replace(title_variants)
    )

In [None]:
# Mean of Survived for each Title value.
train.groupby('Title', as_index=False)['Survived'].mean()

In [None]:
#Convert titles to numeric codes; anything without a code becomes 0
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for frame in combine:
    frame['Title'] = frame['Title'].map(title_mapping).fillna(0)



In [None]:
# Bar chart of row counts per Title code.
# Passed as `x=` because recent seaborn releases read a positional Series as `data`.
sns.countplot(x=train['Title'])

In [None]:
#Derive a deck letter from the cabin string
# Letters are tried in this order, so the first listed letter found wins.
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
for frame in combine:
    cabin_text = frame['Cabin'].astype(str)
    frame['Deck'] = cabin_text.map(lambda cabin: substrings_in_string(cabin, cabin_list))

In [None]:
# Convert the Deck classes to integer form
# One map replaces eight chained assignments (df["Deck"][mask] = ...), which
# write through a temporary object and are not guaranteed to update the
# frame. Values without a code (missing decks, or codes already assigned on
# a re-run) pass through unchanged.
deck_codes = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "T": 8}
for df in combine:
    df["Deck"] = df["Deck"].map(lambda value: deck_codes.get(value, value))

In [None]:
# Impute the Deck variable: rows without a deck get code 0
for frame in combine:
    frame["Deck"] = frame["Deck"].fillna(value=0)

In [None]:
#Family size feature: SibSp + Parch + 1
# (the +1 presumably counts the passenger themselves)
for frame in combine:
    relatives = frame['SibSp'] + frame['Parch']
    frame['Family_size'] = relatives + 1

In [None]:
#Create fare per person
# Family_size is already SibSp + Parch + 1, so it is the full head count.
# Dividing by Family_size + 1 (as before) added one person too many and,
# for example, halved the fare of a passenger with Family_size == 1.
for df in combine:
    df['Fare_Per_Person'] = df['Fare'] / df['Family_size']

In [None]:
#isAlone feature: 1 when Family_size is exactly 1, otherwise 0
for frame in combine:
    travels_alone = frame['Family_size'] == 1
    frame['isAlone'] = travels_alone.astype(int)

In [None]:
# Mean of Survived for each isAlone value.
train.groupby('isAlone', as_index=False)['Survived'].mean()

In [None]:
# Preview the model input columns of the test frame.
test[cols].head()

In [None]:
# Fraction of test rows with a missing value, per model input column.
missing_per_column = test[cols].isnull().sum()
null_counts = missing_per_column / len(test[cols])

In [None]:
# Replace every remaining missing value in the test frame's model input
# columns with 0. `null_counts` was computed before this fill, so it still
# reflects the original missingness.
test[cols] = test[cols].fillna(0)

In [None]:
# Preview the test inputs again after the fill.
test[cols].head()

In [None]:
# Bar chart of the missing-value fraction per test input column.
bar_positions = np.arange(len(null_counts))
plt.figure(figsize=(16, 8))
plt.xticks(bar_positions, null_counts.index, rotation='vertical')
plt.ylabel('fraction of rows with missing data')
plt.bar(bar_positions, null_counts)

In [None]:
# Impute and encode Embarked (S -> 0, C -> 1, Q -> 2).
# A single map replaces the original chained assignments
# (df["Embarked"][mask] = ...), which write through a temporary object and
# are not guaranteed to update the frame. Values without a code (including
# codes already assigned on a re-run) pass through unchanged.
embarked_codes = {"S": 0, "C": 1, "Q": 2}
for df in combine:
    # Impute the Embarked variable
    df["Embarked"] = df["Embarked"].fillna("S")

    # Convert the Embarked classes to integer form
    df["Embarked"] = df["Embarked"].map(lambda value: embarked_codes.get(value, value))

In [None]:
# Label vector for training, as a NumPy array.
survived_column = train["Survived"]
target = survived_column.values

In [None]:
# Training feature matrix: one column per entry of `cols`, in that order.
feature_frame = train[cols]
features = feature_frame.values

In [None]:
# Final look at the model inputs after preprocessing.
train[cols].head()

In [None]:
# Baseline: logistic regression with default settings, fitted on the full
# training set.
logr = LogisticRegression()
logr.fit(features, target)

In [None]:
# Accuracy on the same rows the model was fitted on (not a held-out estimate).
logr.score(features, target)

In [None]:
# Pair every model input with its logistic-regression coefficient.
# `features` was built from train[cols], so coef_[0] has one entry per name
# in `cols`, in the same order. The previous version dropped the first
# column name with .delete(0), which shifted every label by one position
# and silently discarded the last coefficient.
coeff_df = pd.DataFrame(list(cols), columns=['Feature'])
coeff_df["Correlation"] = logr.coef_[0]

coeff_df.sort_values(by='Correlation', ascending=False)

In [None]:
# Random forest with 100 trees and a fixed random_state.
forest_params = dict(n_estimators=100, random_state=10, verbose=0)
rfr = RandomForestClassifier(**forest_params)

In [None]:
# Fit the forest on the full training set; `rfmod` is the fitted estimator.
rfmod = rfr.fit(features, target)

In [None]:
# Accuracy on the same rows the forest was fitted on (not a held-out estimate).
rfmod.score(features, target)

In [None]:
# Extra-trees ensemble: 100 trees limited to depth 4, using all CPU cores
# (n_jobs=-1) and a fixed random_state. This is the model used for the
# final predictions below.
etc = ExtraTreesClassifier(n_estimators=100, max_depth=4, n_jobs=-1, random_state=1, verbose=0)
etcmod = etc.fit(features, target)

In [None]:
# Per-feature importance scores from the fitted extra-trees model.
fi = etcmod.feature_importances_

In [None]:
# Table pairing each importance score with the name of its input column.
importances = pd.DataFrame(fi, columns=['importance']).assign(feature=cols)

In [None]:
# Show the features from most to least important.
importances.sort_values(by='importance', ascending=False)

In [None]:
# Test feature matrix, using the same columns and order as the training matrix.
test_features = test[cols].values

In [None]:
# Predict the label for every test row with the extra-trees model.
pred = etcmod.predict(test_features)

In [None]:
# Submission frame: predictions indexed by the integer PassengerId.
PassengerId = test["PassengerId"].values.astype(int)
my_solution = pd.DataFrame(data=pred, index=PassengerId, columns=["Survived"])

In [None]:
# Write the predictions to extraTrees.csv; the index is written under a
# "PassengerId" header.
my_solution.to_csv("extraTrees.csv", index_label = ["PassengerId"])