In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv


### Import libraries and data

In [2]:
# Import libraries
import seaborn as sns
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as sk
from statsmodels.stats import outliers_influence
import scipy.stats as stats

In [3]:
# Load the Titanic training and test CSV files into DataFrames
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')
# Display the dtype of every training column (last expression is the cell output)
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

### Data transformation

In [4]:
# Derive the same two 0/1 indicator columns on both the training and test frames
for frame in (train, test):
    # Children indicator: 1 when Age is strictly below 16.
    # A missing Age compares as False, so those passengers get 0.
    frame['Children'] = frame['Age'].lt(16).astype('int64')

    # Female indicator: 1 when Sex equals 'female'; a missing Sex compares as False -> 0
    frame['Female'] = frame['Sex'].eq('female').astype('int64')

In [5]:
# Encode dummies
def one_hot_encoding(df, col, dtype=int):
    """Return a new frame in which the columns listed in ``col`` are one-hot encoded.

    Each listed column is replaced by one indicator column per distinct value,
    named ``<column>_<value>``. The input frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    col : list of str
        Names of the columns to expand into indicator columns.
    dtype : dtype, default int
        Dtype of the generated indicator columns. It is set explicitly because
        the ``pd.get_dummies`` default depends on the pandas version (uint8 in
        older releases, bool from pandas 2.0). Integer 0/1 columns keep
        ``describe()`` output and products of two indicators numeric.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``df``.
    """
    return pd.get_dummies(df, columns=col, dtype=dtype)

# Columns expanded into dummies, and the dummy levels that are discarded afterwards
# (Pclass_3 plus the SibSp / Parch levels of 3 and above)
categorical_cols = ['Pclass', 'SibSp', 'Parch']
unused_dummies = [
    'Pclass_3',
    'SibSp_3', 'SibSp_4', 'SibSp_5', 'SibSp_8',
    'Parch_3', 'Parch_4', 'Parch_5', 'Parch_6',
]

# Apply the identical encode-then-drop step to both frames
train = one_hot_encoding(train, col=categorical_cols).drop(columns=unused_dummies)
test = one_hot_encoding(test, col=categorical_cols).drop(columns=unused_dummies)

In [6]:
# Summary statistics of the training frame after the indicator and dummy columns were added
train.describe()

Unnamed: 0,PassengerId,Survived,Age,Fare,Children,Female,Pclass_1,Pclass_2,SibSp_0,SibSp_1,SibSp_2,Parch_0,Parch_1,Parch_2
count,891.0,891.0,714.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,29.699118,32.204208,0.093154,0.352413,0.242424,0.20651,0.682379,0.234568,0.031425,0.760943,0.132435,0.089787
std,257.353842,0.486592,14.526497,49.693429,0.290811,0.47799,0.42879,0.405028,0.465813,0.423966,0.174562,0.426747,0.339154,0.286037
min,1.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,223.5,0.0,20.125,7.9104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,446.0,0.0,28.0,14.4542,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
75%,668.5,1.0,38.0,31.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
max,891.0,1.0,80.0,512.3292,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Create interaction variables
Two-way interactions: each new column is the product of a pair of indicator variables.

In [7]:
def create_interactions(var, df1, df2, interaction_list):
    """Add product columns between ``var`` and each listed column, in both frames.

    For every name ``other`` in ``interaction_list`` a column called
    ``<var>_<other>`` is written to ``df1`` and ``df2``, holding the
    element-wise product of the ``var`` and ``other`` columns of that frame.
    Both frames are modified in place and also returned.

    Parameters
    ----------
    var : str
        Name of the base column.
    df1, df2 : pandas.DataFrame
        Frames to extend; each must already contain ``var`` and every column
        named in ``interaction_list``.
    interaction_list : iterable of str
        Names of the columns to multiply with ``var``.

    Returns
    -------
    tuple
        ``(df1, df2)``, the same objects that were passed in.
    """
    for other in interaction_list:
        product_name = '_'.join((var, other))
        for frame in (df1, df2):
            frame[product_name] = frame[var] * frame[other]
    return df1, df2

In [8]:
# Indicator columns grouped by the variable they came from
pclass_cols = ['Pclass_1', 'Pclass_2']
sibsp_cols = ['SibSp_0', 'SibSp_1', 'SibSp_2']
parch_cols = ['Parch_0', 'Parch_1', 'Parch_2']

features = ['Children', 'Female'] + pclass_cols + sibsp_cols + parch_cols
train_X = train[features].copy()
test_X = test[features].copy()

# Children and Female are each crossed with every indicator that follows them;
# removing the base name first avoids a self-interaction and a duplicate pair.
for base in ('Children', 'Female'):
    features.remove(base)
    train_X, test_X = create_interactions(base, train_X, test_X, features)

# Class indicators are crossed with the sibling/spouse and parent/child indicators
for base in pclass_cols:
    train_X, test_X = create_interactions(base, train_X, test_X, sibsp_cols + parch_cols)

# Sibling/spouse indicators are crossed with the parent/child indicators
for base in sibsp_cols:
    train_X, test_X = create_interactions(base, train_X, test_X, parch_cols)

train_X.columns

Index(['Children', 'Female', 'Pclass_1', 'Pclass_2', 'SibSp_0', 'SibSp_1',
       'SibSp_2', 'Parch_0', 'Parch_1', 'Parch_2', 'Children_Female',
       'Children_Pclass_1', 'Children_Pclass_2', 'Children_SibSp_0',
       'Children_SibSp_1', 'Children_SibSp_2', 'Children_Parch_0',
       'Children_Parch_1', 'Children_Parch_2', 'Female_Pclass_1',
       'Female_Pclass_2', 'Female_SibSp_0', 'Female_SibSp_1', 'Female_SibSp_2',
       'Female_Parch_0', 'Female_Parch_1', 'Female_Parch_2',
       'Pclass_1_SibSp_0', 'Pclass_1_SibSp_1', 'Pclass_1_SibSp_2',
       'Pclass_1_Parch_0', 'Pclass_1_Parch_1', 'Pclass_1_Parch_2',
       'Pclass_2_SibSp_0', 'Pclass_2_SibSp_1', 'Pclass_2_SibSp_2',
       'Pclass_2_Parch_0', 'Pclass_2_Parch_1', 'Pclass_2_Parch_2',
       'SibSp_0_Parch_0', 'SibSp_0_Parch_1', 'SibSp_0_Parch_2',
       'SibSp_1_Parch_0', 'SibSp_1_Parch_1', 'SibSp_1_Parch_2',
       'SibSp_2_Parch_0', 'SibSp_2_Parch_1', 'SibSp_2_Parch_2'],
      dtype='object')

In [9]:
# Report the feature-matrix dimensions, then summarise only the first 20 columns
print(train_X.shape)
feature_summary = train_X.describe()
feature_summary.iloc[:, :20]

(891, 48)


Unnamed: 0,Children,Female,Pclass_1,Pclass_2,SibSp_0,SibSp_1,SibSp_2,Parch_0,Parch_1,Parch_2,Children_Female,Children_Pclass_1,Children_Pclass_2,Children_SibSp_0,Children_SibSp_1,Children_SibSp_2,Children_Parch_0,Children_Parch_1,Children_Parch_2,Female_Pclass_1
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.093154,0.352413,0.242424,0.20651,0.682379,0.234568,0.031425,0.760943,0.132435,0.089787,0.04826,0.006734,0.021324,0.025814,0.030303,0.006734,0.012346,0.043771,0.037037,0.105499
std,0.290811,0.47799,0.42879,0.405028,0.465813,0.423966,0.174562,0.426747,0.339154,0.286037,0.214436,0.08183,0.144544,0.158668,0.171516,0.08183,0.110485,0.2047,0.188959,0.307368
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Random forest with a fixed seed so repeated runs give the same model
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_split=5,
    random_state=12,
)
y_train = train['Survived']
model.fit(train_X, y_train)

# NOTE: the model is scored on the same rows it was fitted on, so this is an
# in-sample (training) accuracy, not an estimate of performance on unseen data.
train_predictions = model.predict(train_X)
print(sk.accuracy_score(y_train, train_predictions))

0.8237934904601572


In [11]:
# Predict survival for the test passengers and pair each prediction with its PassengerId
predictions = model.predict(test_X)
output = test[['PassengerId']].assign(Survived=predictions)

# Write the submission file without the index column, then preview the first rows
output.to_csv('random_forest.csv', index=False)
output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
