## Import data

In [114]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from collections import Counter

In [115]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [116]:
def analize(x):
    x['NameLength'] = x['Name'].apply(len)
    x['HasCabin'] = x['Cabin'].apply(lambda i: 1 if type(i)==str else 0)
    x['FamilySize'] = 1 + x['Parch'] + x['SibSp']
    
    x['IsAlone'] = 0
    x.loc[x['FamilySize']==1, 'IsAlone'] = 1

    x['Embarked'] = x['Embarked'].fillna('S')
    x['Embarked'] = x['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

    x['Fare'] = x['Fare'].fillna(x['Fare'].median())
    CategoricalFare = pd.cut(x['Fare'], 4)
    # output: [(-0.512, 128.082] < (128.082, 256.165] < (256.165, 384.247] < (384.247, 512.329]]
    x.loc[x['Fare']<=128, 'Fare'] = 0
    x.loc[(x['Fare']>128) & (x['Fare']<=256), 'Fare'] = 1
    x.loc[(x['Fare']>256) & (x['Fare']<=384), 'Fare'] = 2
    x.loc[x['Fare']>384, 'Fare'] = 3
    x['Fare'] = x['Fare'].astype(int)


    # --- Age
    mean = x['Age'].mean()
    std = x['Age'].std()
    randomAge = np.random.randint(mean-std, mean+std, size=x['Age'].isnull().sum())
    x['Age'][np.isnan(x['Age'])] = randomAge
    CategoricalAge = pd.cut(x['Age'], 5)
    # output: [(0.34, 16.336] < (16.336, 32.252] < (32.252, 48.168] < (48.168, 64.084] < (64.084, 80.0]]
    x.loc[x['Age']<=16 , 'Age'] = 0
    x.loc[(x['Age']>16) & (x['Age']<=32), 'Age'] = 1
    x.loc[(x['Age']>32) & (x['Age']<=48), 'Age'] = 2
    x.loc[(x['Age']>48) & (x['Age']<=64), 'Age'] = 3
    x.loc[x['Age']>64, 'Age'] = 4
    x['Age'] = x['Age'].astype(int)

    # --- Title
    x['Title'] = [i.split(', ')[1].split('.')[0] for i in x['Name']]
    x['Title'] = x["Title"].replace(['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    x['Title'] = x['Title'].replace(['Mlle', 'Ms'], 'Miss')
    x['Title'] = x['Title'].replace('Mme', 'Mrs')
    x['Title'] = x['Title'].map({"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Rare": 4})

    x['Sex'] = x['Sex'].map({'male':0, 'female':1}).astype(int)
    
    x = x.drop(columns=['PassengerId','Name','SibSp','Parch','Ticket','Cabin'])
    return x

## 

## Prediction

In [117]:
trainLen = len(df_train)
train = analize(df_train)
test = analize(df_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [118]:
test.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,NameLength,HasCabin,FamilySize,IsAlone,Title
0,3,0,2,0,2,16,0,1,1,0
1,3,1,2,0,0,32,0,2,0,2
2,2,0,3,0,2,25,0,1,1,0
3,3,0,1,0,0,16,0,1,1,0
4,3,1,1,0,0,44,0,3,0,2


In [119]:
y_train = train['Survived'].values
X_train = train.drop(columns=['Survived']).values
X_test = test.values

In [120]:
classifier = RandomForestClassifier(n_estimators=500, max_depth=6, max_features='sqrt')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [121]:
result = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': y_pred.astype(int)
})

In [122]:
result.to_csv('result.csv', index=False)