In [33]:
# Libraries Used
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [34]:
# Data Collected
gender_submission= pd.read_csv("./gender_submission.csv")
test= pd.read_csv("./test.csv")
train= pd.read_csv("./train.csv")

In [35]:
test['FormalName']=test['Name'].str.split(",",expand=True)[1].str.split(".",expand=True)[0]
train['FormalName']=train['Name'].str.split(",",expand=True)[1].str.split(".",expand=True)[0]


In [36]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FormalName
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [37]:
train.FormalName

0         Mr
1        Mrs
2       Miss
3        Mrs
4         Mr
       ...  
886      Rev
887     Miss
888     Miss
889       Mr
890       Mr
Name: FormalName, Length: 891, dtype: object

In [50]:
temp=train.dropna()
temp.describe()
temp.corr(method ='kendall')

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FormalName
PassengerId,1.0,0.09221,-0.087215,-0.001569,1.0,-0.03376,-0.050156,-0.016736,0.02718,0.063237
Survived,0.09221,1.0,0.001163,-0.54832,0.09221,0.146362,0.061423,0.149746,-0.109695,-0.053599
Pclass,-0.087215,0.001163,1.0,-0.071882,-0.087215,-0.092206,0.080297,-0.449626,0.154241,-0.0373
Sex,-0.001569,-0.54832,-0.071882,1.0,-0.001569,-0.142187,-0.122391,-0.193768,0.072189,-0.039051
Age,1.0,0.09221,-0.087215,-0.001569,1.0,-0.03376,-0.050156,-0.016736,0.02718,0.063237
SibSp,-0.03376,0.146362,-0.092206,-0.142187,-0.03376,1.0,0.180695,0.317947,-0.041536,0.108508
Parch,-0.050156,0.061423,0.080297,-0.122391,-0.050156,0.180695,1.0,0.263646,0.027988,-0.115684
Fare,-0.016736,0.149746,-0.449626,-0.193768,-0.016736,0.317947,0.263646,1.0,-0.214516,-0.014206
Embarked,0.02718,-0.109695,0.154241,0.072189,0.02718,-0.041536,0.027988,-0.214516,1.0,0.002739
FormalName,0.063237,-0.053599,-0.0373,-0.039051,0.063237,0.108508,-0.115684,-0.014206,0.002739,1.0


In [38]:
# Fixing data

# Replace missing values
train['Age'] = train.groupby('FormalName').transform(lambda x: x.fillna(x.mean()))
test['Age'] = test.groupby('FormalName').transform(lambda x: x.fillna(x.mean()))
train.fillna(train.mean(), inplace=True)
test.fillna(test.mean(), inplace=True)

# Sex encoder
sex_le = preprocessing.LabelEncoder()
sex_le.fit(train.Sex)
train.Sex = sex_le.transform(train.Sex)
test.Sex = sex_le.transform(test.Sex)

# Embarked encoder
emb_le = preprocessing.LabelEncoder()
emb_le.fit(train.Embarked)
train.Embarked = emb_le.transform(train.Embarked)
test.Embarked = emb_le.transform(test.Embarked)

# Formal Name encoder
frm_le = preprocessing.LabelEncoder()
frm_le.fit(train.FormalName.append(test.FormalName))
train.FormalName = frm_le.transform(train.FormalName)
test.FormalName = frm_le.transform(test.FormalName)


# Cabin encoder
#cab_le = preprocessing.LabelEncoder()
#cab_le.fit(train.Cabin)
#train.Cabin = cab_le.transform(train.Cabin)
#test.Cabin = cab_le.transform(test.Cabin)


In [39]:
# Data analysis

#fare = train[['Pclass','Fare','Embarked']]
#fare.boxplot(by='Pclass',column=['Fare'])
#fare.boxplot(by='Embarked',column=['Fare'])
#test.describe()
train.corr(method ='kendall')

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FormalName
PassengerId,1.0,-0.00409,-0.026824,0.035079,1.0,-0.048394,0.000798,-0.008921,0.014102,0.039113
Survived,-0.00409,1.0,-0.323533,-0.543351,-0.00409,0.085915,0.133933,0.266229,-0.150419,-0.076397
Pclass,-0.026824,-0.323533,1.0,0.129325,-0.026824,-0.039552,-0.021019,-0.573531,0.075442,-0.06441
Sex,0.035079,-0.543351,0.129325,1.0,0.035079,-0.188694,-0.246536,-0.21348,0.106798,0.080593
Age,1.0,-0.00409,-0.026824,0.035079,1.0,-0.048394,0.000798,-0.008921,0.014102,0.039113
SibSp,-0.048394,0.085915,-0.039552,-0.188694,-0.048394,1.0,0.425241,0.358262,0.007914,-0.013242
Parch,0.000798,0.133933,-0.021019,-0.246536,0.000798,0.425241,1.0,0.33036,0.024229,-0.1174
Fare,-0.008921,0.266229,-0.573531,-0.21348,-0.008921,0.358262,0.33036,1.0,-0.046889,-0.002291
Embarked,0.014102,-0.150419,0.075442,0.106798,0.014102,0.007914,0.024229,-0.046889,1.0,0.066234
FormalName,0.039113,-0.076397,-0.06441,0.080593,0.039113,-0.013242,-0.1174,-0.002291,0.066234,1.0


In [40]:
# Default Values
train.Survived.value_counts(normalize=True)

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [41]:
# Default Submission
default_submission=pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': [0]*len(test.PassengerId)})
default_submission.to_csv("./submission.csv",index=False)


In [42]:
# Getting mae model
def get_mae(leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=leaf_nodes, random_state=1)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

def get_acc(leaf_nodes, train_X, val_X, train_y, val_y):
    #model = DecisionTreeClassifier(max_leaf_nodes=leaf_nodes, random_state=1)
    #model = DecisionTreeRegressor(max_leaf_nodes=leaf_nodes, random_state=1)
    model = RandomForestClassifier(n_estimators=leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    acc = accuracy_score(val_y, preds_val)
    return(acc)    

In [43]:
# Set features to use
titanic_features = ["Pclass", "Sex", "Parch", "Fare", "Embarked","FormalName","SibSp"]
y = train.Survived
X = train[titanic_features]
# Spliting data into training and validation
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0,test_size=0.25)
min_acc = 10
for leaf_nodes in range(2,100,2):
    acc = get_acc(leaf_nodes, train_X, val_X, train_y, val_y)
    if acc < min_acc:
        min_acc = acc
        best_leaf_nodes = leaf_nodes
print("Best leaf nodes: %d  \t\t Accuracy:  %f" %(best_leaf_nodes, min_acc))

Best leaf nodes: 2  		 Accuracy:  0.766816


In [44]:
#final_model = DecisionTreeClassifier(max_leaf_nodes=best_leaf_nodes,random_state=1)
#final_model = DecisionTreeClassifier(max_leaf_nodes=best_leaf_nodes,random_state=1)
final_model = RandomForestClassifier(n_estimators=best_leaf_nodes,random_state=0)
final_model.fit(X, y)
X_test = test[titanic_features]
preds_val = final_model.predict(X_test)
submission=pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': preds_val})
submission.to_csv("./submission.csv",index=False)