In [None]:
# Load Libraries
import pandas as pd
from pandas import Series,DataFrame
import csv
import sklearn
from sklearn.linear_model import LogisticRegression

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

Read train dataset

In [None]:
# Load the training data; the path is relative to the notebook's working directory
df=pd.read_csv("../input/train.csv")

In [None]:
# Preview the first rows to check that the file parsed as expected
df.head()

In [None]:
# Look at the last rows as well
df.tail()

In [None]:
# Column dtypes and non-null counts; this is where missing values show up
df.info()

In [None]:
# (number of rows, number of columns) of the training frame
df.shape

In [None]:
# Summary statistics of the training frame
df.describe()

The Age column has 177 missing values. To handle them, let's replace the NaN values with the median age.

In [None]:
# Impute missing ages with the median age.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the df['Age'] selection: the in-place form mutates
# an intermediate object, emits a chained-assignment warning in pandas 2.x and
# leaves df unchanged once pandas Copy-on-Write is active.
df['Age'] = df['Age'].fillna(df['Age'].median())

In [None]:
# Summary statistics again, to see the effect of filling the Age column
df.describe()

In [None]:
# Bar height is the mean of Survived within each Sex (seaborn's default
# estimator is the mean), i.e. the share of survivors per sex
sns.barplot(x="Sex", y="Survived", data=df)

In [None]:
# Count passengers per sex separately for survivors and non-survivors, then
# stack the two count rows into one frame and draw it as a stacked bar chart.
survived_sex = df.loc[df['Survived'] == 1, 'Sex'].value_counts()
dead_sex = df.loc[df['Survived'] == 0, 'Sex'].value_counts()
df_survived = pd.DataFrame([survived_sex, dead_sex], index=['Survived', 'Dead'])
df_survived.plot.bar(stacked=True)

Women were more likely to survive than men.

In [None]:
# Stacked histogram of Age: survivors in green, non-survivors in red
ages_by_outcome = [df.loc[df['Survived'] == outcome, 'Age'] for outcome in (1, 0)]
plt.hist(
    ages_by_outcome,
    bins=30,
    stacked=True,
    color=['g', 'r'],
    label=['Survived', 'Dead'],
)
plt.xlabel('Age')
plt.ylabel('Number of passengers')
plt.legend()

In [None]:
# Stacked histogram of Fare: survivors in green, non-survivors in red
fares_by_outcome = [df.loc[df['Survived'] == outcome, 'Fare'] for outcome in (1, 0)]
plt.hist(
    fares_by_outcome,
    bins=30,
    stacked=True,
    color=['g', 'r'],
    label=['Survived', 'Dead'],
)
plt.xlabel('Fare')
plt.ylabel('Number of passengers')
plt.legend()

Passengers with cheaper tickets were more likely to die than those with expensive tickets.



In [None]:
# Average fare per passenger class.
# 'Fare' is selected before aggregating so the mean is computed for that one
# column only. Averaging the whole grouped frame first also wastes work on
# every other column and raises TypeError in pandas >= 2.0, because the frame
# still holds string columns such as Sex and Embarked at this point.
ax = plt.subplot()
ax.set_ylabel('Average fare')
df.groupby('Pclass')['Fare'].mean().plot(kind='bar', ax=ax)

In [None]:
# Passenger count per sex.
# factorplot was renamed to catplot in seaborn 0.9 and later removed, and the
# plotted variable must now be passed by keyword (x=...).
sns.catplot(x='Sex', kind='count', data=df)

In [None]:
# Passenger count per class, split by sex.
# factorplot was renamed to catplot in seaborn 0.9 and later removed, and the
# plotted variable must now be passed by keyword (x=...).
sns.catplot(x='Pclass', kind='count', hue='Sex', data=df)

Proportion of passengers who survived, by passenger class

In [None]:
# Contingency table: one row per passenger class, one column per Survived value
xt = pd.crosstab(index=df['Pclass'], columns=df['Survived'])
xt

In [None]:
# The title and y-axis promise a *rate*, but xt holds raw passenger counts.
# Divide every row by its row total so each class's bar stacks to 1.0 and the
# segments are the proportions that died / survived within that class.
survival_rate_by_class = xt.div(xt.sum(axis=1), axis=0)
survival_rate_by_class.plot(kind='bar', stacked=True, title='Survival Rate by Passenger Classes')
plt.xlabel('Passenger Class')
plt.ylabel('Survival Rate')

Train a random forest on the training dataset

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing

Create the decision trees

In [None]:
# Seed NumPy's global random generator so the forest trained below is
# repeatable: scikit-learn estimators created without random_state draw their
# randomness from this global generator.
np.random.seed(12)

In [None]:
# Encoder that maps category labels to integer codes; the same instance is
# re-fitted for each column it is applied to below
label_encoder=preprocessing.LabelEncoder()

In [None]:
# Replace the Sex and Embarked strings with integer codes.
# Values are cast to str first, so a missing value is encoded as the
# category 'nan' instead of raising.
for column in ('Sex', 'Embarked'):
    df[column] = label_encoder.fit_transform(df[column].astype('str'))

In [None]:
# Columns used as model inputs
features = ['Sex', 'Pclass', 'Embarked', 'Age', 'Fare']

# Random forest with 1000 trees, 2 candidate features per split, and
# out-of-bag scoring enabled so accuracy can be estimated without a hold-out set
rf_model = RandomForestClassifier(
    n_estimators=1000,
    max_features=2,
    oob_score=True,
)

In [None]:
# Fit the forest on the selected feature columns.
# The target is passed as strings, so the labels the model predicts later are
# strings as well.
train_X = df[features]
train_y = df['Survived'].astype('str')
rf_model.fit(X=train_X, y=train_y)

# Out-of-bag accuracy estimate (heading and value on separate lines)
print("OOB accuracy: ", rf_model.oob_score_, sep="\n")

Features with higher importance were more influential in creating the model, indicating a stronger association with the response variable.


Feature importance for our random forest model:

In [None]:
# Print each feature name next to its importance score, one pair per line
for name, score in zip(features, rf_model.feature_importances_):
    print(name, score)

Use the random forest model to make predictions on the test dataset

Read test dataset

In [None]:
# Load the test data; the path is relative to the notebook's working directory
test=pd.read_csv("../input/test.csv")

In [None]:
# Summary statistics of the test frame
test.describe()

Impute missing (NA) Age values with the median age

In [None]:
# Impute missing ages in the test set with the test set's median age.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the test['Age'] selection: the in-place form mutates
# an intermediate object, emits a chained-assignment warning in pandas 2.x and
# leaves test unchanged once pandas Copy-on-Write is active.
test['Age'] = test['Age'].fillna(test['Age'].median())

In [None]:
# Summary statistics again, to see the effect of filling the Age column
test.describe()

In [None]:
# Convert sex and embarked variables of test dataset to numeric.
# Each encoder is fitted on the *training* file's column and only applied
# (transform) to the test column, so every category receives exactly the
# integer code the model saw during training. Re-fitting on the test data
# would derive the codes from whichever categories happen to occur there, and
# they could silently differ from the training codes.
train_categories = pd.read_csv("../input/train.csv", usecols=['Sex', 'Embarked'])
for column in ['Sex', 'Embarked']:
    # Cast to str in both places so a missing value is the category 'nan',
    # matching how the training columns were encoded.
    column_encoder = preprocessing.LabelEncoder().fit(train_categories[column].astype('str'))
    # transform() raises ValueError for a category that never occurred in training
    test[column] = column_encoder.transform(test[column].astype('str'))

In [None]:
# Preview the first rows of the test frame after the conversions above
test.head()

In [None]:
# Fill any remaining gaps in numeric columns with that column's mean, so the
# feature columns handed to the model contain no NaN.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises TypeError if the frame still holds non-numeric columns
# (presumably free-text fields such as names -- not visible in this notebook).
test = test.fillna(test.mean(numeric_only=True))

In [None]:
# Predict survival for the test passengers from the same feature columns
# that were used for training
test_features = test[features]
test_preds = rf_model.predict(test_features)

In [None]:
# Build the submission table (passenger id + predicted label) and write it
# to CSV without the index column
submission = pd.DataFrame(
    {
        "PassengerId": test["PassengerId"],
        "Survived": test_preds,
    }
)
submission.to_csv('titanic1.csv', index=False)