In [None]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Use seaborn's 'whitegrid' theme and render matplotlib figures inline in the notebook.
sns.set_style('whitegrid')
%matplotlib inline

In [None]:
# Read the training and test CSV files from the shared input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [None]:
# Preview the first rows of the raw training data.
train_df.head()

In [None]:
# Preview the first rows of the raw test data.
test_df.head()

# Data Visualization

In [None]:
# Count of passengers per Survived value, split by Sex.
sns.countplot(data=train_df, x='Survived', hue='Sex')

In [None]:
# Number of passengers (counts, not percentages) per Pclass, split by Survived.
# sns.factorplot was renamed to sns.catplot and no longer exists in current
# seaborn releases, so the original call fails there with AttributeError.
sns.catplot(x='Pclass',data=train_df,kind='count',hue='Survived')

In [None]:
# Same survival counts broken down by Sex, with one panel per Pclass.
# sns.catplot replaces the renamed (and since removed) sns.factorplot.
sns.catplot(x='Survived',data=train_df,hue='Sex',kind='count',col='Pclass')

In [None]:
# Histogram (30 bins, no KDE curve) of the known passenger ages; missing ages are dropped first.
known_ages = train_df['Age'].dropna()
sns.distplot(known_ages, kde=False, bins=30)

In [None]:
# Mean of Survived per Pclass, one line per Sex.
# sns.factorplot was renamed to sns.catplot. factorplot drew a point plot by
# default while catplot defaults to a strip plot, so kind='point' is passed
# explicitly to keep the same figure.
sns.catplot(x='Pclass',y='Survived',data=train_df,hue='Sex',kind='point')

# Data Cleaning

In [None]:
# Column dtypes and non-null counts for both datasets, separated by a rule.
train_df.info()
print(40 * '_')
test_df.info()

In [None]:
#Dropping Cabin column from both datasets
# Reassigning with errors='ignore' keeps this cell safe to re-run: the previous
# in-place drop raised KeyError once 'Cabin' had already been removed.
train_df=train_df.drop(['Cabin'],axis=1,errors='ignore')
test_df=test_df.drop(['Cabin'],axis=1,errors='ignore')

In [None]:
# Fill missing Embarked values in the training data with 'S'.
# NOTE(review): 'S' is presumably the most frequent value — confirm.
# Only train_df is filled here; test_df['Embarked'] is left as loaded.
train_df['Embarked']=train_df['Embarked'].fillna('S')

In [None]:
# Fill missing Fare values in the test data with the mean Fare of the test data.
test_df['Fare']=test_df['Fare'].fillna(test_df['Fare'].mean())

In [None]:
# Preview the training data after dropping Cabin and filling Embarked.
train_df.head()

In [None]:
# Age distribution per Pclass in the training data.
plt.figure(figsize=(10, 6))
sns.boxplot(data=train_df, x='Pclass', y='Age')

In [None]:
def age_mean(x, class_ages=None, default_age=24):
    """Return the age of a passenger, substituting a per-class value when missing.

    Parameters
    ----------
    x : sequence of two items ``(Age, Pclass)``
        Typically one row of ``df[['Age', 'Pclass']]`` passed by
        ``DataFrame.apply(..., axis=1)``.
    class_ages : dict, optional
        Maps a Pclass value to the age used when Age is missing.
        Defaults to ``{1: 37, 2: 28}``.
        NOTE(review): the defaults were presumably read off the training
        Age-by-Pclass boxplot — confirm.
    default_age : number, optional
        Age used when Age is missing and Pclass is not a key of
        ``class_ages`` (default 24).

    Returns
    -------
    The original Age when it is not null, otherwise the substitute age.
    """
    Age, Pclass = x
    if not pd.isnull(Age):
        return Age
    if class_ages is None:
        class_ages = {1: 37, 2: 28}
    # dict lookup matches 1 and 1.0 alike, so float-typed rows still resolve.
    return class_ages.get(Pclass, default_age)

In [None]:
# Replace missing ages in the training data row by row using age_mean (per-Pclass fill values).
train_df['Age']=train_df[['Age','Pclass']].apply(age_mean,axis=1)

In [None]:
# Age distribution per Pclass in the test data.
plt.figure(figsize=(10, 6))
sns.boxplot(data=test_df, x='Pclass', y='Age')

In [None]:
def age_mean_test(x, class_ages=None, default_age=25):
    """Return the age of a test-set passenger, substituting a per-class value when missing.

    Parameters
    ----------
    x : sequence of two items ``(Age, Pclass)``
        Typically one row of ``df[['Age', 'Pclass']]`` passed by
        ``DataFrame.apply(..., axis=1)``.
    class_ages : dict, optional
        Maps a Pclass value to the age used when Age is missing.
        Defaults to ``{1: 43, 2: 26}``.
        NOTE(review): the defaults were presumably read off the test
        Age-by-Pclass boxplot — confirm.
    default_age : number, optional
        Age used when Age is missing and Pclass is not a key of
        ``class_ages`` (default 25).

    Returns
    -------
    The original Age when it is not null, otherwise the substitute age.
    """
    Age, Pclass = x
    if not pd.isnull(Age):
        return Age
    if class_ages is None:
        class_ages = {1: 43, 2: 26}
    # dict lookup matches 1 and 1.0 alike, so float-typed rows still resolve.
    return class_ages.get(Pclass, default_age)

In [None]:
# Replace missing ages in the test data row by row using age_mean_test (per-Pclass fill values).
test_df['Age']=test_df[['Age','Pclass']].apply(age_mean_test,axis=1)

In [None]:
# Heatmap of the boolean null mask of the training data: a visual check for remaining missing values.
plt.figure(figsize=(10,6))
sns.heatmap(train_df.isnull())

In [None]:
# Heatmap of the boolean null mask of the test data: a visual check for remaining missing values.
plt.figure(figsize=(10,6))
sns.heatmap(test_df.isnull())

In [None]:
def m_f(x):
    """Encode a sex label as an integer: 1 for 'male', 0 for any other value."""
    return 1 if x == 'male' else 0

In [None]:
# Encode Sex as 1 ('male') / 0 (anything else) in both datasets.
# The dtype guard makes the cell safe to re-run: applying m_f to the already
# encoded integers would silently turn every value into 0.
for frame in (train_df, test_df):
    if not pd.api.types.is_numeric_dtype(frame['Sex']):
        frame['Sex']=frame['Sex'].apply(m_f)

In [None]:
# Preview the training data after encoding Sex.
train_df.head()

In [None]:
def name(x):
    """Map a raw title token to a category label.

    'Mr.', 'Miss.' and 'Mrs.' lose their trailing period; every other
    value is mapped to 'other'.
    """
    known_titles = (('Mr.', 'Mr'), ('Miss.', 'Miss'), ('Mrs.', 'Mrs'))
    for raw_title, label in known_titles:
        if x == raw_title:
            return label
    return 'other'

In [None]:
# Extract the title token (e.g. 'Mr.') from each training name.
# Assumes names look like "Surname, Title. Given names" — TODO confirm.
# Anchoring on the comma fixes the previous split(' ')[1], which returned a
# surname fragment for multi-word surnames and raised IndexError on a name
# with no space. Names that do not match become NaN, which name() maps to 'other'.
train_df['Name']=train_df['Name'].str.extract(r',\s*([^.]+\.)',expand=False)

In [None]:
# Collapse the extracted title tokens into the categories Mr / Miss / Mrs / other.
train_df['Name']=train_df['Name'].apply(name)

In [None]:
# Extract the title token (e.g. 'Mr.') from each test name.
# Assumes names look like "Surname, Title. Given names" — TODO confirm.
# Anchoring on the comma fixes the previous split(' ')[1], which returned a
# surname fragment for multi-word surnames and raised IndexError on a name
# with no space. Names that do not match become NaN, which name() maps to 'other'.
test_df['Name']=test_df['Name'].str.extract(r',\s*([^.]+\.)',expand=False)

In [None]:
# Collapse the extracted title tokens into the categories Mr / Miss / Mrs / other.
test_df['Name']=test_df['Name'].apply(name)

In [None]:
# Re-check dtypes and non-null counts of both datasets after cleaning.
train_df.info()
print(40 * '_')
test_df.info()

In [None]:
# Preview the cleaned training data.
train_df.head()

In [None]:
# Preview the cleaned test data.
test_df.head()

In [None]:
# One-hot encode the title category; drop_first omits the first category as the baseline.
# NOTE(review): train and test are encoded independently, so the columns only
# line up if both datasets contain the same set of categories — confirm.
nametrain=pd.get_dummies(train_df['Name'],drop_first=True)
nametest=pd.get_dummies(test_df['Name'],drop_first=True)

In [None]:
# One-hot encode Embarked; drop_first omits the first category as the baseline.
# NOTE(review): test_df['Embarked'] was never filled above — confirm it has no missing values.
embarkedtrain=pd.get_dummies(train_df['Embarked'],drop_first=True)
embarkedtest=pd.get_dummies(test_df['Embarked'],drop_first=True)

In [None]:
# One-hot encode Pclass; drop_first omits the first category as the baseline.
# The resulting columns are labelled with the class values themselves
# (the feature selection further down refers to them as 2 and 3).
pclasstrain=pd.get_dummies(train_df['Pclass'],drop_first=True)
pclasstest=pd.get_dummies(test_df['Pclass'],drop_first=True)

In [None]:
# Append the dummy columns to each dataset column-wise, producing new frames tr_df / te_df.
tr_df=pd.concat([train_df,nametrain,embarkedtrain,pclasstrain],axis=1)
te_df=pd.concat([test_df,nametest,embarkedtest,pclasstest],axis=1)

In [None]:
# Drop the columns that were replaced by dummies (Name, Embarked, Pclass) plus Ticket.
# Reassigning with errors='ignore' keeps this cell safe to re-run: the previous
# in-place drop raised KeyError once the columns had already been removed.
te_df=te_df.drop(['Name','Embarked','Pclass','Ticket'],axis=1,errors='ignore')
tr_df=tr_df.drop(['Name','Embarked','Pclass','Ticket'],axis=1,errors='ignore')

In [None]:
# Preview the training feature frame after encoding.
tr_df.head()

In [None]:
#Applying Mean Normalization to both datasets

In [None]:
# Mean-normalise Age and Fare in the training features: (value - mean) / (max - min).
for col in ('Age', 'Fare'):
    col_values = tr_df[col]
    tr_df[col] = (col_values - col_values.mean()) / (col_values.max() - col_values.min())
tr_df.head()

In [None]:
# Mean-normalise Age and Fare in the test features using the TRAINING mean and
# range, so both datasets share one scale (previously the test set was scaled
# with its own statistics).
# The unscaled values are read from train_df / test_df, which the scaling of
# tr_df / te_df does not modify; this also keeps the cell safe to re-run.
for col in ('Age','Fare'):
    train_col=train_df[col]
    te_df[col]=(test_df[col]-train_col.mean())/(train_col.max()-train_col.min())
# Show the frame that was just transformed (the original displayed tr_df here).
te_df.head()

In [None]:
# Select the model inputs and the target.
# The Pclass dummy columns are labelled with the integers 2 and 3 while all
# other labels are strings; recent scikit-learn versions reject frames with
# mixed str/int column labels, so every label is cast to str.
x_train=tr_df[['Sex', 'Age', 'SibSp', 'Parch','Fare', 'Mr', 'Mrs', 'other', 'Q', 'S',2,3]].rename(columns=str)
y_train=tr_df['Survived']

In [None]:
# Select the same model inputs from the test features.
# The Pclass dummy columns are labelled with the integers 2 and 3 while all
# other labels are strings; recent scikit-learn versions reject frames with
# mixed str/int column labels, so every label is cast to str.
x_test=te_df[['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Mr','Mrs', 'other', 'Q', 'S',2,3]].rename(columns=str)

In [None]:
#Data looks clean and nice, it's time for the model training.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


# Logistic Regression

In [None]:
# Logistic regression classifier with default hyperparameters.
lr=LogisticRegression()

In [None]:
# Fit the logistic regression on the training features and the Survived target.
lr.fit(x_train,y_train)

In [None]:
# Score on the same data the model was fitted on — an optimistic figure, not a held-out estimate.
lr.score(x_train,y_train)

# SVM

In [None]:
# Support vector classifier with default hyperparameters.
svc=SVC()

In [None]:
# Fit the support vector classifier on the training features and the Survived target.
svc.fit(x_train,y_train)

In [None]:
# Score on the same data the model was fitted on — an optimistic figure, not a held-out estimate.
svc.score(x_train,y_train)

# Random Forest

In [None]:
# Random forest with default hyperparameters. random_state is fixed because the
# forest is built with randomness; without a seed the fitted model and its
# score change from run to run.
rnf=RandomForestClassifier(random_state=42)

In [None]:
# Fit the random forest on the training features and the Survived target.
rnf.fit(x_train,y_train)

In [None]:
# Score on the same data the model was fitted on — an optimistic figure, not a held-out estimate.
rnf.score(x_train,y_train)

In [None]:
# (Removed a stray reference to the undefined name `a`, which raised NameError under Restart & Run All.)