#                       Detailed and In-depth analysis of Titanic [0.85074]

In [None]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Handle table-like data and matrices
import numpy as np
import pandas as pd
import math 
# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Modelling Helpers
from sklearn.preprocessing import Imputer , Normalizer , scale
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Configure visualisations
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'ggplot' style first, then seaborn's 'white' axes style after it.
mpl.style.use( 'ggplot' )
sns.set_style( 'white' )
# Default figure size (width, height) for the plots that follow.
pylab.rcParams[ 'figure.figsize' ] = 8 , 6

In [None]:
# Load the Titanic training and test CSV files.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Summary statistics of the training columns.
train.describe()

In [None]:
# Work on a copy so that changes made to `df` leave `train` untouched.
df = train.copy()

In [None]:
df.head()

In [None]:
# train.describe() shows that only 38.38 % of the passengers in the training set survived; the rest died.

In [None]:
df.loc[(df.Survived == 1) & (df.Sex == "male") , :].count()

In [None]:
# there were 109 males across the ship who survived that accident

In [None]:
df.loc[(df.Survived == 1) & (df.Sex == "female") , :].count()

In [None]:
# there were 233 females across the ship who survived that accident
# look the following graph

In [None]:
sns.factorplot(x="Sex",col="Survived", data=df , kind="count",size=4, aspect=.7);

In [None]:
# this gives us the idea that males died more and females survived more

In [None]:
# similarly

In [None]:
sns.factorplot(x="Sex", hue = "Pclass" , col="Survived", data=df , kind="count",size=6, aspect=.7);

#### Overall, the males and females of Pclass 3 died more than those of the other classes.
#### In the Survived = 0 panel, the Pclass 3 males show a remarkable jump in deaths that shoots the graph up; the same goes for the females.
#### Both sexes show an increasing trend in deaths as the class shifts down (1 -> 2 -> 3).

#### In the Survived = 1 panel, the females show a nearly decreasing trend from Pclass 1 downwards, as expected, except that fewer Pclass 2 females survived than Pclass 3 females.

#### The males, on the contrary, show a dip in the middle, i.e.
#### among the males who survived, by count: Pclass 3 > Pclass 1 > Pclass 2


#### i.e. more Pclass 3 males survived than Pclass 1 males or Pclass 2 males.
#### This is evident from the following inspection,
#### although the count of surviving Pclass 3 males is only slightly greater than that of surviving Pclass 1 males.

In [None]:
df.loc[(df.Survived == 1) & (df.Sex == "male") & (df.Pclass == 1)].count()

In [None]:
df.loc[(df.Survived == 1) & (df.Sex == "male") & (df.Pclass == 2) , :].count()

In [None]:
df.loc[(df.Survived == 1) & (df.Sex == "male") & (df.Pclass == 3) , :].count()

In [None]:
pd.crosstab(df.Pclass, df.Survived, margins=True).style.background_gradient(cmap='autumn_r')

In [None]:
# All in all including both the sexes 2nd class survived less than the other two clases

In [None]:
df.Survived[df.Pclass == 1].sum()/df[df.Pclass == 1].Survived.count()

In [None]:
df.Survived[df.Pclass == 2].sum()/df[df.Pclass == 2].Survived.count()

In [None]:
df.Survived[df.Pclass == 3].sum()/df[df.Pclass == 3].Survived.count()

In [None]:
# % survived in Pclass = 1  --> 62.96 %  , similarly calculated for others

In [None]:
sns.factorplot(x='Pclass',y='Survived', kind="point" ,data=df)

In [None]:
sns.factorplot('Pclass','Survived',kind="bar",hue='Sex',data=df)

In [None]:
# A cross-tabulation to further inspect

In [None]:
pd.crosstab([df.Sex, df.Survived], df.Pclass, margins=True).style.background_gradient(cmap='autumn_r')

In [None]:
# Almost all women in Pclass 1 and 2 survived and nearly all men in Pclass 2 and 3 died

In [None]:
# lets see how survivals varies with Embarked

In [None]:
# Passenger counts per survival outcome, one panel per embarkation port, coloured by class.
sns.factorplot(
    data=df,
    kind="count",
    x="Survived",
    hue="Pclass",
    col="Embarked",
    size=5,
    aspect=0.7,
);

In [None]:
# This shows that, in absolute counts, more of the passengers who embarked at S survived than of those who embarked at C, followed by Q.
# Most of the people who died had also embarked at S.

In [None]:
# Also, the survivors who embarked at Q were mostly Pclass 3 females.

In [None]:
# A more closer look with cross-tab

In [None]:
# Survival outcome against every (sex, class, port) combination, with totals.
pd.crosstab(
    [df["Survived"]],
    [df["Sex"], df["Pclass"], df["Embarked"]],
    margins=True,
).style.background_gradient(cmap='autumn_r')

In [None]:
# can also be viewed like this

In [None]:
# Violin plot of embarkation port against survival outcome, split by sex.
# The axes returned by plt.subplots are used explicitly instead of relying on
# pyplot's implicit "current axes", and the title typo ("wih") is fixed.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('Embarked vs Survived with Sex')
sns.violinplot(x="Survived", y="Embarked", hue="Sex", data=df, ax=ax)
plt.show()

In [None]:
# similarly with Pclass

sns.factorplot(x = "Survived", y = "Pclass",col = "Embarked" , hue = "Sex" , kind = "violin",data = df)


In [None]:
sns.factorplot(x="Sex", y="Survived",col="Embarked",data=df ,hue="Pclass",kind="bar",size=5, aspect=.7);

In [None]:
# Inferences from above graph

# The Survived axis shows the survival rate (a fraction between 0 and 1),
# which means that all males in Pclass 1 and 2 who embarked at Q died,

# while all females in Pclass 1 and 2 who embarked at Q lived.
# Also, nearly all Pclass 1 and 2 females lived, whatever their port of embarkation.

In [None]:
context1 = {"female":0 , "male":1}
context2 = {"S":0 , "C":1 , "Q":2}
df['Sex_bool']=df.Sex.map(context1)
df["Embarked_bool"] = df.Embarked.map(context2)

In [None]:
df.head()

In [None]:
# Pairwise correlations between the numeric features and the target.
correlation_map = df[['PassengerId', 'Survived', 'Pclass', 'Sex_bool', 'Age', 'SibSp',
                      'Parch', 'Fare', 'Embarked_bool']].corr()
# Hide the redundant upper triangle (above the diagonal) with an explicit boolean
# mask. The previous mask reused the correlation values themselves, so a cell
# whose correlation happened to be exactly 0 would not have been masked.
upper_triangle = np.triu(np.ones(correlation_map.shape, dtype=bool), k=1)
fig, ax = plt.subplots()
fig.set_size_inches(12, 12)
sns.heatmap(correlation_map, mask=upper_triangle, vmax=.7, square=True, annot=True, ax=ax)

### The above heatmap shows the overall picture very clearly.

###  PassengerId is a redundant column: it is barely correlated with any other attribute, so we can remove it.

###  Also, Survived is inversely related to Pclass; we showed earlier that as the Pclass value increases, survival decreases.

###  Pclass and Age are also inversely related. The following cell confirms that as the Pclass value decreases, the mean Age increases, which means that many of the older travellers were travelling in the higher classes.
              
###  Pclass and Fare are also strongly inversely related, as the fare for Pclass 1 would obviously be higher than for Pclass 2 and 3.
###  Also, younger people and children travel with siblings and parents more often than older people do (an inverse relation between Age and SibSp/Parch), which is fairly obvious.
###  Parch and SibSp are also strongly positively related.
###  Sex_bool and Survived are strongly inversely related, i.e. females (Sex_bool = 0) are more likely to survive than males.

In [None]:
df.groupby("Pclass").Age.mean()


In [None]:
df.isnull().sum()

In [None]:
df.head()

In [None]:
# Add an 'Age_bin' column to each frame: bin i (1..8) holds ages up to i * 10,
# with every age taking the smallest bin that fits. Missing ages and ages
# above 80 stay NaN.
for frame in (train, test, df):
    ages = frame['Age']
    decade_bin = pd.Series(np.nan, index=frame.index)
    # Walk from the widest bound down so narrower bounds overwrite wider ones.
    for decade in range(8, 0, -1):
        decade_bin[ages <= decade * 10] = decade
    frame['Age_bin'] = decade_bin

In [None]:
df[["Age" , "Age_bin"]].head(10)

In [None]:
sns.factorplot('Age_bin','Survived', col='Pclass' , row = 'Sex',kind="bar", data=df)

In [None]:
sns.factorplot('Age_bin','Survived', col='Pclass' , row = 'Sex', kind="violin", data=df)

In [None]:
pd.crosstab([df.Sex, df.Survived], [df.Age_bin, df.Pclass], margins=True).style.background_gradient(cmap='autumn_r')

In [None]:
#  All females in Pclass 3 with Age_bin = 5 died.
#  Males with Age_bin >= 2 (the original note does not say which Pclass) died more often than they survived, i.e. their death rate exceeds 50 %.

In [None]:
sns.factorplot('SibSp', 'Survived', col='Pclass' , row = 'Sex', data=df )

In [None]:
#  Females in Pclass 1 and 2 with siblings upto 3 nearly all survived

In [None]:
#  For Pclass 3 , males and females showed a near decreasing trend as number of siblings increased .

In [None]:
#  For males, no survival rate above 0.5 for any values of SibSp. (less than 50 %)

In [None]:
pd.crosstab([df.Sex, df.Survived], [df.Parch, df.Pclass], margins=True).style.background_gradient(cmap='autumn_r')

In [None]:
#  For males,all survival rates below 0.5 for any values of Parch, except for Parch = 2 and Pclass = 1.

In [None]:
sns.factorplot('Parch', 'Survived', col='Pclass' , row = 'Sex', kind="bar", data=df )

In [None]:
# the distribution of Age_bin , SibSp and Parch as follows

In [None]:
# Add a 'Fare_bin' column to each frame: bin i (1..12) holds fares up to i * 50,
# with every fare taking the smallest bin that fits. Fares above 600 stay NaN.
for frame in (train, test, df):
    # Bin every frame on its OWN fares. The mask was previously built from
    # df['Fare'] for all three frames, so the rows of `test` were binned using
    # the fares of the training passengers that shared their index labels.
    # A missing fare is replaced by the frame's median for binning only (the
    # 'Fare' column itself is left untouched), so 'Fare_bin' stays free of NaN
    # and can be passed to the classifiers.
    own_fares = frame['Fare'].fillna(frame['Fare'].median())
    frame['Fare_bin'] = np.nan
    # Walk from the widest bound down so narrower bounds overwrite wider ones.
    for upper in range(12, 0, -1):
        frame.loc[own_fares <= upper * 50, 'Fare_bin'] = upper

In [None]:
# Kernel density estimates of SibSp, Parch, Age_bin and Fare, stacked in four rows.
fig, axes = plt.subplots(4, 1)
fig.set_size_inches(20, 18)
for column, axis in zip(("SibSp", "Parch", "Age_bin", "Fare"), axes):
    sns.kdeplot(df[column], shade=True, color="red", ax=axis)
plt.show()

###  Most people were travelling with no siblings (SibSp = 0).
###  More people were travelling with only 1 parent (Parch = 1) than with 2.
###  Most of the people on the ship were aged between 15 and 50 years.
###  Most people paid no more than 50 as their fare.

In [None]:
# introducing Fare_bin the same way as done in the Age_bin above but with a gap of 50

In [None]:
df[["Fare" , "Fare_bin"]].head(10)

In [None]:
pd.crosstab([df.Sex, df.Survived], [df.Fare_bin, df.Pclass], margins=True).style.background_gradient(cmap='autumn_r')

In [None]:
sns.factorplot('Fare_bin','Survived', col='Pclass' , row = 'Sex', data=df)
plt.show()

In [None]:
df_test = test.copy()

In [None]:
df_test.head()

In [None]:
df.drop(['PassengerId','Sex','Embarked','Name','Ticket', 'Cabin', 'Age', 'Fare'],axis=1,inplace=True)
df.head()

In [None]:
# Encode and trim the test frame the same way as the training frame.
context1 = dict(female=0, male=1)
context2 = {port: code for code, port in enumerate(("S", "C", "Q"))}
df_test["Sex_bool"] = df_test["Sex"].map(context1)
df_test["Embarked_bool"] = df_test["Embarked"].map(context2)
raw_test_columns = ['PassengerId', 'Sex', 'Embarked', 'Name', 'Ticket', 'Cabin', 'Age', 'Fare']
df_test.drop(raw_test_columns, axis=1, inplace=True)
df_test.head()

In [None]:
# Missing values left in each column of the training features.
df.isnull().sum()

In [None]:
# Missing values left in each column of the test features.
df_test.isnull().sum()

In [None]:
#  Age_bin in both dataframes is still possessing null values

In [None]:
df_test.Age_bin.fillna(df_test.Age_bin.mean() , inplace=True)

In [None]:
df.Age_bin.fillna(df.Age_bin.mean() , inplace=True)

In [None]:
df.Embarked_bool.fillna(df.Embarked_bool.mean() , inplace=True)

In [None]:
# Hold out 30 % of the labelled rows for evaluation, with a fixed seed.
features = df.drop('Survived', axis=1)
target = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Candidate classifiers, all constructed with their default arguments.
x = [LinearSVC() , DecisionTreeClassifier() , LogisticRegression() , KNeighborsClassifier() , GaussianNB() ,
    RandomForestClassifier() , GradientBoostingClassifier()]

# Display names, in the same order as the estimators in `x`.
X = ["LinearSVC" , "DecisionTreeClassifier" , "LogisticRegression" , "KNeighborsClassifier" , "GaussianNB" ,
    "RandomForestClassifier" , "GradientBoostingClassifier"]

# Hold-out accuracy of each classifier, in the same order as `x` / `X`.
MLA = []
for estimator in x:
    estimator.fit(X_train, y_train)
    # accuracy_score is documented as (y_true, y_pred); pass them in that order.
    MLA.append(accuracy_score(y_test, estimator.predict(X_test)))

In [None]:
MLA

In [None]:
sns.kdeplot(MLA , shade=True, color="red")

In [None]:
#  This shows that most of the algorithms reach an accuracy between 77 % and 80 %, with some above 80 %.
#  That is a reasonably good result.

In [None]:
d = { "Accuracy" : MLA , "Algorithm" : X }
dfm = pd.DataFrame(d)

In [None]:
# making a dataframe of the list of accuracies calculated above

In [None]:
dfm   # a dataframe wilh all accuracies and their corresponding algorithm name

In [None]:
sns.barplot(x="Accuracy", y="Algorithm", data=dfm)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('Survived',axis=1), df['Survived'], test_size=0.3, random_state=66)
model = KNeighborsClassifier(n_neighbors=6)
model.fit( X_train , y_train )

In [None]:
pred = model.predict(X_test)

In [None]:
# Predict on the test features in the SAME column order the model was fitted on.
# df_test holds Age_bin / Fare_bin before Sex_bool / Embarked_bool, whereas the
# training features hold them after; passed as-is, the values would be read
# under the wrong feature positions.
answer = model.predict(df_test[X_train.columns])

In [None]:
# Accuracy of the fitted model on the held-out rows.
holdout_accuracy = accuracy_score(pred, y_test)
print(holdout_accuracy)

# So the accuracy on this hold-out split turns out to be 85.074 % with n_neighbors = 6;
# let's check other values of n_neighbors.

In [None]:
#  lets check it till 30 neighbours that which has got the maximum accuracy score

# Hold-out accuracy for every neighbour count from 1 to 30.
KNNaccu = []
Neighbours = []

for neighbour in range(1, 31):
    # Use loop-local names so the sweep does not overwrite `model` / `pred`,
    # which hold the fitted 6-neighbour classifier and its hold-out predictions.
    candidate = KNeighborsClassifier(n_neighbors=neighbour)
    candidate.fit(X_train, y_train)
    # accuracy_score is documented as (y_true, y_pred); pass them in that order.
    KNNaccu.append(accuracy_score(y_test, candidate.predict(X_test)))
    Neighbours.append(neighbour)

In [None]:
d = { "Neighbours" : Neighbours , "Accuracy" : KNNaccu }
knndf = pd.DataFrame(d)

In [None]:
knndf.head()

In [None]:
sns.factorplot(x="Neighbours", y="Accuracy",size = 5 , aspect = 2 , data=knndf)

###  This shows that the accuracy on the hold-out split is highest for Neighbours = 6.

In [None]:
#  making a csv file of the predictions

In [None]:
# Pair each test PassengerId with its predicted label and write the CSV without the index.
final = pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": answer})
final.to_csv('titanic_again.csv', index=False)

# Please upvote if you like it...