In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
# Directory that holds the Titanic CSV files read below (train.csv / test.csv).
# Set the TITANIC_DATA_DIR environment variable to run the notebook on another
# machine; the original hard-coded location is kept as the fallback so existing
# runs behave exactly as before.
DIR = os.environ.get('TITANIC_DATA_DIR', r'C:\Users\Nikhil Gupta\Downloads')

In [3]:
# Load the training split; row 0 of the file supplies the column names.
train_path = '/'.join((DIR, 'train.csv'))
df = pd.read_csv(train_path, header=0)

In [4]:
# Preview the first rows of the raw training data.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Summarise dtypes and non-null counts per column to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


* We see that data entries are missing in the columns "Age", "Cabin" and "Embarked". We need to deal with this.
* Also, there are columns which don't have numeric data. We need to modify them as well.
* We delete the columns "Name", "Ticket" and "Cabin" as they are not used in our analysis.

In [6]:
# Columns left out of the model: two text fields plus Cabin, which is mostly empty
# (204 of 891 rows populated according to df.info()).
cols = ['Name', 'Ticket', 'Cabin']
df = df.drop(columns=cols)

In [7]:
# Check that the dropped columns no longer appear.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


We use interpolate() function to fill the missing values in "Age"

In [8]:
# Fill the gaps in Age by linear interpolation between neighbouring rows.
# NOTE(review): row order carries no obvious meaning for age, so this is a crude
# imputation — confirm it is acceptable.
df = df.assign(Age=df['Age'].interpolate(method='linear'))

In [9]:
# Re-check non-null counts after imputing Age.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 62.7+ KB


* We now delete all the rows that still have missing values.

In [10]:
# Drop every row that still contains a missing value in any column.
# At this point only Embarked has nulls (889 of 891 populated), so two rows go.
df = df.dropna(axis=0, how='any')

In [11]:
# Verify that no column has missing values any more.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    889 non-null int64
Survived       889 non-null int64
Pclass         889 non-null int64
Sex            889 non-null object
Age            889 non-null float64
SibSp          889 non-null int64
Parch          889 non-null int64
Fare           889 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 69.5+ KB


* We see that now we don't have any NaN values. Let's deal with the data types issue now.

In [12]:
# One-hot encode each categorical column; one indicator frame per source column.
cols = ['Sex', 'Embarked']
dummies = [pd.get_dummies(df[name]) for name in cols]

In [13]:
# Place the indicator frames side by side in a single frame.
titanic_dummies = pd.concat(dummies, axis='columns')

We now concatenate the newly obtained columns to our dataframe

In [14]:
# Append the indicator columns to the right of the existing columns.
df = pd.concat([df, titanic_dummies], axis='columns')

We now drop the original columns that are no longer required

In [15]:
# The text columns are now redundant: their information lives in the indicators.
df = df.drop(columns=['Sex', 'Embarked'])

In [16]:
# Confirm that every remaining column is numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    889 non-null int64
Survived       889 non-null int64
Pclass         889 non-null int64
Age            889 non-null float64
SibSp          889 non-null int64
Parch          889 non-null int64
Fare           889 non-null float64
female         889 non-null uint8
male           889 non-null uint8
C              889 non-null uint8
Q              889 non-null uint8
S              889 non-null uint8
dtypes: float64(2), int64(5), uint8(5)
memory usage: 59.9 KB


## Now that we have processed the data, we begin analysing it with ML techniques 

Now we convert our dataframe from pandas to numpy and also assign input and output

In [17]:
# X: the whole frame as a NumPy array (it still contains the 'Survived' column
# at this point); y: the target labels.
X, y = df.values, df['Survived'].values

We now delete the survived values in X

In [18]:
# Remove the target column from the feature matrix. The column position is looked
# up by name instead of hard-coding index 1, so the cell keeps working if the
# column order of `df` ever changes.
# NOTE(review): PassengerId remains in X as a feature column — confirm intended.
X = np.delete(X, df.columns.get_loc('Survived'), axis=1)

### We now use Decision Trees

In [19]:
from sklearn.tree import DecisionTreeClassifier

In [20]:
# Entropy-based decision tree limited to depth 4; all other hyper-parameters are
# left at their defaults.
Tree = DecisionTreeClassifier(max_depth=4, criterion="entropy")
Tree  # echo the estimator so its full parameter set is displayed

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [21]:
# Fit the tree on the training features X and labels y; the cell output is the
# repr of the fitted estimator.
Tree.fit(X,y)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

We now see how our model works on the Test dataset

In [22]:
# Load the test split; row 0 of the file supplies the column names.
test_path = '/'.join((DIR, 'test.csv'))
df1 = pd.read_csv(test_path, header=0)

We preprocess the Test dataset as we did for the Training dataset

In [23]:
# Same columns are removed as for the training frame.
cols1 = ['Name', 'Ticket', 'Cabin']
df1 = df1.drop(columns=cols1)

In [24]:
# Fill the gaps in Age by linear interpolation, mirroring the training step.
df1 = df1.assign(Age=df1['Age'].interpolate(method='linear'))

In [25]:
# Fare is interpolated here as well; the training frame needed no such step
# because its Fare column had no missing values.
df1 = df1.assign(Fare=df1['Fare'].interpolate(method='linear'))

In [26]:
# Drop every row that still contains a missing value in any column.
# NOTE(review): any row removed here would be absent from the submission files
# built from X1 — confirm that nothing is actually dropped for the test split.
df1 = df1.dropna(axis=0, how='any')

In [27]:
# One-hot encode the categorical columns of the test frame.
# NOTE(review): assumes the test split contains the same categories as the
# training split, otherwise the feature columns will not line up — verify.
cols1 = ['Sex', 'Embarked']
dummies1 = [pd.get_dummies(df1[name]) for name in cols1]

In [28]:
# Place the test-split indicator frames side by side.
titanic_dummies1 = pd.concat(dummies1, axis='columns')

In [29]:
# Append the indicator columns to the right of the test frame.
df1 = pd.concat([df1, titanic_dummies1], axis='columns')

In [30]:
# Remove the text columns now that their indicator columns exist.
df1 = df1.drop(columns=['Sex', 'Embarked'])

In [31]:
# Feature matrix for the test split; column 0 of X1 is later written out as the
# passenger id in the submission files.
X1 = df1.values

We now use our model to predict survival for the passengers in the test dataset

In [32]:
# Predicted labels for the test split from the fitted decision tree.
y1 = Tree.predict(X1)

Let's see which variables play a significant role in the survival.

In [33]:
# One importance value per column of X, i.e. the columns of df in order with
# 'Survived' removed.
Tree.feature_importances_

array([0.03493587, 0.16785241, 0.10808475, 0.02615911, 0.02098895,
       0.12095311, 0.        , 0.51056346, 0.        , 0.        ,
       0.01046234])

We observe that gender plays the most important role as its importance is roughly 0.51. This makes sense as females were preferred during rescue. The next most important role is Pclass. This is also apparent as rich people might have been preferred during rescue.

We now create a csv file to test our model on Kaggle

In [34]:
# Build the submission table: passenger ids (column 0 of X1) next to the
# decision-tree predictions, both cast to int.
# The id header is spelled 'PassengerId', exactly as in the input data.
output = np.column_stack((X1[:, 0], y1))
df_results = pd.DataFrame(output.astype('int'), columns=['PassengerId', 'Survived'])
df_results.to_csv('titanic_results.csv', index=False)

On submitting this to Kaggle, we get an accuracy of 0.7703.

### Let's try Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
# Logistic regression with the liblinear solver and C=0.01, trained on the same
# feature matrix as the tree. fit() returns the estimator itself, so fitting on
# a separate line leaves LR bound to the same fitted object.
LR = LogisticRegression(solver='liblinear', C=0.01)
LR.fit(X, y)
LR

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [44]:
# Predicted labels for the test split from the logistic-regression model.
yhat = LR.predict(X1)

In [37]:
# Submission table for the logistic-regression predictions: passenger ids
# (column 0 of X1) next to the predicted labels, both cast to int.
# The id header is spelled 'PassengerId', exactly as in the input data.
output2 = np.column_stack((X1[:, 0], yhat))
df_results2 = pd.DataFrame(output2.astype('int'), columns=['PassengerId', 'Survived'])
df_results2.to_csv('titanic_results2.csv', index=False)

On uploading this to Kaggle, we get an accuracy of only 0.74162!

Now, let's see if we can increase the accuracy using Random forests.

In [42]:
from sklearn import ensemble
# Random forest with 100 trees. random_state is fixed so that re-running the
# notebook gives the same forest and the same predictions; without it each run
# can produce a different submission.
Tree1 = ensemble.RandomForestClassifier(n_estimators=100, random_state=42)
Tree1.fit(X, y)
yhat2 = Tree1.predict(X1)

In [43]:
# Submission table for the random-forest predictions: passenger ids
# (column 0 of X1) next to the predicted labels, both cast to int.
# The id header is spelled 'PassengerId', exactly as in the input data.
output3 = np.column_stack((X1[:, 0], yhat2))
df_results3 = pd.DataFrame(output3.astype('int'), columns=['PassengerId', 'Survived'])
df_results3.to_csv('titanic_results3.csv', index=False)

We obtain an accuracy of 0.76555 which is still not an improvement.

Let's now see if Gradient boosting algorithm can be of any help..

In [46]:
# Gradient boosting with default hyper-parameters. random_state is fixed so that
# re-running the notebook reproduces the same model and predictions.
Tree2 = ensemble.GradientBoostingClassifier(random_state=42)
Tree2.fit(X, y)
yhat3 = Tree2.predict(X1)

In [47]:
# Submission table for the default gradient-boosting predictions: passenger ids
# (column 0 of X1) next to the predicted labels, both cast to int.
# The id header is spelled 'PassengerId', exactly as in the input data.
output4 = np.column_stack((X1[:, 0], yhat3))
df_results4 = pd.DataFrame(output4.astype('int'), columns=['PassengerId', 'Survived'])
df_results4.to_csv('titanic_results4.csv', index=False)

Sadly, this does not help either, as we get a score of 0.77033.

In [48]:
# Gradient boosting with 50 boosting stages instead of the default number.
# random_state is fixed so that re-running the notebook reproduces the same
# model and predictions.
Tree3 = ensemble.GradientBoostingClassifier(n_estimators=50, random_state=42)
Tree3.fit(X, y)
yhat4 = Tree3.predict(X1)

In [49]:
# Submission table for the 50-stage gradient-boosting predictions: passenger ids
# (column 0 of X1) next to the predicted labels, both cast to int.
# The id header is spelled 'PassengerId', exactly as in the input data.
output5 = np.column_stack((X1[:, 0], yhat4))
df_results5 = pd.DataFrame(output5.astype('int'), columns=['PassengerId', 'Survived'])
df_results5.to_csv('titanic_results5.csv', index=False)

### Finally, we have made some improvement with a score of 0.77511!