## **Mounting google drive and downloading the dataset**

Mounting google drive

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


For uploading kaggle api file

In [3]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"pankazdebnath","key":"428092a1cecb8d5107c995f0f7551b87"}'}

Save the file on directory

In [4]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!ls ~/.kaggle

kaggle.json


Viewing kaggle competitions list

In [5]:
!kaggle competitions list

ref                                                         deadline             category            reward  teamCount  userHasEntered  
----------------------------------------------------------  -------------------  ---------------  ---------  ---------  --------------  
mpetitions/contradictory-my-dear-watson                     2030-07-01 23:59:00  Getting Started     Prizes         42           False  
mpetitions/gan-getting-started                              2030-07-01 23:59:00  Getting Started     Prizes        105           False  
mpetitions/store-sales-time-series-forecasting              2030-06-30 23:59:00  Getting Started  Knowledge       1386           False  
mpetitions/tpu-getting-started                              2030-06-03 23:59:00  Getting Started  Knowledge        180           False  
mpetitions/digit-recognizer                                 2030-01-01 00:00:00  Getting Started  Knowledge       2148           False  
mpetitions/titanic                       

Downloading the data of titanic

In [6]:
!kaggle competitions download -c titanic

Downloading titanic.zip to /content
  0% 0.00/34.1k [00:00<?, ?B/s]
100% 34.1k/34.1k [00:00<00:00, 26.2MB/s]


In [7]:
#Unzip the data directory
from zipfile import ZipFile

with ZipFile('titanic.zip', 'r') as zipObj:
   # Extract all the contents of zip file in current directory
   zipObj.extractall()

# Data Analysis and ML Model Creation

Importing necessary libraries

In [8]:
import pandas as pd                #for analysis 
import numpy as np                 #for numerical operation and creating categorical features dummy
import matplotlib                  #for data visualization
import matplotlib.pyplot as plt    #for data visualization
%matplotlib inline

import seaborn as sns              #for data visualization

matplotlib.rcParams["figure.figsize"] = (20,10)

In [9]:
train_df = pd.read_csv("train.csv")     #importing train data
test_df = pd.read_csv("test.csv")       #importing test data

In [10]:
train_df.head()     #View the train dataset

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
test_df.head()      #View the test dataset

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [12]:
train_df.isnull().sum()     #finding total null values according to column in train dataset

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [13]:
test_df.isnull().sum()      #finding total null values according to column in test dataset

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [14]:
train_df.info()     #Check the info of train datades

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [15]:
test_df.info()      #Check the info of test datades

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [16]:
#fill null values of Cabin column with N
train_df["Cabin"].fillna("N", inplace=True)
test_df["Cabin"].fillna("N", inplace=True)

In [17]:
#defining the Cabin with the first letter of cabin in train dataset
for i in train_df["Cabin"]:
  train_df["Cabin"].replace(to_replace =i, value =i[0], inplace=True)

In [18]:
#defining the Cabin with the first letter of cabin in test dataset
for i in test_df["Cabin"]:
  test_df["Cabin"].replace(to_replace =i, value =i[0], inplace=True)

In [19]:
#replacing "N" with name "Deck" in train dataset
for i in train_df["Cabin"]:
  if i == "N":
    train_df["Cabin"].replace(to_replace ="N", value ="Deck", inplace=True)

In [20]:
#replacing "N" with name "Deck" in test dataset
for i in test_df["Cabin"]:
  if i == "N":
    test_df["Cabin"].replace(to_replace ="N", value ="Deck", inplace=True)

In [21]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Deck,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Deck,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Deck,S


In [22]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Deck,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,Deck,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Deck,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,Deck,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,Deck,S


In [23]:
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin            0
Embarked         2
dtype: int64

In [24]:
test_df.isnull().sum()

PassengerId     0
Pclass          0
Name            0
Sex             0
Age            86
SibSp           0
Parch           0
Ticket          0
Fare            1
Cabin           0
Embarked        0
dtype: int64

In [25]:
#defining function for finding family menber onboard
def family_member(member):
    SibSp,Parch = member
    
    if SibSp + Parch > 0:
        return "Yes"
    else:
        return "No"

In [26]:
#defining function for classifying "child" according to age
def male_female_child(passenger):
    Age,Sex = passenger
    if Age < 16:
        return "child"
    else:
        return Sex

In [27]:
#applying function for finding family member with new column in both dataset
train_df["F_ob"] = train_df[["SibSp","Parch"]].apply(family_member,axis=1)

test_df["F_ob"] = test_df[["SibSp","Parch"]].apply(family_member,axis=1)

#applying function for defining child and gender in new column
train_df["Person"] = train_df[["Age","Sex"]].apply(male_female_child,axis=1)

test_df["Person"] = test_df[["Age","Sex"]].apply(male_female_child,axis=1)

In [28]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,F_ob,Person
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Deck,S,Yes,male
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C,Yes,female
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Deck,S,No,female
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S,Yes,female
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Deck,S,No,male


In [29]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,F_ob,Person
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Deck,Q,No,male
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,Deck,S,Yes,female
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Deck,Q,No,male
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,Deck,S,No,male
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,Deck,S,Yes,female


In [30]:
#filling null entries of "Embarked" with mode of column
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])
test_df['Embarked'] = test_df['Embarked'].fillna(test_df['Embarked'].mode()[0])

#filling null entries of "Fare" with mean value of column
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())

In [31]:
#droping columns "Name", "PassengerId", "Ticket","Age"
train_df2 = train_df.drop(["Name", "PassengerId", "Ticket","Age"], axis=1)
test_df2 = test_df.drop(["Name", "PassengerId", "Ticket","Age"], axis=1)

In [32]:
train_df2.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Cabin,Embarked,F_ob,Person
0,0,3,male,1,0,7.25,Deck,S,Yes,male
1,1,1,female,1,0,71.2833,C,C,Yes,female
2,1,3,female,0,0,7.925,Deck,S,No,female
3,1,1,female,1,0,53.1,C,S,Yes,female
4,0,3,male,0,0,8.05,Deck,S,No,male


In [33]:
test_df2.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Cabin,Embarked,F_ob,Person
0,3,male,0,0,7.8292,Deck,Q,No,male
1,3,female,1,0,7.0,Deck,S,Yes,female
2,2,male,0,0,9.6875,Deck,Q,No,male
3,3,male,0,0,8.6625,Deck,S,No,male
4,3,female,1,1,12.2875,Deck,S,Yes,female


In [34]:
#searching categorical values from "Cabin" in train dataset
train_df2['Cabin'].value_counts()

Deck    687
C        59
B        47
D        33
E        32
A        15
F        13
G         4
T         1
Name: Cabin, dtype: int64

In [35]:
#searching categorical values from "Cabin" in test dataset
test_df2['Cabin'].value_counts()

Deck    327
C        35
B        18
D        13
E         9
F         8
A         7
G         1
Name: Cabin, dtype: int64

In [36]:
#droping rows showing anomaly in train dataset
train_df2 = train_df2[train_df2.Cabin != "T"]

In [37]:
#defining categorical features and represent them with dummy variables
features = ["Sex", "Cabin", "F_ob", "Person", "Embarked"]
train_dummies = pd.get_dummies(train_df2[features])
test_dummies = pd.get_dummies(test_df2[features])


In [38]:
train_dummies.head()

Unnamed: 0,Sex_female,Sex_male,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_Deck,Cabin_E,Cabin_F,Cabin_G,F_ob_No,F_ob_Yes,Person_child,Person_female,Person_male,Embarked_C,Embarked_Q,Embarked_S
0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1
1,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0,1,0,0
2,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1
3,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,1
4,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1


In [39]:
test_dummies.head()

Unnamed: 0,Sex_female,Sex_male,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_Deck,Cabin_E,Cabin_F,Cabin_G,F_ob_No,F_ob_Yes,Person_child,Person_female,Person_male,Embarked_C,Embarked_Q,Embarked_S
0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0
1,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1
2,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0
3,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1
4,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1


In [40]:
#concat train dummies with train dataset
train_df3 = pd.concat([train_df2, train_dummies], axis=1)
train_df3.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Cabin,Embarked,F_ob,Person,...,Cabin_F,Cabin_G,F_ob_No,F_ob_Yes,Person_child,Person_female,Person_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,male,1,0,7.25,Deck,S,Yes,male,...,0,0,0,1,0,0,1,0,0,1
1,1,1,female,1,0,71.2833,C,C,Yes,female,...,0,0,0,1,0,1,0,1,0,0
2,1,3,female,0,0,7.925,Deck,S,No,female,...,0,0,1,0,0,1,0,0,0,1
3,1,1,female,1,0,53.1,C,S,Yes,female,...,0,0,0,1,0,1,0,0,0,1
4,0,3,male,0,0,8.05,Deck,S,No,male,...,0,0,1,0,0,0,1,0,0,1


In [41]:
#concat test dummies with test dataset
test_df3 = pd.concat([test_df2, test_dummies], axis=1)
test_df3.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Cabin,Embarked,F_ob,Person,Sex_female,...,Cabin_F,Cabin_G,F_ob_No,F_ob_Yes,Person_child,Person_female,Person_male,Embarked_C,Embarked_Q,Embarked_S
0,3,male,0,0,7.8292,Deck,Q,No,male,0,...,0,0,1,0,0,0,1,0,1,0
1,3,female,1,0,7.0,Deck,S,Yes,female,1,...,0,0,0,1,0,1,0,0,0,1
2,2,male,0,0,9.6875,Deck,Q,No,male,0,...,0,0,1,0,0,0,1,0,1,0
3,3,male,0,0,8.6625,Deck,S,No,male,0,...,0,0,1,0,0,0,1,0,0,1
4,3,female,1,1,12.2875,Deck,S,Yes,female,1,...,0,0,0,1,0,1,0,0,0,1


In [42]:
#droping columns which are using dummy
train_df4 = train_df3.drop(["Sex", "Cabin", "F_ob", "Person", "Embarked"], axis=1)
train_df4.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Fare,Sex_female,Sex_male,Cabin_A,Cabin_B,Cabin_C,...,Cabin_F,Cabin_G,F_ob_No,F_ob_Yes,Person_child,Person_female,Person_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,0,7.25,0,1,0,0,0,...,0,0,0,1,0,0,1,0,0,1
1,1,1,1,0,71.2833,1,0,0,0,1,...,0,0,0,1,0,1,0,1,0,0
2,1,3,0,0,7.925,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,1
3,1,1,1,0,53.1,1,0,0,0,1,...,0,0,0,1,0,1,0,0,0,1
4,0,3,0,0,8.05,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,1


In [43]:
#droping columns which are using dummy
test_df4 = test_df3.drop(["Sex", "Cabin", "F_ob", "Person", "Embarked"], axis=1)
test_df4.head()

Unnamed: 0,Pclass,SibSp,Parch,Fare,Sex_female,Sex_male,Cabin_A,Cabin_B,Cabin_C,Cabin_D,...,Cabin_F,Cabin_G,F_ob_No,F_ob_Yes,Person_child,Person_female,Person_male,Embarked_C,Embarked_Q,Embarked_S
0,3,0,0,7.8292,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,1,0
1,3,1,0,7.0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,1
2,2,0,0,9.6875,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,1,0
3,3,0,0,8.6625,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1
4,3,1,1,12.2875,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,1


In [44]:
#cross checking the null values of train datadet
train_df4.isnull().sum()

Survived         0
Pclass           0
SibSp            0
Parch            0
Fare             0
Sex_female       0
Sex_male         0
Cabin_A          0
Cabin_B          0
Cabin_C          0
Cabin_D          0
Cabin_Deck       0
Cabin_E          0
Cabin_F          0
Cabin_G          0
F_ob_No          0
F_ob_Yes         0
Person_child     0
Person_female    0
Person_male      0
Embarked_C       0
Embarked_Q       0
Embarked_S       0
dtype: int64

In [45]:
#cross checking the null values of test dataset
test_df4.isnull().sum()

Pclass           0
SibSp            0
Parch            0
Fare             0
Sex_female       0
Sex_male         0
Cabin_A          0
Cabin_B          0
Cabin_C          0
Cabin_D          0
Cabin_Deck       0
Cabin_E          0
Cabin_F          0
Cabin_G          0
F_ob_No          0
F_ob_Yes         0
Person_child     0
Person_female    0
Person_male      0
Embarked_C       0
Embarked_Q       0
Embarked_S       0
dtype: int64

In [46]:
#extracting training set from train dataset
X = train_df4.drop("Survived", axis=1)
X.head()

Unnamed: 0,Pclass,SibSp,Parch,Fare,Sex_female,Sex_male,Cabin_A,Cabin_B,Cabin_C,Cabin_D,...,Cabin_F,Cabin_G,F_ob_No,F_ob_Yes,Person_child,Person_female,Person_male,Embarked_C,Embarked_Q,Embarked_S
0,3,1,0,7.25,0,1,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1
1,1,1,0,71.2833,1,0,0,0,1,0,...,0,0,0,1,0,1,0,1,0,0
2,3,0,0,7.925,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,1
3,1,1,0,53.1,1,0,0,0,1,0,...,0,0,0,1,0,1,0,0,0,1
4,3,0,0,8.05,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1


In [47]:
#extracting outputs from train dataset
y = train_df4["Survived"]
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [48]:
#spliting X for training and test and check the shape
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(712, 22) (178, 22) (712,) (178,)


In [50]:
#training model with DecisionTreeClassifier algorithm
from sklearn.tree import DecisionTreeClassifier
DTC_model = DecisionTreeClassifier()
DTC_model.set_params(criterion='entropy',
                     splitter='random',
                     max_depth = 10,
                     min_samples_split=10,
                     min_samples_leaf=5)
DTC_model.fit(X_train,y_train)              #fitting train tada into the model
DTC_model.score(X_test,y_test)              #generating the accuracy score

0.7865168539325843

In [52]:
predictions = DTC_model.predict(test_df4)       #predicting the outputs for test dataset

#store the predicted results in a dataframe
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': predictions})
output.to_csv('DTC_prediction.csv', index=False)        #export results as csv file