### Step 1: Importing the basic packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the Titanic passenger records from the CSV file in the working directory.
csv_path = "titanic.csv"
dataset = pd.read_csv(csv_path)

# Preview the first five rows as a quick sanity check of the parsed columns.
dataset.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
dataset.info()  # column names, non-null counts, dtypes and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
dataset.shape   # (number of rows, number of columns)

(891, 12)

In [5]:
dataset.dtypes   # data type of each column in the dataset

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [6]:
# Remove the columns that are not used as features in this analysis.
# One drop call covers all four columns; the frame is still modified in place.
unused_columns = ['Name', 'Fare', 'Cabin', 'Ticket']
dataset.drop(columns=unused_columns, inplace=True)

In [7]:
# Confirm the four columns were removed (8 of the original 12 remain).
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Embarked     889 non-null    object 
dtypes: float64(1), int64(5), object(2)
memory usage: 55.8+ KB


In [8]:
# Combine the sibling (SibSp) and parent (Parch) counts into a single family-size
# feature; the extra 1 presumably counts the passenger themselves.
family_size = dataset['SibSp'].add(dataset['Parch']).add(1)
dataset['Family'] = family_size

In [9]:
# SibSp and Parch are now represented by the Family column, so drop them in place.
dataset.drop(columns=['SibSp', 'Parch'], inplace=True)

In [10]:
# Columns that remain after dropping SibSp/Parch and adding Family.
dataset.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'Embarked',
       'Family'],
      dtype='object')

In [11]:
dataset.describe()  # count, mean, std, min, quartiles and max of the numeric columns

Unnamed: 0,PassengerId,Survived,Pclass,Age,Family
count,891.0,891.0,891.0,714.0,891.0
mean,446.0,0.383838,2.308642,29.699118,1.904602
std,257.353842,0.486592,0.836071,14.526497,1.613459
min,1.0,0.0,1.0,0.42,1.0
25%,223.5,0.0,2.0,20.125,1.0
50%,446.0,0.0,3.0,28.0,1.0
75%,668.5,1.0,3.0,38.0,2.0
max,891.0,1.0,3.0,80.0,11.0


In [12]:
# Count the missing values in every column.
dataset.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
Embarked         2
Family           0
dtype: int64

In [13]:
# Replace the missing values in the Age column with the column mean.
#
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the dataset["Age"] selection: an in-place call on a
# selected column is chained assignment, which pandas warns about and which
# leaves the DataFrame unchanged when Copy-on-Write is active.
mean_age = dataset["Age"].mean()
dataset["Age"] = dataset["Age"].fillna(mean_age)

In [14]:
# Re-check the missing-value counts after imputing Age.
dataset.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
Embarked       2
Family         0
dtype: int64

In [15]:
dataset["Embarked"].value_counts()  # number of rows per distinct Embarked value (missing values are not counted)

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [16]:
# Fill the missing Embarked values with "S", the most frequent value in the column.
# Assigned back to the column rather than using inplace=True on the selection,
# because an in-place call on a selected column is chained assignment: pandas
# warns about it and, with Copy-on-Write, the DataFrame is left unchanged.
dataset["Embarked"] = dataset["Embarked"].fillna("S")

In [17]:
# Re-check the missing-value counts after imputing Embarked.
dataset.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
Embarked       0
Family         0
dtype: int64

In [18]:
# Preview the first five rows after the missing values were handled.
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Embarked,Family
0,1,0,3,male,22.0,S,2
1,2,1,1,female,38.0,C,2
2,3,1,3,female,26.0,S,1
3,4,1,1,female,35.0,S,2
4,5,0,3,male,35.0,S,1


In [19]:
# Display the frame; the notebook renders a truncated view (first and last rows).
dataset

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Embarked,Family
0,1,0,3,male,22.000000,S,2
1,2,1,1,female,38.000000,C,2
2,3,1,3,female,26.000000,S,1
3,4,1,1,female,35.000000,S,2
4,5,0,3,male,35.000000,S,1
...,...,...,...,...,...,...,...
886,887,0,2,male,27.000000,S,1
887,888,1,1,female,19.000000,S,1
888,889,0,3,female,29.699118,S,4
889,890,1,1,male,26.000000,C,1


In [20]:
# Encode the Embarked categories as integers: S -> 1, C -> 2, Q -> 3.
#
# The mapped column is assigned back instead of calling replace(inplace=True)
# on the dataset.Embarked selection; an in-place call on a selected column is
# chained assignment, which pandas warns about and which leaves the DataFrame
# unchanged when Copy-on-Write is active. Series.map also yields a numeric
# column directly (a value outside the mapping would become NaN).
embarked_codes = {'S': 1, 'C': 2, 'Q': 3}
dataset['Embarked'] = dataset['Embarked'].map(embarked_codes)

In [21]:
# Check that Embarked now holds the integer codes.
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Embarked,Family
0,1,0,3,male,22.0,1,2
1,2,1,1,female,38.0,2,2
2,3,1,3,female,26.0,1,1
3,4,1,1,female,35.0,1,2
4,5,0,3,male,35.0,1,1


In [22]:
# Encode Sex as an integer flag: male -> 1, female -> 0.
#
# The mapped column is assigned back instead of calling replace(inplace=True)
# on the dataset.Sex selection; an in-place call on a selected column is
# chained assignment, which pandas warns about and which leaves the DataFrame
# unchanged when Copy-on-Write is active. Series.map also yields a numeric
# column directly (a value outside the mapping would become NaN).
sex_codes = {'male': 1, 'female': 0}
dataset['Sex'] = dataset['Sex'].map(sex_codes)

In [23]:
# Check that Sex now holds the 0/1 codes.
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Embarked,Family
0,1,0,3,1,22.0,1,2
1,2,1,1,0,38.0,2,2
2,3,1,3,0,26.0,1,1
3,4,1,1,0,35.0,1,2
4,5,0,3,1,35.0,1,1


### Finding the optimal k using cross validation (simple cross validation and k-fold cross validation)

In [24]:
#importing the libraries required

from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from collections import Counter
from sklearn.model_selection import cross_validate

In [27]:
# Feature matrix (independent variables): one row per passenger, one column per feature.
feature_columns = ['Pclass', 'Sex', 'Age', 'Embarked', 'Family']
X = dataset.filter(items=feature_columns, axis=1).to_numpy(copy=True)
print(X)

[[ 3.          1.         22.          1.          2.        ]
 [ 1.          0.         38.          2.          2.        ]
 [ 3.          0.         26.          1.          1.        ]
 ...
 [ 3.          0.         29.69911765  1.          4.        ]
 [ 1.          1.         26.          2.          1.        ]
 [ 3.          1.         32.          3.          1.        ]]


In [28]:
# Target (dependent variable): the Survived column, kept as a column vector of shape (n, 1).
target_columns = ['Survived']
y = dataset.filter(items=target_columns, axis=1).to_numpy(copy=True)
print(y)

[[0]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]


In [71]:
# Hold out 30% of the rows as the test set; random_state fixes the shuffle so the
# split is reproducible.
first_split = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = first_split

In [72]:
# Report the shapes produced by the train/test split.
first_split_shapes = (
    ("size of X training dataset:", X_train),
    ("size of X testing dataset:", X_test),
    ("size of y training dataset:", y_train),
    ("size of y testing dataset", y_test),
)
for label, array in first_split_shapes:
    print(label, array.shape)

size of X training dataset: (623, 5)
size of X testing dataset: (268, 5)
size of y training dataset: (623, 1)
size of y testing dataset (268, 1)


In [73]:
# Carve a validation set (30%) out of the training data; it is used to choose k
# without touching the test set.
second_split = train_test_split(X_train, y_train, test_size=0.3, random_state=0)
X_train_1, X_val, y_train_1, y_val = second_split

In [74]:
# Report the shapes produced by the train/validation split.
second_split_shapes = (
    ("size of actual X training dataset: ", X_train_1),
    ("size of  X validation dataset: ", X_val),
    ("size of actual y training dataset: ", y_train_1),
    ("size of  y validation  dataset: ", y_val),
)
for label, array in second_split_shapes:
    print(label, array.shape)


size of actual X training dataset:  (436, 5)
size of  X validation dataset:  (187, 5)
size of actual y training dataset:  (436, 1)
size of  y validation  dataset:  (187, 1)


### Getting the best k using simple validation

In [75]:
import warnings
from sklearn.exceptions import DataConversionWarning

# Silence only the warning raised when a column-vector y is passed to an estimator,
# rather than hiding every warning for the rest of the session.
warnings.filterwarnings('ignore', category=DataConversionWarning)

# Validation accuracy (in percent) for k = 1..9; final_scores[i] belongs to k = i + 1.
final_scores = []
for k in range(1, 10):
    knn = KNeighborsClassifier(n_neighbors=k)
    # y_train_1 is stored with shape (n, 1); ravel() passes the 1-D label array the
    # classifier expects, so no conversion warning is triggered here.
    knn.fit(X_train_1, y_train_1.ravel())
    pred = knn.predict(X_val)
    acc = accuracy_score(y_val, pred, normalize=True) * 100.0
    final_scores.append(acc)

In [76]:
# Position of the highest validation accuracy in final_scores (the first one on ties).
# This is a zero-based list index: the matching number of neighbours is optimal_k + 1.
optimal_k = int(np.argmax(final_scores))

In [77]:
optimal_k

# optimal_k is the zero-based index of the best score. The scores were collected for
# k = 1, 2, ..., so the optimal number of neighbours is optimal_k + 1 (3 for the output below).

2

In [88]:
from sklearn.neighbors import KNeighborsClassifier

# Derive k from the validation results instead of hard-coding it, so this cell stays
# correct if the data or the split changes. final_scores[i] is the accuracy for
# k = i + 1, hence the "+ 1" (this gives 3 for the scores recorded in this notebook).
best_k = final_scores.index(max(final_scores)) + 1
knn = KNeighborsClassifier(n_neighbors=best_k)
# y_train_1 is stored with shape (n, 1); pass it as the 1-D label array fit() expects.
knn.fit(X_train_1, y_train_1.ravel())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [89]:
# Predict the labels of the held-out test rows.
predict = knn.predict(X_test)

# Show actual (left column) and predicted (right column) labels side by side.
actual_vs_predicted = np.column_stack((y_test, predict))
print(actual_vs_predicted)

[[0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 1]
 [0 1]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]

In [90]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose label was predicted correctly.
acc = accuracy_score(y_true=y_test, y_pred=predict)
print("Accuacy Score:", acc)

Accuacy Score: 0.7388059701492538


In [91]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes and columns the predicted classes (scikit-learn's convention).
cm = confusion_matrix(y_true=y_test, y_pred=predict)
print("The confusion matrix is:", cm, sep="\n")

The confusion matrix is:
[[138  30]
 [ 40  60]]


In [94]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1-score and support on the test split.
report = classification_report(y_true=y_test, y_pred=predict)
print(report)

              precision    recall  f1-score   support

           0       0.78      0.82      0.80       168
           1       0.67      0.60      0.63       100

    accuracy                           0.74       268
   macro avg       0.72      0.71      0.71       268
weighted avg       0.73      0.74      0.74       268

