In [160]:
# TP 01 - MACHINE LEARNING - MAHDI ZEROUAK

In [161]:
# --------------- STEP 1 - LOAD DATA ---------------

import pandas as pd

# Read the Titanic passenger CSV into the `data` DataFrame.
data = pd.read_csv('Titanic-Dataset.csv')

# Summarise the columns: non-null counts and dtypes.
data.info()

# ----- Column glossary -----
# Survived : ** target variable ** (0 = did not survive, 1 = survived)
# Pclass   : ticket class (1st class, 2nd class, etc.)
# SibSp    : number of siblings/spouses aboard
# Parch    : number of parents/children aboard
# Fare     : ticket price
# Embarked : port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
#
# PassengerId, Name, Sex, Ticket, Cabin and Age are self-explanatory.
#
# PassengerId, Name, Ticket and Cabin are not needed for the KNN model
# because they are either plain identifiers or free text.
# ---------------------------

# Preview the first 5 rows (a sample) of the dataset.
data.head(n=5)

<class 'pandas.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    str    
 4   Sex          891 non-null    str    
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    str    
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    str    
 11  Embarked     889 non-null    str    
dtypes: float64(2), int64(5), str(5)
memory usage: 83.7 KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [162]:
# ---------------  STEP 2 - PREPROCESS THE DATA ---------------

# These columns are not used as features, so remove them from the dataset
# held in `data`.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
data = data.drop(columns=unused_columns)

data.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [163]:
# Some values are not numbers, so we can't compute a distance between them
# (example: Sex is "male" / "female"). The solution is to convert these
# values to numbers.

from sklearn.preprocessing import LabelEncoder

# We use two types of encoding:
#
# 1 - Label encoding for binary variables like Sex: male -> 1, female -> 0.
#
# 2 - One-hot encoding for variables with more than 2 categories: we create
#     a new column for each category, then assign 1 if the row belongs to
#     that category, else 0.

# Label encoding for "Sex".
le = LabelEncoder()
data['Sex'] = le.fit_transform(data['Sex'])

# "Embarked" has 2 missing values (889 non-null out of 891 rows).
# pd.get_dummies ignores NaN by default, so those rows would become all-zero
# dummy rows and a later isnull() check could no longer detect them.
# Fill them with the most frequent port BEFORE encoding.
most_common_port = data['Embarked'].mode().iloc[0]
data['Embarked'] = data['Embarked'].fillna(most_common_port)

# One-hot encoding for "Embarked" (creates one Embarked_<port> column per port).
data = pd.get_dummies(data, columns=['Embarked'], dtype=int)

data.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,7.25,0,0,1
1,1,1,0,38.0,1,0,71.2833,1,0,0
2,1,3,0,26.0,0,0,7.925,0,0,1
3,1,1,0,35.0,1,0,53.1,0,0,1
4,0,3,1,35.0,0,0,8.05,0,0,1


In [164]:
# Count the missing (null) values in each column of the dataset.
missing_per_column = data.isnull().sum()
missing_per_column

Survived        0
Pclass          0
Sex             0
Age           177
SibSp           0
Parch           0
Fare            0
Embarked_C      0
Embarked_Q      0
Embarked_S      0
dtype: int64

In [165]:
# Age has 177 missing values.

# Replace them with the median age of the dataset.
# NOTE: the median is computed over all rows, i.e. before any train/test
# split is made.
age_median = data['Age'].median()
data['Age'] = data['Age'].fillna(age_median)

# Re-check the number of missing values per column.
data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [166]:
# Scale the numeric features.

# Binary variables (Sex, Embarked_C, Embarked_Q, Embarked_S) are left
# untouched because they can only be 0 or 1.
# Scaling is needed because a feature with a much larger scale (like Fare)
# would dominate the other variables when computing distances; scaling
# ensures that all features contribute equally.

# Columns rescaled here: Age, Fare, SibSp, Parch and Pclass.
from sklearn.preprocessing import MinMaxScaler

numeric_columns = ['Age', 'Fare', 'SibSp', 'Parch', 'Pclass']

scaler = MinMaxScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

data.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,1.0,1,0.271174,0.125,0.0,0.014151,0,0,1
1,1,0.0,0,0.472229,0.125,0.0,0.139136,1,0,0
2,1,1.0,0,0.321438,0.0,0.0,0.015469,0,0,1
3,1,0.0,0,0.434531,0.125,0.0,0.103644,0,0,1
4,0,1.0,1,0.434531,0.0,0.0,0.015713,0,0,1


In [167]:
# ---------------  STEP 3 - SPLIT DATA --------------- 


In [None]:
# ---------------  STEP 4 - TRAIN THE MODEL --------------- 

In [None]:
# ---------------  STEP 5 - EVALUATE THE MODEL --------------- 