In [39]:
# TP 01 - MACHINE LEARNING - MAHDI ZEROUAK

In [40]:
# --------------- STEP 1 - LOAD DATA --------------- 

import pandas as pd

# read the Titanic passenger CSV into a DataFrame
# (relative path: resolved against the current working directory)
csv_path = 'Titanic-Dataset.csv'
data = pd.read_csv(csv_path)

# ----- COLUMN REFERENCE -----
# ** Survived = target variable ** (0 = did not survive, 1 = survived)
# Pclass   = ticket class (1st class, 2nd class, ...)
# SibSp    = number of siblings/spouses aboard
# Parch    = number of parents/children aboard
# Fare     = ticket price
# Embarked = port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
#
# PassengerId, Name, Sex, Ticket, Cabin and Age are self-explanatory
# ----------------------------

# column summary: non-null count and dtype of every column
data.info()

<class 'pandas.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    str    
 4   Sex          891 non-null    str    
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    str    
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    str    
 11  Embarked     889 non-null    str    
dtypes: float64(2), int64(5), str(5)
memory usage: 83.7 KB


In [41]:
# preview the first 5 rows of the dataset
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# ---------------  STEP 2 - PREPROCESS THE DATA ---------------

# PassengerId, Name, Ticket and Cabin are either plain identifiers or free text,
# so they are removed before building the KNN model
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
data = data.drop(columns=unused_columns)

data.head(5)

In [None]:
# KNN works with distances, and a distance cannot be computed on text values
# (example: Sex is 'male' / 'female'), so the text columns are converted to numbers

from sklearn.preprocessing import LabelEncoder

# two kinds of encoding are used:
#   1 - label encoding for a binary variable: each category becomes one integer code
#   2 - one-hot encoding for a variable with more than 2 categories: one new 0/1 column
#       per category, set to 1 when the row belongs to that category and 0 otherwise

# label encoding for "Sex"
# LabelEncoder numbers the classes in sorted order, so female -> 0 and male -> 1
le = LabelEncoder()
le.fit(data['Sex'])
data['Sex'] = le.transform(data['Sex'])

# one-hot encoding for "Embarked": 3 ports -> 3 new columns
# NOTE(review): Embarked has a couple of missing values (889 non-null out of 891 in
# data.info()); get_dummies creates no NaN column by default, so those rows end up with
# 0 in all three port columns - confirm this is the intended handling
data = pd.get_dummies(
    data,
    columns=['Embarked'],
    dtype=int,
)

data.head(5)

In [None]:
# count the missing values that remain in each column
data.isna().sum()

In [None]:
# we have 177 missed value in age

# we are going to fill them with a median of ages in our dataset
data['Age'] = data['Age'].fillna(data['Age'].median())

data.isnull().sum()

In [None]:
# ---------------  STEP 3 - SPLIT DATA --------------- 

# 70% train, 30% test
from sklearn.model_selection import train_test_split

x = data.drop('Survived', axis=1) 
y = data['Survived']

# split
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.3, # means 30% test and 70% train
    random_state=42, # if we don't specify random_state, each time we run the notebook we will get a different split
                     # which means different test/train data, different accuracy
    stratify=y # since only 38% survived our dataset is imbalanced, 
                # stratify=y will ensure test/train data will be splitted more reliably, not in a random way
)

In [None]:
# ---------------  GOING BACK TO STEP 2 - PREPROCESS THE DATA  ---------------

# feature scaling
#
# KNN usually uses the Euclidean distance, so a feature with a much larger numeric range
# (like Fare) would dominate the distance and effectively control the model;
# scaling puts the numeric features on a comparable scale
#
# the 0/1 columns (Sex and the one-hot Embarked columns) are left untouched
#
# why this step comes after step 3:
# fitting the scaler on the whole dataset before splitting would let it learn the
# mean/std from rows that later become test data, which is a form of data leakage;
# so the data is split first and the scaler is fitted on the train rows only

from sklearn.preprocessing import StandardScaler

# columns that get standardized (note: Pclass is included)
scaled_columns = ['Age', 'Fare', 'SibSp', 'Parch', 'Pclass']

scaler = StandardScaler()

# train data: fit_transform() computes mean/std AND scales the same data in one call
train_scaled = scaler.fit_transform(x_train[scaled_columns])
x_train[scaled_columns] = train_scaled

# test data: transform() does NOT recompute mean/std - it reuses the values
# learned from the train data during fit_transform()
test_scaled = scaler.transform(x_test[scaled_columns])
x_test[scaled_columns] = test_scaled

x_train.head(5)

In [None]:
# ---------------  STEP 4 - TRAIN THE MODEL ---------------

from sklearn.neighbors import KNeighborsClassifier

# number of neighbours used by the model (K = 3)
k = 3

# create the KNN model
knn = KNeighborsClassifier(n_neighbors=k)

In [None]:
# train the model on the training split
knn.fit(X=x_train, y=y_train)

In [None]:
# predicted label for every row of the test set
y_pred = knn.predict(x_test)

# quick look at the first 10 predictions
first_predictions = y_pred[:10]
print(first_predictions)

In [None]:
# ---------------  STEP 5 - EVALUATE THE DATA ---------------

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# accuracy: share of test rows whose predicted label equals the true label
print("Accuracy:", accuracy_score(y_test, y_pred))

# confusion matrix (rows = true class, columns = predicted class)
# the label and the matrix are printed separately: print("...\n", obj) would insert the
# separator space before the first row and shift it out of alignment
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# detailed report (precision / recall / f1-score per class)
# printed separately for the same reason, so the header line stays aligned with its rows
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# ------------------------------------------------------