In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings

# Silence DeprecationWarning messages so they do not clutter the cell outputs
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [13]:
# Read the Titanic passenger data from the project-relative CSV file
print("Loading dataset...")
dataset = pd.read_csv("src/titanic.csv")
print("...Done.", end="\n\n")  # completion message followed by one blank line

Loading dataset...
...Done.



In [14]:
# Quick exploration of the raw table
print(f"Number of rows: {dataset.shape[0]}\n")

# First rows of the dataset
print("Display of dataset: ")
display(dataset.head())
print()

# Summary statistics for every column (numeric and non-numeric alike)
print("Basic statistics: ")
data_desc = dataset.describe(include="all")
display(data_desc)
print()

# Share of missing values per column, expressed in percent
print("Percentage of missing values: ")
missing_values = dataset.isna().sum() * 100 / len(dataset)
display(missing_values)

Number of rows: 891

Display of dataset: 


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S



Basic statistics: 


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,



Percentage of missing values: 


PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64

In [15]:
# Remove the identifier-like columns (PassengerId, Name, Ticket) and the
# mostly-empty Cabin column before modelling.
# NOTE: `dataset` is reassigned here, so re-run the loading cell first if this
# cell has to be executed a second time.
useless_cols = ["PassengerId", "Name", "Ticket", "Cabin"]

print(f"Dropping columns: {', '.join(useless_cols)}...")
dataset = dataset.drop(columns=useless_cols)  # columns= targets the column axis
print("...Done.")
print(dataset.head())

Dropping columns: PassengerId, Name, Ticket, Cabin...
...Done.
   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0         0       3    male  22.0      1      0   7.2500        S
1         1       1  female  38.0      1      0  71.2833        C
2         1       3  female  26.0      0      0   7.9250        S
3         1       1  female  35.0      1      0  53.1000        S
4         0       3    male  35.0      0      0   8.0500        S


In [16]:
# Split the table into the prediction target (Y) and the explanatory features (X)
target_name = "Survived"

print("Separating labels from features...")
Y = dataset[target_name]  # the target column only
X = dataset.drop(columns=[target_name])  # every remaining column is a feature
print("...Done.")

print("Target (Y) preview:")
print(Y.head(), end="\n\n")
print("Features (X) preview:")
print(X.head(), end="\n\n")

Separating labels from features...
...Done.
Target (Y) preview:
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

Features (X) preview:
   Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0       3    male  22.0      1      0   7.2500        S
1       1  female  38.0      1      0  71.2833        C
2       3  female  26.0      0      0   7.9250        S
3       1  female  35.0      1      0  53.1000        S
4       3    male  35.0      0      0   8.0500        S



In [17]:
# Split the dataset into a training set and a test set
print("Dividing dataset into train and test sets...")

# test_size=0.15 keeps 85% of the rows for training and holds out 15% for testing;
# random_state=0 fixes the shuffle so the same split is produced on every run
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=0,
)

print(f"Train set size: {X_train.shape[0]} rows")
print(f"Test set size: {X_test.shape[0]} rows")
print("...Done.", end="\n\n")

Dividing dataset into train and test sets...
Train set size: 757 rows
Test set size: 134 rows
...Done.



In [18]:
# Preview the training features to see which columns are numeric and which are not
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
545,1,male,64.0,0,0,26.0,S
37,3,male,21.0,0,0,8.05,S
214,3,male,,1,0,7.75,Q
40,3,female,40.0,1,0,9.475,S
236,2,male,44.0,1,0,26.0,S


In [19]:
# Preprocessing for the numeric columns of X_train / X_test
numeric_features = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
numeric_transformer = Pipeline(
    steps=[
        # missing values are replaced by the column mean (strategy="mean")
        ("imputer", SimpleImputer(strategy="mean")),
        # each column is then rescaled by StandardScaler
        ("scaler", StandardScaler()),
    ]
)

In [20]:
# Preprocessing for the categorical columns of X_train / X_test
categorical_features = ["Sex", "Embarked"]
categorical_transformer = Pipeline(
    steps=[
        # missing values are replaced by the most frequent value of the column
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # one-hot encode; the first category of each feature is dropped so the
        # resulting dummy columns are not redundant with one another
        ("encoder", OneHotEncoder(drop="first")),
    ]
)

In [21]:
# Bundle both sub-pipelines into a single preprocessor: each entry is
# (name, transformer, list of columns the transformer is applied to)
column_pipelines = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_pipelines)

In [22]:
# Preprocess the training set: the preprocessor is fitted here, on training data only.
# NOTE: X_train / X_test are overwritten by their transformed versions, so re-run
# the train/test split cell before executing this cell a second time.
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print("...Done.")
print(X_train[:5])

# Preprocess the test set with the already-fitted preprocessor (transform only, no re-fit)
print("Performing preprocessings on test set...")
print(X_test.head())
X_test = preprocessor.transform(X_test)
print("...Done.")
# X_test is now a NumPy array rather than a pandas DataFrame, so the first
# rows are taken by slicing instead of .head()
print(X_test[:5], end="\n\n")

Performing preprocessings on train set...
     Pclass     Sex   Age  SibSp  Parch    Fare Embarked
545       1    male  64.0      0      0  26.000        S
37        3    male  21.0      0      0   8.050        S
214       3    male   NaN      1      0   7.750        Q
40        3  female  40.0      1      0   9.475        S
236       2    male  44.0      1      0  26.000        S
...Done.
[[-1.60067161e+00  2.61131471e+00 -4.63468368e-01 -4.65997851e-01
  -1.09604554e-01  1.00000000e+00  0.00000000e+00  1.00000000e+00]
 [ 8.10688409e-01 -6.78358906e-01 -4.63468368e-01 -4.65997851e-01
  -4.71133941e-01  1.00000000e+00  0.00000000e+00  1.00000000e+00]
 [ 8.10688409e-01 -2.71796941e-16  4.31545801e-01 -4.65997851e-01
  -4.77176214e-01  1.00000000e+00  1.00000000e+00  0.00000000e+00]
 [ 8.10688409e-01  7.75217807e-01  4.31545801e-01 -4.65997851e-01
  -4.42433140e-01  0.00000000e+00  0.00000000e+00  1.00000000e+00]
 [-3.94991602e-01  1.08123396e+00  4.31545801e-01 -4.65997851e-01
  -1.0960

In [23]:
# Check the type of X_train after preprocessing: it is a NumPy array, no longer a pandas DataFrame
type(X_train)

numpy.ndarray

In [25]:
# Fit a logistic regression classifier (default settings) on the training set only
print("Training model...")
model = LogisticRegression().fit(X_train, Y_train)  # fit() returns the fitted estimator
print("...Done.")

Training model...
...Done.


In [26]:
# Predict labels for the rows the model was trained on
print("Predictions on training set...")
Y_train_pred = model.predict(X_train)
print("...Done.")
print(Y_train_pred[:5], end="\n\n")  # first five predictions, then a blank line

Predictions on training set...
...Done.
[0 0 0 0 0]



In [27]:
# Predict labels for the held-out test rows
print("Predictions on test set...")
Y_test_pred = model.predict(X_test)
print("...Done.")
print(Y_test_pred[:5], end="\n\n")  # first five predictions, then a blank line

Predictions on test set...
...Done.
[0 0 0 1 1]



In [28]:
# Accuracy computed with accuracy_score; the argument order is
# (true labels, predicted labels)
train_accuracy = accuracy_score(Y_train, Y_train_pred)
print("Accuracy on training set : ", train_accuracy)

test_accuracy = accuracy_score(Y_test, Y_test_pred)
print("Accuracy on test set : ", test_accuracy)

Accuracy on training set :  0.8018494055482166
Accuracy on test set :  0.7910447761194029


In [29]:
# Accuracy obtained through the estimator's own score() method; here the
# argument order is (features, true labels)
train_score = model.score(X_train, Y_train)
print("Accuracy on training set : ", train_score)

test_score = model.score(X_test, Y_test)
print("Accuracy on test set : ", test_score)

Accuracy on training set :  0.8018494055482166
Accuracy on test set :  0.7910447761194029
