In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings

# Silence DeprecationWarning messages so they do not clutter the notebook output.
warnings.simplefilter("ignore", category=DeprecationWarning)

In [2]:
# Read the raw CSV into a DataFrame, logging progress before and after the read
print("Loading dataset...")
dataset = pd.read_csv("src/Dataset_preprocessing.csv")
print("...Done.", end="\n\n")  # trailing blank line separates this output from the next cell's

Loading dataset...
...Done.



In [3]:
# --- Size of the dataset ---
print(f"Number of rows: {dataset.shape[0]}\n")

# --- First rows of the dataset ---
print("Display of dataset: ")
display(dataset.head())
print()

# --- Summary statistics for every column (numeric and non-numeric alike) ---
print("Basic statistics: ")
# Keep the summary in a variable so it can be inspected again later without recomputing it
data_desc = dataset.describe(include="all")
display(data_desc)
print()

# --- Share of missing values per column, expressed in percent ---
print("Percentage of missing values: ")
missing_values = 100 * dataset.isna().sum() / len(dataset)
display(missing_values)

Number of rows: 12

Display of dataset: 


Unnamed: 0,id,Country,Age,Salary,Purchased,useless_col,almost_empty
0,0,France,44.0,72000,No,useless,
1,1,Spain,27.0,48000,Yes,useless,40.0
2,2,Germany,30.0,54000,No,useless,
3,3,Spain,38.0,61000,No,useless,20.0
4,4,Germany,40.0,69000,Yes,useless,



Basic statistics: 


Unnamed: 0,id,Country,Age,Salary,Purchased,useless_col,almost_empty
count,12.0,12,11.0,12.0,12,12,2.0
unique,,3,,,2,1,
top,,France,,,Yes,useless,
freq,,5,,,7,12,
mean,5.5,,36.909091,83389580.0,,,30.0
std,3.605551,,19.002392,288657400.0,,,14.142136
min,0.0,,-10.0,32000.0,,,20.0
25%,2.75,,32.5,53500.0,,,25.0
50%,5.5,,38.0,64000.0,,,30.0
75%,8.25,,46.0,73750.0,,,35.0



Percentage of missing values: 


id               0.000000
Country          0.000000
Age              8.333333
Salary           0.000000
Purchased        0.000000
useless_col      0.000000
almost_empty    83.333333
dtype: float64

In [4]:
# Drop columns that are of no use for modelling: a row identifier, a column holding a
# single constant value, and a column that is missing for most rows.
useless_cols = ["id", "useless_col", "almost_empty"]

print(f"Dropping columns: {', '.join(useless_cols)}...")
# errors="ignore" keeps this cell re-runnable: on a second execution the columns are
# already gone and a plain drop() would raise a KeyError.
dataset = dataset.drop(columns=useless_cols, errors="ignore")
print("...Done.")
print(dataset.head())

Dropping columns: id, useless_col, almost_empty...
...Done.
   Country   Age  Salary Purchased
0   France  44.0   72000        No
1    Spain  27.0   48000       Yes
2  Germany  30.0   54000        No
3    Spain  38.0   61000        No
4  Germany  40.0   69000       Yes


In [5]:
# Remove rows holding implausible values in Age or Salary.
# Rows where the value is simply missing are kept on purpose: a missing value is not an
# outlier, and the numeric preprocessing pipeline defined further down imputes it.
# NOTE: this cell overwrites `dataset`; re-running it recomputes the Salary bound on the
# already-filtered rows, so re-run from the loading cell to reproduce the same result.

print("Dropping outliers in Age...")
# Keep strictly positive ages, plus rows where Age is missing
to_keep = (dataset["Age"] > 0) | dataset["Age"].isnull()
dataset = dataset.loc[to_keep, :]
print(f"Done. Number of lines remaining: {dataset.shape[0]}")
print()

print("Dropping outliers in Salary...")
# Upper bound = mean + 2 standard deviations, computed on the rows left after the Age filter
salary_upper_bound = dataset["Salary"].mean() + 2 * dataset["Salary"].std()
# A comparison with NaN evaluates to False, so missing salaries have to be kept
# explicitly, exactly as is done for Age above; otherwise they would be dropped silently.
to_keep = (dataset["Salary"] < salary_upper_bound) | dataset["Salary"].isnull()
dataset = dataset.loc[to_keep, :]
print("Done. Number of lines remaining : ", dataset.shape[0])
print()

dataset.head()

Dropping outliers in Age...
Done. Number of lines remaining: 11

Dropping outliers in Salary...
Done. Number of lines remaining :  10



Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000,No
1,Spain,27.0,48000,Yes
2,Germany,30.0,54000,No
3,Spain,38.0,61000,No
4,Germany,40.0,69000,Yes


In [6]:
# Split the cleaned data into the label to predict (Y) and the explanatory variables (X)
target_name = "Purchased"

print("Separating labels from features...")
X = dataset.drop(columns=[target_name])  # every column except the target
Y = dataset.loc[:, target_name]  # the target column on its own
print("...Done.")

print("Target (Y) preview:")
print(Y.head(), end="\n\n")
print("Features (X) preview:")
print(X.head(), end="\n\n")

Separating labels from features...
...Done.
Target (Y) preview:
0     No
1    Yes
2     No
3     No
4    Yes
Name: Purchased, dtype: object

Features (X) preview:
   Country   Age  Salary
0   France  44.0   72000
1    Spain  27.0   48000
2  Germany  30.0   54000
3    Spain  38.0   61000
4  Germany  40.0   69000



In [7]:
# Hold out part of the rows for evaluation
print("Dividing dataset into train and test sets...")

# test_size=0.2 -> 80% of the rows for training, 20% for testing.
# A fixed random_state makes the split identical on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

print(f"Train set size: {X_train.shape[0]} rows")
print(f"Test set size: {X_test.shape[0]} rows")
print("...Done.", end="\n\n")

Dividing dataset into train and test sets...
Train set size: 8 rows
Test set size: 2 rows
...Done.



In [8]:
# Preprocessing applied to the numeric columns of X_train / X_test
numeric_features = ["Age", "Salary"]
numeric_transformer = Pipeline(
    steps=[
        # 1) replace missing values by the median of the column
        ("imputer", SimpleImputer(strategy="median")),
        # 2) standardize the column
        ("scaler", StandardScaler()),
    ]
)

In [9]:
# Preprocessing applied to the categorical columns of X_train / X_test
categorical_features = ["Country"]
categorical_transformer = Pipeline(
    steps=[
        # 1) replace missing values by the most frequent category
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # 2) one-hot encode; the first category is dropped so the dummy columns
        #    are not redundant with each other
        ("encoder", OneHotEncoder(drop="first")),
    ]
)

In [10]:
# Single preprocessing object: each (name, transformer, columns) entry applies the given
# sub-pipeline to the listed columns only.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [11]:
# --- Train set: fit the preprocessor on the training rows and transform them ---
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print("...Done.")
print(X_train[:5])

# --- Test set: only transform, reusing what was fitted on the train set ---
print("Performing preprocessings on test set...")
print(X_test.head())
X_test = preprocessor.transform(X_test)
print("...Done.")
# X_test is now a numpy array and no longer a pandas DataFrame,
# so array slicing has to be used instead of .head()
print(X_test[:5, :], end="\n\n")

Performing preprocessings on train set...
   Country   Age  Salary
4  Germany  40.0   69000
9   France  37.0   67000
1    Spain  27.0   48000
6    Spain   NaN   52000
7   France  48.0   79000
...Done.
[[ 0.27978024  0.58858382  1.          0.        ]
 [-0.23673712  0.38385901  0.          0.        ]
 [-1.95846165 -1.56102665  0.          1.        ]
 [-0.06456467 -1.15157703  0.          1.        ]
 [ 1.65715986  1.61220785  0.          0.        ]]
Performing preprocessings on test set...
   Country   Age  Salary
2  Germany  30.0   54000
8  Germany  50.0   83000
...Done.
[[-1.44194429 -0.94685223  1.          0.        ]
 [ 2.00150476  2.02165746  1.          0.        ]]



In [12]:
# Demonstration: after preprocessing, X_train is no longer a pandas DataFrame, so calling
# .head() on it fails with an AttributeError.
# The expected error is caught and printed instead of being left uncaught, so that
# "Restart Kernel -> Run All" does not stop at this cell.
try:
    X_train.head()
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [13]:
# Check the actual type of X_train after preprocessing (the recorded output shows numpy.ndarray)
type(X_train)

numpy.ndarray

In [14]:
# Array slicing is the numpy counterpart of DataFrame.head(): show the first 5 rows
X_train[:5]

array([[ 0.27978024,  0.58858382,  1.        ,  0.        ],
       [-0.23673712,  0.38385901,  0.        ,  0.        ],
       [-1.95846165, -1.56102665,  0.        ,  1.        ],
       [-0.06456467, -1.15157703,  0.        ,  1.        ],
       [ 1.65715986,  1.61220785,  0.        ,  0.        ]])

In [15]:
# Convert the string labels of Y into integers
labelencoder = LabelEncoder()

# --- Train labels: fit the encoder and transform ---
print("Encoding labels on train set...")
print(Y_train.head(), end="\n\n")
Y_train = labelencoder.fit_transform(Y_train)
print("...Done.")
print(Y_train[:5], end="\n\n")  # first encoded values

# Show which original labels the encoder learned from the train set
print(f"Classes found in training set: {list(labelencoder.classes_)}", end="\n\n")

# --- Test labels: transform only, the encoder is not re-fitted on the test set ---
print("Encoding labels on test set...")
print(Y_test.head(), end="\n\n")
Y_test = labelencoder.transform(Y_test)
print("...Done.")
print(Y_test[:5], end="\n\n")

# Optional: labelencoder.inverse_transform(Y_train) gives back the original labels

Encoding labels on train set...
4    Yes
9    Yes
1    Yes
6     No
7    Yes
Name: Purchased, dtype: object

...Done.
[1 1 1 0 1]

Classes found in training set: ['No', 'Yes']

Encoding labels on test set...
2    No
8    No
Name: Purchased, dtype: object

...Done.
[0 0]



In [16]:
# Fit a logistic regression classifier; fitting uses the train set only
print("Training model...")
model = LogisticRegression()
model.fit(X_train, Y_train)
print("...Done.")

Training model...
...Done.


In [17]:
# Predict the labels of the rows the model was trained on
print("Predictions on training set...")
Y_train_pred = model.predict(X_train)
print("...Done.")
print(Y_train_pred[:5], end="\n\n")  # first 5 predicted labels

Predictions on training set...
...Done.
[1 1 1 0 1]



In [18]:
# Predict the labels of the held-out test rows
print("Predictions on test set...")
Y_test_pred = model.predict(X_test)
print("...Done.")
print(Y_test_pred[:5], end="\n\n")  # first 5 predicted labels

Predictions on test set...
...Done.
[1 1]



In [19]:
# Accuracy computed from label arrays with accuracy_score.
# The true labels go first and the predictions second; keyword arguments make that explicit.
print("Accuracy on training set : ", accuracy_score(y_true=Y_train, y_pred=Y_train_pred))
print("Accuracy on test set : ", accuracy_score(y_true=Y_test, y_pred=Y_test_pred))

Accuracy on training set :  0.75
Accuracy on test set :  0.0


In [20]:
# Same accuracies, this time obtained from the estimator's own score() method.
# Here the features go first and the true labels second.
print("Accuracy on training set : ", model.score(X=X_train, y=Y_train))
print("Accuracy on test set : ", model.score(X=X_test, y=Y_test))

Accuracy on training set :  0.75
Accuracy on test set :  0.0
