In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score

In [6]:
# Read the raw CSV into a DataFrame and preview its first rows.
print("Loading dataset...")
dataset = pd.read_csv(filepath_or_buffer="src/Data.csv")
print("Dataset loaded successfully. Here are the first few rows:")
preview = dataset.head()
display(preview)

Loading dataset...
Dataset loaded successfully. Here are the first few rows:


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000,No
1,Spain,27.0,48000,Yes
2,Germany,30.0,54000,No
3,Spain,38.0,61000,No
4,Germany,40.0,69000,Yes


In [7]:
# Separate the target variable from the features
print("Separating labels from features...")
target_variable = "Salary"

# Fail fast when the target column is missing. Merely printing a message would
# leave X and Y undefined (or stale from a previous run of this cell), and the
# following cells would then break or silently work on outdated data.
if target_variable not in dataset.columns:
    raise KeyError(
        f"The target variable '{target_variable}' does not exist in the dataset."
    )

X = dataset.drop(columns=[target_variable])  # every column except the target
Y = dataset[target_variable]                 # the target column only
print("...Done.\n")

print("Y (Target Variable):")
print(Y.head(), "\n")

print("X (Features):")
print(X.head())

Separating labels from features...
...Done.

Y (Target Variable):
0    72000
1    48000
2    54000
3    61000
4    69000
Name: Salary, dtype: int64 

X (Features):
   Country   Age Purchased
0   France  44.0        No
1    Spain  27.0       Yes
2  Germany  30.0        No
3    Spain  38.0        No
4  Germany  40.0       Yes


In [8]:
# Automatically detect numeric and categorical columns.
# select_dtypes matches every numeric width (int32, float32, ...) rather than
# only the two hard-coded names 'float64' and 'int64', and it also picks up
# pandas 'category' columns alongside plain object (string) columns.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

print("Found numeric features:", numeric_features)
print("Found categorical features:", categorical_features)

Found numeric features: ['Age']
Found categorical features: ['Country', 'Purchased']


In [9]:
# Split the dataset into training and test sets (20% held out, fixed seed)
print("Dividing dataset into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)
print("Train set: {} samples, Test set: {} samples".format(X_train.shape[0], X_test.shape[0]))
print("...Done.\n")

Dividing dataset into train and test sets...
Train set: 8 samples, Test set: 2 samples
...Done.



# Preprocessing

In [10]:
# Transformation pipeline for numeric features:
#   1. 'imputer' replaces missing values with the column mean
#   2. 'scaler'  standardizes the data (centering and scaling)
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

print("Numeric feature transformer pipeline created.")

Numeric feature transformer pipeline created.


In [11]:
# Transformation pipeline for categorical features:
# one-hot encoding, dropping the first category of each column to avoid
# perfectly correlated dummy columns.
categorical_transformer = Pipeline([('encoder', OneHotEncoder(drop='first'))])

print("Categorical feature transformer pipeline created.")

Categorical feature transformer pipeline created.


In [12]:
# Build the preprocessing object: each entry routes a list of columns to the
# pipeline that should transform them.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),          # numeric pipeline -> numeric columns
    ('cat', categorical_transformer, categorical_features),  # categorical pipeline -> categorical columns
])

print("Preprocessor object created with transformations for numeric and categorical features.")

Preprocessor object created with transformations for numeric and categorical features.


In [13]:
def _as_feature_frame(transformed, index):
    """Wrap the preprocessor output in a DataFrame with named columns.

    Parameters
    ----------
    transformed : array-like or sparse matrix
        Result of ``preprocessor.fit_transform`` or ``preprocessor.transform``.
    index : pandas.Index
        Row labels of the frame that was transformed; reused so the result
        stays aligned by label with the matching target Series.

    Returns
    -------
    pandas.DataFrame
        Dense frame whose columns come from
        ``preprocessor.get_feature_names_out()``.
    """
    # ColumnTransformer can hand back a sparse matrix when the stacked output
    # is mostly zeros; densify it so the column names can be attached.
    if hasattr(transformed, "toarray"):
        transformed = transformed.toarray()
    return pd.DataFrame(
        transformed,
        columns=preprocessor.get_feature_names_out(),
        index=index,
    )


# NOTE: this cell overwrites X_train / X_test with their transformed versions,
# so re-run the train/test split cell before running this cell a second time.

# Preprocessing on the training set (the preprocessor is fitted here)
print("Performing preprocessing on train set...")
print(X_train.head())
train_index = X_train.index  # captured before X_train is overwritten
X_train = preprocessor.fit_transform(X_train)
print("...Done.")

# Convert to a DataFrame to keep column names and original row labels
X_train = _as_feature_frame(X_train, train_index)
print(X_train.head())

print()

# Preprocessing on the test set
print("Performing preprocessing on test set...")
print(X_test.head())
test_index = X_test.index  # captured before X_test is overwritten
X_test = preprocessor.transform(X_test)  # transform only: never refit on test data
print("...Done.")

# Convert to a DataFrame to keep column names and original row labels
X_test = _as_feature_frame(X_test, test_index)
print(X_test.head())
print()

Performing preprocessing on train set...
   Country   Age Purchased
4  Germany  40.0       Yes
9   France  37.0       Yes
1    Spain  27.0       Yes
6    Spain   NaN        No
7   France  48.0       Yes
...Done.
   num__Age  cat__Country_Germany  cat__Country_Spain  cat__Purchased_Yes
0  0.270637                   1.0                 0.0                 1.0
1 -0.246034                   0.0                 0.0                 1.0
2 -1.968271                   0.0                 1.0                 1.0
3  0.000000                   0.0                 1.0                 0.0
4  1.648427                   0.0                 0.0                 1.0

Performing preprocessing on test set...
   Country   Age Purchased
2  Germany  30.0        No
8  Germany  50.0        No
...Done.
   num__Age  cat__Country_Germany  cat__Country_Spain  cat__Purchased_Yes
0 -1.451600                   1.0                 0.0                 0.0
1  1.992875                   1.0                 0.0            

# Cross-validated score for a Ridge model (with default value of λ)

In [14]:
# Evaluate a Ridge model (default settings) with 3-fold cross-validation on R2.
# NOTE(review): X_train was imputed/scaled on the whole training set before the
# folds are formed here, so each validation fold has influenced the
# preprocessing; consider wrapping preprocessor + Ridge in a single Pipeline.
print("Performing 3-fold cross-validation with Ridge regression...")
regressor = Ridge()
scores = cross_val_score(
    estimator=regressor,
    X=X_train,
    y=Y_train,
    scoring='r2',
    cv=3,
)

print('The mean cross-validated R2-score is: {:.4f}'.format(scores.mean()))
print('The standard deviation of R2-scores is: {:.4f}'.format(scores.std()))

Performing 3-fold cross-validation with Ridge regression...
The mean cross-validated R2-score is: 0.7149
The standard deviation of R2-scores is: 0.0927


# Grid search : tune λ

In [15]:
# Grid search over the Ridge regularization strength
print("Performing grid search with cross-validation for Ridge regression...")

# Candidate values for the alpha hyperparameter (0 means no regularization)
params = dict(alpha=[0.0, 0.1, 0.5, 1.0])

# Cross-validated grid search setup, scored on R2
gridsearch = GridSearchCV(
    estimator=regressor,
    param_grid=params,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
gridsearch.fit(X_train, Y_train)

print("...Done.")
print("Best hyperparameters:", gridsearch.best_params_)
print("Best cross-validated R2 score: {:.4f}".format(gridsearch.best_score_))

Performing grid search with cross-validation for Ridge regression...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
...Done.
Best hyperparameters: {'alpha': 1.0}
Best cross-validated R2 score: 0.7149


# Performance assessment

In [16]:
# Report the R^2 score of the fitted grid search on each split
for split_name, features, labels in (
    ("training", X_train, Y_train),
    ("test", X_test, Y_test),
):
    print(f"R2 score on {split_name} set : ", gridsearch.score(features, labels))

R2 score on training set :  0.8859961574542502
R2 score on test set :  0.931688781048901


# Final remarks
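Here, we can see that the model's generalization performance was improved by using a Ridge regression and tuning the value of the regularization strength. Indeed, without regularization, the R2 typically varies between 0.6 and 0.8, whereas with a regularized model we achieve a test score greater than 0.9 🥳🥳 Note, however, that the test set contains only 2 samples and that the grid search kept the default value `alpha = 1.0` (with the same cross-validated R2 of 0.7149), so this comparison should be read with caution.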