In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio


In [5]:
# Load the Walmart weekly sales data from the project's src/ folder.
df = pd.read_csv(filepath_or_buffer="src/Walmart_Store_sales.csv")
# Display the first row as the cell output to check the column layout.
df.head(n=1)

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858


# I - EDA

# II - Linear regression  - Baseline model

In [6]:
# Prepare the data for the baseline model: clean the frame, split it into
# train/test sets and declare the preprocessing applied to each column group.

# Drop rows whose target (Weekly_Sales) is missing, then drop the 'Date' column.
# The chained, non in-place form with errors='ignore' keeps this cell re-runnable:
# an in-place drop raises KeyError on a second execution because 'Date' is already gone.
df = df.dropna(subset=['Weekly_Sales'], axis=0).drop(columns=['Date'], errors='ignore')


# Separate target variable Y from features X
print("Separating labels from features...")
features_list = ['Store','Holiday_Flag','Temperature','Fuel_Price','CPI','Unemployment']
target_variable = "Weekly_Sales"


dataset = df

X = dataset.loc[:,features_list]
Y = dataset.loc[:,target_variable]

# Column positions within features_list
numeric_indices = [2,3,4,5]    # Temperature, Fuel_Price, CPI, Unemployment
categorical_indices = [0,1]    # Store, Holiday_Flag


print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Convert pandas DataFrames to numpy arrays before using scikit-learn
print("Convert pandas DataFrames to numpy arrays...")
X_train = X_train.values
X_test = X_test.values
Y_train = Y_train.tolist()
Y_test = Y_test.tolist()


# Pipeline for the numeric features: handle missing values, then standardize

numeric_features = list(numeric_indices) # Positions of numeric columns in X_train/X_test
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), # missing values will be replaced by columns' median
    ('scaler', StandardScaler())
])

# Pipeline for the categorical features (Store, Holiday_Flag): handle missing values, then one-hot encode

categorical_features = list(categorical_indices) # Positions of categorical columns in X_train/X_test
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

# ColumnTransformer bundles both pipelines into a single preprocessor object

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])




Separating labels from features...
Dividing into train and test sets...
Convert pandas DataFrames to numpy arrays...


### II-1 : Training Pipeline

In [7]:
# Fit the preprocessor on the raw training features and transform them in one step.
# NOTE: X_train is overwritten with its transformed version, so this cell must run
# exactly once after the split cell; running it again would fit the preprocessor
# on already-transformed data.
print("Prétraitements on train set: valeurs manquantes, Scale ...")

X_train = preprocessor.fit_transform(X_train)


Prétraitements on train set: valeurs manquantes, Scale ...


In [None]:
# No LabelEncoder on Y: the target is numeric, so it needs no encoding

#labelencoder = LabelEncoder()

#print("Encoding labels on train set...")
#Y_train = labelencoder.fit_transform(Y_train)

In [8]:
# Fit the baseline model: a plain linear regression.
print("Training model...")

model = LinearRegression()

# Fitting uses the training split only; the fitted estimator is the cell output.
model.fit(X=X_train, y=Y_train)


Training model...


LinearRegression()

In [9]:
# Predict on the data the model was trained on (needed for the training R2).
print("Predictions on training set...")
Y_train_pred = model.predict(X=X_train)

# Show the first three predictions as the cell output.
Y_train_pred[:3]

Predictions on training set...


array([ 516482.99754619, 1156582.89497736, 1439732.70890009])

### II-2 : Test Pipeline

In [10]:

# Apply the preprocessor to the test features with transform() only, so the
# parameters fitted on the training set are reused rather than refitted.
# NOTE: X_test is overwritten with its transformed version, so this cell must run
# exactly once after the split cell.
print("Prétraitements on test set...")

X_test = preprocessor.transform(X_test)

Prétraitements on test set...


In [11]:
# Predict weekly sales for the preprocessed test features.
print("Predictions on test set...")
Y_test_pred = model.predict(X=X_test)


Predictions on test set...


In [12]:
# Put actual and predicted weekly sales side by side for a quick visual check.
print("comaraison Y_test et Y_test_pred..")

compa_Y = pd.DataFrame(data={'reality': Y_test, 'predi': Y_test_pred})
# First five rows as the cell output.
compa_Y.iloc[:5]

comaraison Y_test et Y_test_pred..


Unnamed: 0,reality,predi
0,757738.76,951020.1
1,418925.47,433548.9
2,1532308.78,1641768.0
3,1466046.67,1503866.0
4,268929.03,341778.7


### II-3 : Évaluation des performances

In [13]:
# Coefficient of determination (R^2) of the baseline model on both splits.
r2_train = r2_score(y_true=Y_train, y_pred=Y_train_pred)
r2_test = r2_score(y_true=Y_test, y_pred=Y_test_pred)

print("R2 score on training set : ", r2_train)
print("R2 score on test set : ", r2_test)

R2 score on training set :  0.9617837897020081
R2 score on test set :  0.9483868355993639


In [14]:
# Error metrics of the baseline model on the test set.
from sklearn import metrics

# Compute the MSE once and reuse it for the RMSE.
baseline_mse = metrics.mean_squared_error(Y_test, Y_test_pred)

# The label is "MAE" (mean absolute error); it was previously misspelled "MEA".
print(f'MAE: {metrics.mean_absolute_error(Y_test,Y_test_pred)}')
print(f'MSE: {baseline_mse}')
print(f'RMSE: {np.sqrt(baseline_mse)}')

MEA: 111659.25652383358
MSE: 21892856146.91282
RMSE: 147962.34705800263


In [15]:
# Explained variance of the baseline predictions on the test set (cell output).
metrics.explained_variance_score(y_true=Y_test, y_pred=Y_test_pred)

0.950044869949645

In [None]:
#plt.style.use('seaborn-dark')



# II -  Régression linéaire régularisée avec Ridge

### II-1 - Import dataset - séparation train & test set - préparation des variables numériques & catégorielles

In [21]:
# Reload the raw data and rebuild the train/test split and the preprocessor
# from scratch for the Ridge experiments.

# Load the dataset, keep only rows with a known Weekly_Sales, and remove the Date column
df = (
    pd.read_csv("src/Walmart_Store_sales.csv")
    .dropna(subset=['Weekly_Sales'], axis=0)
    .drop(columns=['Date'])
)

# Split the frame into features X and target Y
print("Separating labels from features...")
target_variable = "Weekly_Sales"
features_list = ['Store','Holiday_Flag','Temperature','Fuel_Price','CPI','Unemployment']

dataset = df

X = dataset[features_list]
Y = dataset[target_variable]

# Column positions within features_list
categorical_indices = [0,1]    # Store, Holiday_Flag
numeric_indices = [2,3,4,5]    # Temperature, Fuel_Price, CPI, Unemployment

# 80/20 split with a fixed seed so the split is reproducible
print("Dividing into train and test sets...")
split_parts = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train, X_test, Y_train, Y_test = split_parts

# scikit-learn is fed numpy arrays (features) and plain lists (targets)
print("Convert pandas DataFrames to numpy arrays...")
X_train, X_test = X_train.values, X_test.values
Y_train, Y_test = Y_train.tolist(), Y_test.tolist()

# Numeric columns: fill missing values with the column median, then standardize
numeric_features = [2,3,4,5] # Positions of numeric columns in X_train/X_test
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns (Store, Holiday_Flag): fill missing values with the most
# frequent value, then one-hot encode, dropping the first level of each feature
categorical_features = [0,1] # Positions of categorical columns in X_train/X_test
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')),
])

# One preprocessor object that applies each pipeline to its own column group
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])



Separating labels from features...
Dividing into train and test sets...
Convert pandas DataFrames to numpy arrays...


### II-2 - Training pipeline

In [23]:
# Fit the preprocessor (imputation, scaling, encoding) on the training features
# and replace X_train with its transformed version.
print("Prétraitements on train set: valeurs manquantes, Scale ...")
X_train = preprocessor.fit_transform(X=X_train)

# Refit a plain linear regression on the transformed training data.
# The fitted estimator is the cell output.
print("Training model...")
model = LinearRegression()
model.fit(X=X_train, y=Y_train)


Prétraitements on train set: valeurs manquantes, Scale ...
Training model...


LinearRegression()

#### a- Cross-validated score for a Ridge model (with default value of  λ )

In [24]:
# Estimate the generalized R2 score of a Ridge model (default settings) with 3-fold cross-validation
print("3-fold cross-validation...")
regressor = Ridge()
scores = cross_val_score(estimator=regressor, X=X_train, y=Y_train, cv=3)
print('The cross-validated R2-score is : ', np.mean(scores))
print('The standard deviation is : ', np.std(scores))

# Standard deviation: spread of the fold scores around their mean.
# A value close to zero means the per-fold scores are close to the mean.

3-fold cross-validation...
The cross-validated R2-score is :  0.8538981764890422
The standard deviation is :  0.05594174663336293


#### b- Grid search

In [25]:
# Search for the best regularization strength with scikit-learn's GridSearchCV,
# which selects the best combination among the listed hyperparameter values.
# NOTE(review): the recorded run picked alpha=0.05, the smallest value in the grid,
# so smaller values may be worth adding.

print("Grid search...")
regressor = Ridge()

# Candidate values for alpha (0 would correspond to no regularization)
params = dict(alpha=[0.05, 0.1, 0.2, 0.5, 1.0])

# cv: number of folds used for cross-validation
gridsearch = GridSearchCV(estimator=regressor, param_grid=params, cv=3)
gridsearch.fit(X_train, Y_train)

print()
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best R2 score : ", gridsearch.best_score_)

Grid search...

Best hyperparameters :  {'alpha': 0.05}
Best R2 score :  0.9198185341161347


In [26]:
# Predict on the training features with the fitted grid search object.
print("Predictions on training set...")

Y_train_pred = gridsearch.predict(X=X_train)

Predictions on training set...


### II-3 - Test pipeline with Grid search

In [28]:
# Apply the preprocessor fitted on the training set to the test features.
# Missing values are handled inside the preprocessor (both of its pipelines start
# with a SimpleImputer), so no separate imputation step is needed here. The
# earlier call to a standalone `imputer` object was removed: no such object is
# defined in this notebook, so the cell raised NameError on a fresh kernel.
print("Encoding categorical features and standardizing numerical features...")

X_test = preprocessor.transform(X_test)

Encoding categorical features and standardizing numerical features...


In [29]:
# Predict weekly sales for the test set with the fitted grid search object
Y_test_pred = gridsearch.predict(X=X_test)

# Print the full vector of test predictions
print(Y_test_pred)

[ 960517.5937938   437720.14965411 1651056.48691219 1470786.3133123
  350973.68200533  464325.82888609 1340704.11438538 2023873.6863152
 1384557.89290942  411162.26361408  611395.98871814 1511088.0963789
 1950992.18491328 1579284.61366373 1945407.78129948 1378334.93421671
  570999.73254358 2116354.39761071 2039540.12533715  937007.45653239
 1929996.1848169   597652.23987396 1929764.19154327  351939.12588648
 2039048.0929218  2000150.25736868 2124703.21616958 1271408.95077164]


In [30]:
# R^2 on the train and test sets for the Ridge model selected by the grid search

for score_label, actual, predicted in (
    ("R2 score on training set : ", Y_train, Y_train_pred),
    ("R2 score on test set : ", Y_test, Y_test_pred),
):
    print(score_label, r2_score(actual, predicted))

R2 score on training set :  0.9602438002618924
R2 score on test set :  0.9402364882097568


In [31]:
# Error metrics of the tuned Ridge model on the test set.
from sklearn import metrics

# Compute the MSE once and reuse it for the RMSE.
ridge_mse = metrics.mean_squared_error(Y_test, Y_test_pred)

# Label fixed: these predictions come from the Ridge grid search (not Lasso),
# and the metric acronym is MAE (mean absolute error).
print(f'MAE of Ridge Regression: {metrics.mean_absolute_error(Y_test,Y_test_pred)}')
print(f'MSE: {ridge_mse}')
print(f'RMSE: {np.sqrt(ridge_mse)}')

MEA of Lasso Regression:: 121065.15217552808
MSE: 25350004822.45178
RMSE: 159216.8484251958
