In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import warnings


# Silence DeprecationWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

# Importation de Plotly pour la visualisation des données
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

# Custom Plotly template whose only setting is the colour cycle ("colorway").
# It is registered under the name "jedha"; this cell does not make it the
# default template.
jedha_template = go.layout.Template(
    layout=dict(
        colorway=[
            "#4B9AC7",
            "#4BE8E0",
            "#9DD4F3",
            "#97FBF6",
            "#2A7FAF",
            "#23B1AB",
            "#0E3449",
            "#015955",
        ]
    )
)
pio.templates["jedha"] = jedha_template

# Use the "svg" renderer by default when figures are displayed.
pio.renderers.default = "svg"


# The dataset
Load the California Housing dataset again:

In [3]:
from sklearn import datasets

# Fetch the California Housing data (returned as a dict-like bundle).
data = datasets.fetch_california_housing()

# Build a DataFrame from the feature matrix, then append the target as the
# last column under the name 'Price'.
dataset = pd.DataFrame(data["data"], columns=data["feature_names"])
dataset.insert(loc=dataset.shape[1], column='Price', value=data["target"])

# Preview the first rows.
dataset.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


Remove the outliers:

In [4]:
# Boolean mask keeping only rows that fall strictly below every outlier threshold.
mask = (
    dataset['AveRooms'].lt(10)
    & dataset['AveBedrms'].lt(10)
    & dataset['Population'].lt(15000)
    & dataset['AveOccup'].lt(10)
    & dataset['Price'].lt(5)
)

# Keep the selected rows; .copy() yields an independent frame so later writes
# do not raise chained-assignment warnings.
dataset_filtered = dataset[mask].copy()

In [5]:
# Row count of the filtered dataset.
print(f"Number of rows: {len(dataset_filtered)}\n")

# Preview of the filtered dataset.
print("Display of dataset (first 5 rows):")
display(dataset_filtered.head())
print()

# Descriptive statistics for every column.
print("Basic statistics: ")
data_desc = dataset_filtered.describe(include='all')
display(data_desc)
print()

# Share of missing values per column, in percent, rounded to 2 decimals.
print("Percentage of missing values per column: ")
missing_values = (
    dataset_filtered.isnull()
    .sum()
    .mul(100)
    .div(len(dataset_filtered))
    .round(2)
)
display(missing_values)

Number of rows: 19398

Display of dataset (first 5 rows):


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422



Basic statistics: 


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
count,19398.0,19398.0,19398.0,19398.0,19398.0,19398.0,19398.0,19398.0,19398.0
mean,3.674497,28.496907,5.210648,1.066038,1442.17208,2.94464,35.637872,-119.567484,1.924128
std,1.563397,12.477953,1.168098,0.128846,1077.498768,0.766194,2.14296,2.004793,0.971784
min,0.4999,1.0,0.846154,0.333333,3.0,0.75,32.54,-124.35,0.14999
25%,2.5259,18.0,4.407329,1.005413,805.0,2.450413,33.93,-121.77,1.167
50%,3.4478,29.0,5.170038,1.047619,1185.5,2.842105,34.26,-118.49,1.741
75%,4.583175,37.0,5.944617,1.096884,1752.0,3.308127,37.72,-118.0,2.485
max,15.0001,52.0,9.979167,3.411111,13251.0,9.954545,41.95,-114.55,4.991



Percentage of missing values per column: 


MedInc        0.0
HouseAge      0.0
AveRooms      0.0
AveBedrms     0.0
Population    0.0
AveOccup      0.0
Latitude      0.0
Longitude     0.0
Price         0.0
dtype: float64

Separate the target from the features

In [6]:
# Split the filtered dataset into the target (Y) and the features (X).
print("Separating labels from features...")

# Name of the target column.
target_variable = "Price"

# Y is the target column alone; X is every other column.
Y = dataset_filtered[target_variable]
X = dataset_filtered.drop(columns=[target_variable])

print("...Done.", end="\n\n")

# Preview of the target.
print('Y (Target - Price) :', Y.head(), sep="\n", end="\n\n")

# Preview of the features.
print('X (Features) :', X.head(), sep="\n")

Separating labels from features...
...Done.

Y (Target - Price) :
0    4.526
1    3.585
2    3.521
3    3.413
4    3.422
Name: Price, dtype: float64

X (Features) :
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  
0    -122.23  
1    -122.22  
2    -122.24  
3    -122.25  
4    -122.25  


# From linear to non-linear regression

An easy way of implementing a non-linear regression is to create by hand more columns containing non-linear functions of the features.

For each explanatory variable $X$, create 5 new columns in X containing the following functions:

- $X^2$
- $X^3$
- $X^4$
- $1/X$
- $1/X^2$

In [7]:
# Add polynomial (x^2, x^3, x^4) and inverse (1/x, 1/x^2) columns for every
# base feature of X.

# Suffixes of the columns this cell creates. Columns already carrying one of
# these suffixes are skipped, so re-running the cell recomputes the same 40
# derived columns instead of stacking suffixes (e.g. 'MedInc_2_2').
# NOTE: this assumes no original feature name ends with one of these suffixes,
# which holds for the feature names shown in the dataset preview.
derived_suffixes = ('_2', '_3', '_4', '_inverse', '_inverse2')
features_list = pd.Index([c for c in X.columns if not c.endswith(derived_suffixes)])

for c in features_list:
    X[c + '_2'] = X[c] ** 2
    X[c + '_3'] = X[c] ** 3
    X[c + '_4'] = X[c] ** 4

    # Zeros are turned into NaN first, so the inverses are NaN (not +/-inf)
    # wherever the feature is exactly 0. Vectorised arithmetic replaces the
    # previous row-wise .apply and gives the same values.
    nonzero = X[c].replace(0, np.nan)
    X[c + '_inverse'] = 1 / nonzero
    X[c + '_inverse2'] = 1 / nonzero ** 2

# Preview the expanded feature frame.
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc_2,MedInc_3,...,Latitude_2,Latitude_3,Latitude_4,Latitude_inverse,Latitude_inverse2,Longitude_2,Longitude_3,Longitude_4,Longitude_inverse,Longitude_inverse2
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,69.308955,577.010912,...,1434.8944,54353.799872,2058922.0,0.026399,0.000697,14940.1729,-1826137.0,223208800.0,-0.008181,6.7e-05
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,68.913242,572.076387,...,1433.3796,54267.751656,2054577.0,0.026413,0.000698,14937.7284,-1825689.0,223135700.0,-0.008182,6.7e-05
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,52.669855,382.246204,...,1432.6225,54224.761625,2052407.0,0.02642,0.000698,14942.6176,-1826586.0,223281800.0,-0.008181,6.7e-05
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,31.844578,179.702136,...,1432.6225,54224.761625,2052407.0,0.02642,0.000698,14945.0625,-1827034.0,223354900.0,-0.00818,6.7e-05
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,14.793254,56.897815,...,1432.6225,54224.761625,2052407.0,0.02642,0.000698,14945.0625,-1827034.0,223354900.0,-0.00818,6.7e-05


Split your dataset into train (80%) and test (20%)

In [8]:
# Split features and target into training and test subsets.
print("Dividing dataset into train and test sets...")

# 20% of the rows go to the test set; random_state fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

print("...Done.", end="\n\n")

# Report the shape of each subset.
for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("Y_train", Y_train),
    ("Y_test", Y_test),
):
    print(f"{split_name} shape: {split.shape}")

Dividing dataset into train and test sets...
...Done.

X_train shape: (15518, 48)
X_test shape: (3880, 48)
Y_train shape: (15518,)
Y_test shape: (3880,)


Apply the same preprocessing as in the previous exercise

In [9]:
# Preprocessing: standardise the training features.
print("Preprocessing X_train...", "First 5 rows of X_train before scaling:", sep="\n")
print(X_train.head(), end="\n\n")  # rows as they are before scaling

# Fit the scaler on the training data only, then transform that same data.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

print("...Done!", "First 5 rows of X_train after scaling:", sep="\n")
print(X_train_scaled[:5, :])  # the scaled result is indexed positionally here

Preprocessing X_train...
First 5 rows of X_train before scaling:
       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
3235   2.3889       6.0  6.316614   1.294671       992.0  3.109718     36.09   
13981  3.4912       7.0  8.355308   1.554795      2933.0  2.511130     34.85   
9219   1.9464      36.0  4.975510   1.053061       639.0  2.608163     37.12   
10851  3.1667      22.0  3.803838   1.000000      1952.0  2.081023     33.66   
8888   4.2520      31.0  3.978296   1.039389      1985.0  1.595659     34.03   

       Longitude   MedInc_2   MedInc_3  ...  Latitude_2    Latitude_3  \
3235     -119.57   5.706843  13.633078  ...   1302.4881  47006.795529   
13981    -117.46  12.188477  42.552412  ...   1214.5225  42326.109125   
9219     -120.27   3.788473   7.373884  ...   1377.8944  51147.440128   
10851    -117.90  10.027989  31.755632  ...   1132.9956  38136.631896   
8888     -118.49  18.079504  76.874051  ...   1158.0409  39408.131827   

         Latitu

In [10]:
# Preprocessing: standardise the test features with the already-fitted scaler.
print("Preprocessing X_test...", "First 5 rows of X_test before scaling:", sep="\n")
print(X_test.head(), end="\n\n")  # rows as they are before scaling

# transform() only (no fit): the scaler fitted earlier on X_train is reused.
X_test_scaled = scaler.transform(X_test)

print("...Done!", "First 5 rows of X_test after scaling:", sep="\n")
print(X_test_scaled[:5, :])  # the scaled result is indexed positionally here

Preprocessing X_test...
First 5 rows of X_test before scaling:
       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
17333  5.2990      12.0  7.214932   1.047511      1200.0  2.714932     34.91   
1012   2.6667      44.0  4.541284   1.027523       277.0  2.541284     37.68   
5124   1.5521      30.0  3.850679   1.002262      1966.0  4.447964     33.99   
1845   6.3538      49.0  6.293886   1.017751      1148.0  2.264300     37.90   
4035   3.2154      20.0  4.133444   1.060181      7450.0  1.772122     34.17   

       Longitude   MedInc_2    MedInc_3  ...  Latitude_2    Latitude_3  \
17333    -120.44  28.079401  148.792746  ...   1218.7081  42545.099771   
1012     -121.77   7.111289   18.963674  ...   1419.7824  53497.400832   
5124     -118.26   2.409014    3.739031  ...   1155.3201  39269.330199   
1845     -122.28  40.370774  256.507827  ...   1436.4100  54439.939000   
4035     -118.52  10.338797   33.243368  ...   1167.5889  39896.512713   

         La