# Régression linéaire multiple

In [5]:
# Import des librairies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

---
## Les appartements

### Les données

In [6]:
# Load the apartments dataset from CSV and preview its first rows.
apparts_csv = 'data/apparts.csv'
df = pd.read_csv(apparts_csv)
df.head()

Unnamed: 0,taille_en_pieds_carre,nb_chambres,prix
0,2104,3,399900
1,1600,3,329900
2,2400,3,369000
3,1416,2,232000
4,3000,4,539900


In [7]:
# Convert the surface from square feet to square metres.
SQFT_TO_M2 = 0.092903  # conversion factor applied to taille_en_pieds_carre
df['taille_m2'] = df['taille_en_pieds_carre'].mul(SQFT_TO_M2)
df.head()

Unnamed: 0,taille_en_pieds_carre,nb_chambres,prix,taille_m2
0,2104,3,399900,195.467912
1,1600,3,329900,148.6448
2,2400,3,369000,222.9672
3,1416,2,232000,131.550648
4,3000,4,539900,278.709


In [8]:
# Feature matrix X: the two explanatory columns used by the regression.
feature_cols = ['taille_m2', 'nb_chambres']
X = df.loc[:, feature_cols]
X.head()

Unnamed: 0,taille_m2,nb_chambres
0,195.467912,3
1,148.6448,3
2,222.9672,3
3,131.550648,2
4,278.709,4


In [9]:
# Target y, kept two-dimensional as a single-column DataFrame.
target_cols = ['prix']
y = df.loc[:, target_cols]
y.head()

Unnamed: 0,prix
0,399900
1,329900
2,369000
3,232000
4,539900


In [10]:
# Check the dimensions of X and y
X.shape, y.shape

((47, 2), (47, 1))

### Création des échantillons de test et d'entraînement

In [30]:
# Hold out 10 observations for testing (an integer test_size is an absolute
# number of samples, not a fraction). The split is seeded so that
# "Restart & Run All" reproduces the same train/test sets, and therefore the
# same scores and predictions further down.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=10, random_state=42
)

In [31]:
# Dimensions of the train/test splits of X and y
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((37, 2), (10, 2), (37, 1), (10, 1))

### Construction du modèle

#### Scikit-learn

In [32]:
# Fit a scikit-learn linear regression on the training split.
from sklearn.linear_model import LinearRegression
lr = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator itself
lr

In [33]:
# R² evaluation on the training and test splits.
sc_train, sc_test = (
    lr.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)
print(f"{sc_train=} et {sc_test=}")

sc_train=0.7342512950385309 et sc_test=0.5614373628524003


In [34]:
# RMSE evaluation on the training and test splits.
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE
# explicitly works on every version. The variables are named rmse_* because
# they hold root-mean-squared errors, not MSEs.
from sklearn.metrics import mean_squared_error
rmse_train = float(np.sqrt(mean_squared_error(y_train, lr.predict(X_train))))
rmse_test = float(np.sqrt(mean_squared_error(y_test, lr.predict(X_test))))
print(f"{rmse_train=} et {rmse_test=}")

mse_train=68573.84633878572 et mse_test=43882.46683029856


In [35]:
# Side-by-side table of observed and predicted prices on the test split,
# indexed like y_test.
pred = pd.DataFrame({
    'y_true': y_test['prix'],
    # lr was fitted on a 2-D target, so predict() returns one column: flatten it
    'y_pred': lr.predict(X_test).ravel(),
})
pred

Unnamed: 0,y_true,y_pred
1,329900,291718.660419
37,345000,356031.707244
12,329999,331367.407001
0,399900,360625.447582
31,169900,226530.561302
34,285900,312144.646372
2,369000,401094.513059
5,299900,335933.894321
10,239999,329781.50261
3,232000,274984.109493


#### Statsmodels

In [36]:
# Build the same regression with statsmodels: the intercept is supplied
# explicitly as a constant column added to the training features.
from statsmodels.api import OLS, add_constant
design_train = add_constant(X_train)
lr2 = OLS(y_train, design_train)
res = lr2.fit()

In [37]:
# Model evaluation: statsmodels summary table (R², coefficients, p-values, diagnostics)
res.summary()

0,1,2,3
Dep. Variable:,prix,R-squared:,0.734
Model:,OLS,Adj. R-squared:,0.719
Method:,Least Squares,F-statistic:,46.97
Date:,"Tue, 07 Nov 2023",Prob (F-statistic):,1.64e-10
Time:,15:45:07,Log-Likelihood:,-464.52
No. Observations:,37,AIC:,935.0
Df Residuals:,34,BIC:,939.9
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,9.823e+04,5.57e+04,1.764,0.087,-1.5e+04,2.11e+05
taille_m2,1471.6405,181.696,8.099,0.000,1102.390,1840.891
nb_chambres,-8421.8952,2.1e+04,-0.402,0.690,-5.1e+04,3.42e+04

0,1,2,3
Omnibus:,1.923,Durbin-Watson:,1.525
Prob(Omnibus):,0.382,Jarque-Bera (JB):,1.311
Skew:,0.46,Prob(JB):,0.519
Kurtosis:,3.056,Cond. No.,1030.0


In [41]:
# New predictions on the test split.
# add_constant defaults to has_constant='skip': if a column of the (small)
# test split happens to be constant -- e.g. every held-out row has the same
# nb_chambres -- no intercept column is added and predict() fails on a shape
# mismatch with the fitted parameters. Always add the intercept column instead.
res.predict(add_constant(X_test, has_constant='add'))

1     291718.660419
37    356031.707244
12    331367.407001
0     360625.447582
31    226530.561302
34    312144.646372
2     401094.513059
5     335933.894321
10    329781.502610
3     274984.109493
dtype: float64

### Exo

Refaire la régression en standardisant les variables explicatives, soit « à la main », soit avec le `StandardScaler` de `sklearn`.

---
## Les entreprises

### Les données

In [43]:
# Load the companies dataset and preview it.
# Note: this rebinds `df`, replacing the apartments frame loaded earlier.
entreprises_csv = 'data/entreprises.csv'
df = pd.read_csv(entreprises_csv)
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [44]:
# Target y is the Profit column; X is every other column.
y = df['Profit']
X = df.drop(columns='Profit')
X.shape, y.shape

((50, 4), (50,))

### Gestion des variables catégoriques

In [45]:
# Distribution of rows across the values of State
df['State'].value_counts()

State
New York      17
California    17
Florida       16
Name: count, dtype: int64

In [50]:
# One-hot encode State with pd.get_dummies, starting again from the raw
# features so the cell can be re-run safely.
# drop_first removes one of the indicator columns to avoid collinear variables:
# if all of them are kept, "California + Florida + New York = 1", which we do
# not want.
raw_features = df.drop(columns='Profit')
X = pd.get_dummies(
    raw_features,
    columns=['State'],
    prefix="",
    prefix_sep="",
    drop_first=True,
    dtype=int,
)
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0


### Le modèle

In [51]:
# Full model: Profit regressed on every column of X plus an intercept.
from statsmodels.api import OLS, add_constant
design = add_constant(X)
lr = OLS(y, design)
res = lr.fit()
res.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Tue, 07 Nov 2023",Prob (F-statistic):,1.34e-27
Time:,15:59:29,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
R&D Spend,0.8060,0.046,17.369,0.000,0.712,0.900
Administration,-0.0270,0.052,-0.517,0.608,-0.132,0.078
Marketing Spend,0.0270,0.017,1.574,0.123,-0.008,0.062
Florida,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
New York,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [52]:
# Refit without the "New York" indicator.
from statsmodels.api import OLS, add_constant
kept_features = X.drop(columns="New York")
lr = OLS(y, add_constant(kept_features))
res = lr.fit()
res.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Tue, 07 Nov 2023",Prob (F-statistic):,8.49e-29
Time:,16:02:34,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
R&D Spend,0.8060,0.046,17.606,0.000,0.714,0.898
Administration,-0.0270,0.052,-0.523,0.604,-0.131,0.077
Marketing Spend,0.0270,0.017,1.592,0.118,-0.007,0.061
Florida,220.1585,2900.536,0.076,0.940,-5621.821,6062.138

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [53]:
# Refit without either State indicator ("New York" and "Florida").
from statsmodels.api import OLS, add_constant
kept_features = X.drop(columns=["New York", "Florida"])
lr = OLS(y, add_constant(kept_features))
res = lr.fit()
res.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Tue, 07 Nov 2023",Prob (F-statistic):,4.53e-30
Time:,16:03:42,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
R&D Spend,0.8057,0.045,17.846,0.000,0.715,0.897
Administration,-0.0268,0.051,-0.526,0.602,-0.130,0.076
Marketing Spend,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [54]:
# Refit without the State indicators and without "Administration".
from statsmodels.api import OLS, add_constant
kept_features = X.drop(columns=["New York", "Florida", "Administration"])
lr = OLS(y, add_constant(kept_features))
res = lr.fit()
res.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Tue, 07 Nov 2023",Prob (F-statistic):,2.1600000000000003e-31
Time:,16:04:47,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
R&D Spend,0.7966,0.041,19.266,0.000,0.713,0.880
Marketing Spend,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0
