In [134]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


datapath = 'Jeux_de_donnees/'

# Load the data
housing = pd.read_csv(datapath + "housing.csv", thousands=',')

#### Discrétiser une variable

In [135]:
housing["median_house_value_disc"] = pd.cut(housing["median_house_value"],
                               bins=[0, 100000, 200000, 300000, 400000, 500000, np.inf],
                               labels=["class1", "class2", "class3", "class4", "class5", "class6"])

housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,median_house_value_disc
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,class5
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,class4
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,class4
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,class4
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,class4


In [136]:
housing["longitude_disc"] = pd.cut(housing["longitude"],
                               bins=[-124, -122, -120, -118, -116, -114, -112, np.inf],
                               labels=["class1", "class2", "class3",
                                       "class4", "class5", "class6", "class7"])

housing["longitude_disc"].head()

0    class1
1    class1
2    class1
3    class1
4    class1
Name: longitude_disc, dtype: category
Categories (7, object): [class1 < class2 < class3 < class4 < class5 < class6 < class7]

#### Stratification

In [138]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1,
                               test_size=0.2, random_state=42)

for train_index, test_index in split.split(housing, housing["median_house_value_disc"]):
    print(train_index)
    print(test_index )

[13175  7890  9496 ... 17481  2130  2116]
[ 8405 10283  2550 ...  7276  6497  8232]


In [139]:
for train_index, test_index in split.split(housing, housing["median_house_value_disc"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [140]:
strat_test_set["median_house_value_disc"].value_counts(normalize=True)

class2    0.400921
class3    0.235950
class1    0.177326
class4    0.101260
class6    0.046754
class5    0.037791
Name: median_house_value_disc, dtype: float64

In [141]:
housing["median_house_value_disc"].value_counts(normalize=True)

class2    0.400824
class3    0.236095
class1    0.177229
class4    0.101357
class6    0.046754
class5    0.037742
Name: median_house_value_disc, dtype: float64

### Integrer une variable qualitative

In [142]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

##### On va dabord créer les variables indicatrices

In [143]:
housing=housing.dropna()

cat_encoder = OneHotEncoder()
housing_long_1hot = cat_encoder.fit_transform(housing[["longitude_disc"]])
type(housing_long_1hot)

scipy.sparse.csr.csr_matrix

In [144]:
housing_long_1hot.toarray()

array([[1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       ...,
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

#### On va créer deux pipeline distinct, un pour traiter les variables qualitatives et l'autre pour les variables quantitative. On va ensuite les combiner grace à la fonction ColumnTransformer.

In [145]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

num_attribs = ["median_income", "latitude", "total_rooms"]
cat_attribs = ["longitude_disc"]

num_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
    ])

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

#### J'enlève les valeurs manquantes

In [146]:
strat_train_set = strat_train_set.dropna()
strat_test_set = strat_test_set.dropna()

#### Je définis mon jeu d'entrainemnt (X_train) à partir de mon full_pipeline et de la méthode fit transforme (cela va me retourner mes données avec les transformation tel que je les ai définis précédement). Idem pour le X_test.

In [147]:
X_train = full_pipeline.fit_transform(strat_train_set)
y_train = np.array(strat_train_set["median_house_value"])

X_test = full_pipeline.fit_transform(strat_test_set)
y_test = np.array(strat_test_set["median_house_value"])

"""
model = make_pipeline(StandardScaler(),
                     SGDRegressor(max_iter=1000, alpha=0.5))
"""

model = SGDRegressor(max_iter=1000, alpha=0.5)

model.fit(X_train, y_train)

SGDRegressor(alpha=0.5)

In [148]:
metric = {'modele': 'model ',
          'mean_absolute_error': mean_absolute_error(y_test, model.predict(X_test)),
          'mean_squared_error': mean_squared_error(y_test, model.predict(X_test)),
          'r2_score':r2_score(y_test, model.predict(X_test))
         }

metric["r2_score"]

0.45971359879505114