In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="white")

## Setting max displayed rows to 500, in order to display the full output of any command
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
#test_data = pd.read_csv("test.csv")
train_data = pd.read_csv("train.csv")

In [26]:
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV, RandomizedSearchCV

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder

from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.compose import ColumnTransformer

# pour afficher les pipelines
from sklearn import set_config
set_config(display='diagram')

# Fonctions de préparation

In [4]:
def log_prep_train_data(df):
    data = df.copy()
    # passage en catégoriel d'une variable numérique
    data["MSSubClass"] = data["MSSubClass"].astype(dtype="object")
    # log sur la target
    data["Log_SalePrice"] = np.log(data["SalePrice"])
    # outliers
    outliers_index = data[(data['LotArea'] > 100000) |
                (data["MasVnrArea"] > 1500) |
                (data['MiscVal']>6000) |
                (data['3SsnPorch']>350) |
                (data['EnclosedPorch']>400) |
                (data['OpenPorchSF']>450) |
                (data['1stFlrSF']>4000) |
                (data['TotalBsmtSF']>5000) |
                (data['GrLivArea']>4700)].index
    data = data.drop(outliers_index)
    # séparation X et y. Drop de LotFrontage où trop de valeurs manquantes
    X = data.drop(["Id", "LotFrontage", "SalePrice", "Log_SalePrice"], axis=1)
    y = data["Log_SalePrice"]
    return X, y

In [5]:
def name_cols(X):
    ordinal_cols = ["LotShape", "Utilities", "LandSlope", "Neighborhood", "ExterQual", "ExterCond", "BsmtQual", "BsmtCond", 
                "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "HeatingQC", "CentralAir", "KitchenQual",
               "FireplaceQu", "GarageFinish", "GarageQual", "GarageCond", "PavedDrive",
               "PoolQC", "Fence"]
    cat_cols = [col for col in X.columns if X[col].dtype == "object"]
    ohe_cols = [col for col in cat_cols if col not in ordinal_cols]
    num_cols = [col for col in X.columns if X[col].dtype in ["int64", "float64"]]
    return ordinal_cols, cat_cols, ohe_cols, num_cols

In [6]:
def prep_test_data(df):
    data = df.copy()
    data["MSSubClass"] = data["MSSubClass"].astype(dtype="object")
    X = data.drop(["Id", "LotFrontage"], axis=1)
    return X

# Chargement des données et utlisation des fonctions de préparation

In [7]:
train = pd.read_csv("train.csv")

In [8]:
test = pd.read_csv("test.csv")

In [9]:
X_train, y_train = log_prep_train_data(train)
X_train.shape

(1446, 78)

In [10]:
X_test = prep_test_data(test)
X_test.shape

(1459, 78)

In [11]:
ordinal_cols, cat_cols, ohe_cols, num_cols = name_cols(X_test)
print(len(ordinal_cols) + len(ohe_cols))
print(len(cat_cols))
print(len(cat_cols) + len(num_cols))
print(len(ordinal_cols))

44
44
78
21


# Préprocessing

## Préprocessing ordinal

In [13]:
# ATTENTION FAIT APPEL AU DATAFRAME TRAIN (il ne faut pas qu'il fasse appel au test)

# créer liste ordonnée des modalités de Neighborhood en fonction du prix de vente
ordonned_neighborhood_df = train[["Neighborhood", "SalePrice"]].groupby(["Neighborhood"], as_index=False).mean()
ordonned_neighborhood_df = ordonned_neighborhood_df.sort_values(by="SalePrice")

ordonned_neighborhood = list(ordonned_neighborhood_df["Neighborhood"])


# listes ordonnées pour encodage ordinal : 
ordinal_codes = [["IR3", "IR2", "IR1", "Reg"], 
        ["ELO", "NoSeWa", "NoSewr", "AllPub"], 
        ["Sev", "Mod", "Gtl"], 
        ordonned_neighborhood, 
        ["Po", "Fa", "TA", "Gd", "Ex"], 
        ["Po", "Fa", "TA", "Gd", "Ex"], 
        ["Missing", "Po", "Fa", "TA", "Gd", "Ex"], 
        ["Missing", "Po", "Fa", "TA", "Gd", "Ex"], 
        ["Missing", "No", "Mn", "Av", "Gd"], 
        ["Missing", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"], 
        ["Missing", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"], 
        ["Po", "Fa", "TA", "Gd", "Ex"],
        ["N", "Y"], 
        ["Po", "Fa", "TA", "Gd", "Ex"],
        ["Missing", "Po", "Fa", "TA", "Gd", "Ex"], 
        ["Missing", "Unf", "RFn", "Fin"], 
        ["Missing", "Po", "Fa", "TA", "Gd", "Ex"], 
        ["Missing", "Po", "Fa", "TA", "Gd", "Ex"], 
        ["N", "P", "Y"], 
        ["Missing", "Fa", "TA", "Gd", "Ex"], 
        ["Missing", "MnWw", "GdWo", "MnPrv", "GdPrv"]]
len(ordinal_codes)


21

In [14]:
# imputer données manquantes :
ordinal_imputer = SimpleImputer(strategy="constant", fill_value="Missing")

# encoder feature catégorielles en ordinal :
ordinal_encoder = OrdinalEncoder(categories = ordinal_codes, handle_unknown = "use_encoded_value", unknown_value=-999)
                                 
# ORDINAL PIPELINE :
ordinal_pipe = Pipeline([("ordinal_imputer", ordinal_imputer), ("ordinal_encoder", ordinal_encoder)])

## Préprocessing One Hot

In [15]:
# imputer données manquantes :
ohe_imputer = SimpleImputer(strategy="constant", fill_value="Missing")

# encoder features catégorielles en one hot :
ohe_encoder = OneHotEncoder(handle_unknown="ignore")

# OHE PIPELINE :
ohe_pipe = Pipeline([("ohe_imputer", ohe_imputer), ("ohe_encoder", ohe_encoder)])

## Préprocessing Numérique

In [16]:
# imputer données manquantes :
num_imputer = SimpleImputer(strategy="constant", fill_value=0)

# scaler les variables numériques :
num_scaler = MinMaxScaler()

# NUMERICAL PIPELINE :
num_pipe = Pipeline([("num_imputer", num_imputer), ("num_scaler", num_scaler)])

## Pipeline de prépro global

In [17]:
prepro = ColumnTransformer([("ohe_pipe", ohe_pipe, ohe_cols), ("num_pipe", num_pipe, num_cols), ("ordinal_pipe", ordinal_pipe, ordinal_cols)])
prepro.get_params()

{'n_jobs': None,
 'remainder': 'drop',
 'sparse_threshold': 0.3,
 'transformer_weights': None,
 'transformers': [('ohe_pipe',
   Pipeline(steps=[('ohe_imputer',
                    SimpleImputer(fill_value='Missing', strategy='constant')),
                   ('ohe_encoder', OneHotEncoder(handle_unknown='ignore'))]),
   ['MSSubClass',
    'MSZoning',
    'Street',
    'Alley',
    'LandContour',
    'LotConfig',
    'Condition1',
    'Condition2',
    'BldgType',
    'HouseStyle',
    'RoofStyle',
    'RoofMatl',
    'Exterior1st',
    'Exterior2nd',
    'MasVnrType',
    'Foundation',
    'Heating',
    'Electrical',
    'Functional',
    'GarageType',
    'MiscFeature',
    'SaleType',
    'SaleCondition']),
  ('num_pipe',
   Pipeline(steps=[('num_imputer',
                    SimpleImputer(fill_value=0, strategy='constant')),
                   ('num_scaler', MinMaxScaler())]),
   ['LotArea',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
    'MasVnrArea',

# Modèles et pipes associés

In [18]:
from sklearn.linear_model import Ridge, Lasso, LassoCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [19]:
model_ridge = Ridge()
pipe_ridge = Pipeline([("prepro", prepro), ("model_ridge", model_ridge)])

In [20]:
model_svr = SVR()
pipe_svr = Pipeline([("prepro", prepro), ("model_svr", model_svr)])

In [21]:
model_forest = RandomForestRegressor()
pipe_forest = Pipeline([("prepro", prepro), ("model_forest", model_forest)])

In [22]:
model_lassocv = LassoCV()
pipe_lassocv = Pipeline([("prepro", prepro), ("lasso_cv", model_lassocv)])

# Définition de la métrique
Submissions are evaluated on Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sales price. (Taking logs means that errors in predicting expensive houses and cheap houses will affect the result equally.)

In [35]:
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_squared_error
import math

In [36]:
def custom_score_log(y_true, y_pred):
    return math.sqrt(mean_squared_log_error(y_true, y_pred))

log_rmse = make_scorer(custom_score_log, greater_is_better=False)

In [37]:
def custom_score(y_true, y_pred):
    return math.sqrt(mean_squared_error(y_true, y_pred))


classic_rmse= make_scorer(custom_score, greater_is_better=False)

# Ridge

## Entrainement et prédiction sans split (Ridge)

In [42]:
X, y = log_prep_train_data(train)

In [43]:
pipe_ridge.fit(X, y)

In [44]:
cross_val_score(pipe_ridge, X_train, y_train, scoring=classic_rmse)

array([-0.11535955, -0.15321363, -0.13058881, -0.11493633, -0.11670488])

## Entrainement et prédiction avec split (Ridge)

In [45]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [46]:
pipe_ridge.fit(X_train, y_train)

In [48]:
cross_val_score(pipe_ridge, X_train, y_train, scoring=classic_rmse)

array([-0.12196351, -0.11388988, -0.13571353, -0.15703203, -0.11267503])

In [52]:
y_pred = pipe_ridge.predict(X_val)

In [53]:
custom_score(y_val, y_pred)

0.12508814703906207

## Optimisation du modèle Ridge :
Randomized Search + Grid Search
<br>Ensemble

In [64]:
pipe_ridge.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'prepro', 'model_ridge', 'prepro__n_jobs', 'prepro__remainder', 'prepro__sparse_threshold', 'prepro__transformer_weights', 'prepro__transformers', 'prepro__verbose', 'prepro__ohe_pipe', 'prepro__num_pipe', 'prepro__ordinal_pipe', 'prepro__ohe_pipe__memory', 'prepro__ohe_pipe__steps', 'prepro__ohe_pipe__verbose', 'prepro__ohe_pipe__ohe_imputer', 'prepro__ohe_pipe__ohe_encoder', 'prepro__ohe_pipe__ohe_imputer__add_indicator', 'prepro__ohe_pipe__ohe_imputer__copy', 'prepro__ohe_pipe__ohe_imputer__fill_value', 'prepro__ohe_pipe__ohe_imputer__missing_values', 'prepro__ohe_pipe__ohe_imputer__strategy', 'prepro__ohe_pipe__ohe_imputer__verbose', 'prepro__ohe_pipe__ohe_encoder__categories', 'prepro__ohe_pipe__ohe_encoder__drop', 'prepro__ohe_pipe__ohe_encoder__dtype', 'prepro__ohe_pipe__ohe_encoder__handle_unknown', 'prepro__ohe_pipe__ohe_encoder__sparse', 'prepro__num_pipe__memory', 'prepro__num_pipe__steps', 'prepro__num_pipe__verbose', 'prepro__num_pi

In [70]:
from sklearn.model_selection import RandomizedSearchCV
distributions_ridge = dict(model_ridge__solver = ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"], model_ridge__max_iter = np.arange(100, 10000, 100), model_ridge__alpha = np.linspace(0,1))

ridge_search = RandomizedSearchCV(pipe_ridge, distributions_ridge, random_state=0, scoring=classic_rmse)
ridge_search.fit(X_train, y_train)



In [75]:
print(f"Best Score : {ridge_search.best_score_}")
print(f"Best Parameters : {ridge_search.best_params_}")

Best Score : -0.12843732904810506
Best Parameters : {'model_ridge__solver': 'cholesky', 'model_ridge__max_iter': 5100, 'model_ridge__alpha': 0.9387755102040816}


# SVR

## Entrainement et prédiction avec split

In [76]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [78]:
pipe_svr.fit(X_train, y_train)

In [79]:
cross_val_score(pipe_svr, X_train, y_train, scoring=classic_rmse)

array([-0.16146011, -0.13563572, -0.14455032, -0.1458673 , -0.13460105])

In [81]:
y_pred = pipe_svr.predict(X_val)

In [82]:
custom_score(y_val, y_pred)

0.1474950174302717

## Optimisation du modèle - à faire

### Randomized Search

In [83]:
pipe_svr.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'prepro', 'model_svr', 'prepro__n_jobs', 'prepro__remainder', 'prepro__sparse_threshold', 'prepro__transformer_weights', 'prepro__transformers', 'prepro__verbose', 'prepro__ohe_pipe', 'prepro__num_pipe', 'prepro__ordinal_pipe', 'prepro__ohe_pipe__memory', 'prepro__ohe_pipe__steps', 'prepro__ohe_pipe__verbose', 'prepro__ohe_pipe__ohe_imputer', 'prepro__ohe_pipe__ohe_encoder', 'prepro__ohe_pipe__ohe_imputer__add_indicator', 'prepro__ohe_pipe__ohe_imputer__copy', 'prepro__ohe_pipe__ohe_imputer__fill_value', 'prepro__ohe_pipe__ohe_imputer__missing_values', 'prepro__ohe_pipe__ohe_imputer__strategy', 'prepro__ohe_pipe__ohe_imputer__verbose', 'prepro__ohe_pipe__ohe_encoder__categories', 'prepro__ohe_pipe__ohe_encoder__drop', 'prepro__ohe_pipe__ohe_encoder__dtype', 'prepro__ohe_pipe__ohe_encoder__handle_unknown', 'prepro__ohe_pipe__ohe_encoder__sparse', 'prepro__num_pipe__memory', 'prepro__num_pipe__steps', 'prepro__num_pipe__verbose', 'prepro__num_pipe

In [70]:
from sklearn.model_selection import RandomizedSearchCV
distributions_svr = dict(model_svr_kernel = ["linear", "poly", "rbf", "sigmoid", "precomputed"], model_svr__max_iter = np.arange(100, 10000, 100), model_svr__C = np.linspace(0,1))

svr_random_search = RandomizedSearchCV(pipe_ridge, distributions_ridge, random_state=0, scoring=classic_rmse)
svr_random_search.fit(X_train, y_train)

{‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’}

In [75]:
print(f"Best Score : {svr_random_search.best_score_}")
print(f"Best Parameters : {svr_random_search.best_params_}")

Best Score : -0.12843732904810506
Best Parameters : {'model_ridge__solver': 'cholesky', 'model_ridge__max_iter': 5100, 'model_ridge__alpha': 0.9387755102040816}


# Random Forest

## Entrainement et prédiction avec split

In [85]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [86]:
pipe_forest.fit(X_train, y_train)

In [87]:
cross_val_score(pipe_forest, X_train, y_train, scoring=classic_rmse)

array([-0.14841166, -0.14071319, -0.14335062, -0.15686144, -0.12119801])

In [88]:
y_pred = pipe_forest.predict(X_val)

In [89]:
custom_score(y_val, y_pred)

0.1340069933505423

## Optimisation des hyperparamières - A faire

# Modèles d'ensemble - à 