In [34]:
import math
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import Lasso, Ridge, ElasticNet
import pickle

In [2]:
# Load the pipe-separated export. low_memory=False makes pandas parse the file
# in one pass instead of chunk by chunk, so each column gets one inferred dtype.
df = pd.read_csv('../assets/data/df.csv', sep='|', low_memory=False)
# Untouched copy of the raw frame; `df` itself is filtered and reshaped below.
dfC = df.copy()

In [3]:
# (rows, columns) of the raw data.
df.shape

(3676922, 48)

Suppression des colonnes ayant 65 % ou plus de valeurs manquantes (NA), puis des colonnes listées explicitement dans la cellule suivante

In [4]:
# Percentage of missing values in each column.
missingValueRate = df.isna().mean() * 100
# Keep the columns strictly below the 65 % threshold, then discard the
# columns listed explicitly, in a single chained expression.
df = df.loc[:, missingValueRate < 65].drop(
    columns=['No disposition', 'No plan', 'Section', 'Nature culture', 'key', 'Unnamed: 0']
)

In [5]:
# Keep only the rows whose 'Nature mutation' is exactly 'Vente'.
df = df.loc[df['Nature mutation'].eq('Vente')]

In [6]:
# Discard the rows where 'Type local' is missing.
df = df.dropna(subset=['Type local'])

In [7]:
# Discard the rows where 'code_departement' is missing, then show the size.
df = df.dropna(subset=['code_departement'])
df.shape

(2273401, 20)

In [8]:
def defineSurface(x):
    """Pick the surface area to use for one transaction row.

    Parameters
    ----------
    x : mapping-like row (e.g. the Series passed by ``DataFrame.apply(axis=1)``)
        Must expose the keys 'Surface reelle bati' and 'Surface terrain'.

    Returns
    -------
    float
        'Surface reelle bati' when it is strictly positive, otherwise
        'Surface terrain' when it is strictly positive, otherwise NaN.

    Notes
    -----
    The missing marker is a float NaN rather than ``pd.NA`` so that the
    column built from this function keeps a float dtype instead of falling
    back to ``object``.
    """
    for column in ('Surface reelle bati', 'Surface terrain'):
        value = x[column]
        # A NaN compares False against 0, so a missing value falls through
        # to the next candidate column.
        if value > 0:
            return value
    return np.nan

In [9]:
# Vectorised form of the `defineSurface` rule: take 'Surface reelle bati' when
# it is strictly positive, else 'Surface terrain' when it is strictly positive,
# else NaN. This avoids a Python-level call per row and yields a float column
# (missing values are NaN, which `isna()` / `notna()` detect as before).
df['Surface'] = df['Surface reelle bati'].where(
    df['Surface reelle bati'] > 0,
    df['Surface terrain'].where(df['Surface terrain'] > 0),
)

In [10]:
# Price per unit of surface for each transaction ('Valeur fonciere' / 'Surface').
df['metre carre'] = df['Valeur fonciere'].div(df['Surface'])

In [11]:
# Mean price per square metre of each 'code_departement', as a two-column frame.
# NOTE(review): this average is derived from 'Valeur fonciere' over all rows and
# is later used as a feature to predict 'Valeur fonciere'; because it is computed
# before the train/test split, test rows contribute to it — confirm this is intended.
prix_carre = (
    df.groupby(['code_departement'])['metre carre']
    .mean()
    .reset_index()
)

In [12]:
# Remove the per-row 'metre carre' column.
df = df.drop(columns=['metre carre'])

In [13]:
# Left-join the per-department mean so every row carries its department's
# 'metre carre'; the merge also gives `df` a fresh 0..n-1 index.
df = pd.merge(df, prix_carre, on='code_departement', how='left')
df.head()

Unnamed: 0,Date mutation,Nature mutation,Valeur fonciere,No voie,Type de voie,Code voie,Voie,Code postal,Commune,Code departement,...,Code type local,Type local,Surface reelle bati,Nombre pieces principales,Surface terrain,code_departement,departement,total,Surface,metre carre
0,04/01/2018,Vente,67000.0,12.0,ALL,3044,DE LA PETITE REYSSOUZE,1000.0,BOURG-EN-BRESSE,1,...,2.0,Appartement,45.0,1.0,,1,Ain,647634.0,45.0,2294.079592
1,11/01/2018,Vente,76200.0,5.0,RUE,2690,MOLIERE,1000.0,BOURG-EN-BRESSE,1,...,2.0,Appartement,68.0,3.0,,1,Ain,647634.0,68.0,2294.079592
2,12/01/2018,Vente,130000.0,10.0,RUE,130,DE LA POMPE,1160.0,VARAMBON,1,...,1.0,Maison,80.0,3.0,55.0,1,Ain,647634.0,80.0,2294.079592
3,04/01/2018,Vente,164370.0,56.0,ALL,134,DU COURLIS CENDRE,1290.0,CORMORANCHE-SUR-SAONE,1,...,1.0,Maison,88.0,4.0,419.0,1,Ain,647634.0,88.0,2294.079592
4,15/01/2018,Vente,97000.0,9.0,RUE,133,DU CHATEAU D EAU,1750.0,SAINT-LAURENT-SUR-SAONE,1,...,2.0,Appartement,90.0,4.0,,1,Ain,647634.0,90.0,2294.079592


In [14]:
# (rows, columns) after the merge.
df.shape

(2273401, 22)

In [15]:
# Remove the rows that have no usable surface unless their 'Code type local'
# is 3 (those are kept even without a surface).
df = df[~((df['Code type local'] != 3) & (df['Surface'].isna()))]
df.shape

(2273169, 22)

In [16]:
# Force 'metre carre' to a float dtype.
# NOTE(review): presumably needed because the column comes out as object dtype
# when 'Surface' holds pd.NA markers — confirm with df.dtypes.
df['metre carre'] = df['metre carre'].astype('float')

In [17]:
# df.to_csv('./assets/data/df-regression.csv', sep='|', index=False)

In [18]:
# Houses and flats only, restricted to the columns of interest.
df2 = df.loc[
    df['Type local'].isin(['Maison', 'Appartement']),
    [
        'Surface reelle bati',
        'Surface',
        'metre carre',
        'Nombre pieces principales',
        'Valeur fonciere',
        'total',
        'Code departement',
        'Type local',
    ],
]
df2.head()


Unnamed: 0,Surface reelle bati,Surface,metre carre,Nombre pieces principales,Valeur fonciere,total,Code departement,Type local
0,45.0,45.0,2294.079592,1.0,67000.0,647634.0,1,Appartement
1,68.0,68.0,2294.079592,3.0,76200.0,647634.0,1,Appartement
2,80.0,80.0,2294.079592,3.0,130000.0,647634.0,1,Maison
3,88.0,88.0,2294.079592,4.0,164370.0,647634.0,1,Maison
4,90.0,90.0,2294.079592,4.0,97000.0,647634.0,1,Appartement


In [19]:
# Number of missing values per column of df2.
df2.isna().sum()

Surface reelle bati          14
Surface                       0
metre carre                   0
Nombre pieces principales    14
Valeur fonciere               0
total                         0
Code departement              0
Type local                    0
dtype: int64

In [20]:
# Remove outliers column by column with the 1.5 * IQR rule, for every
# non-object column of df2.
# The strict comparisons are False for NaN, so rows where the column is missing
# are dropped by the filter itself and no imputation step is needed afterwards.
# The bounds are computed on the frame as already filtered by the previous
# columns, so the outcome depends on the column order.
for col in df2.select_dtypes(exclude='object'):
    q1 = df2[col].quantile(q=0.25)
    q3 = df2[col].quantile(q=0.75)

    IQR = q3 - q1

    borne_inf = q1 - 1.5 * IQR
    borne_sup = q3 + 1.5 * IQR

    df2 = df2[(df2[col] > borne_inf) & (df2[col] < borne_sup)]
df2.shape

(1669947, 8)

In [76]:
# Second working frame: all property types, with the surface, price and
# department columns plus the type labels.
df3 = df.loc[
    :,
    [
        'Surface reelle bati',
        'Surface terrain',
        'Surface',
        'metre carre',
        'Valeur fonciere',
        'total',
        'Code departement',
        'Type local',
        'Code type local',
    ],
]
df3.shape

(2273169, 9)

In [78]:
# Discard the rows where 'Surface' is missing.
df3 = df3.dropna(subset=['Surface'])

In [124]:
# Discard the rows where 'Code type local' is missing.
df3 = df3.dropna(subset=['Code type local'])

In [126]:
# Number of missing values per column of df3.
# NOTE(review): the output recorded under this cell lists five columns, while
# df3 is built with nine columns above and is only narrowed further down; the
# cell was probably run out of order — re-run top to bottom to refresh it.
df3.isna().sum()

metre carre        0
Valeur fonciere    0
total              0
Surface            0
Code type local    0
dtype: int64

In [127]:
# Force 'Surface' to a float dtype (rows with a missing 'Surface' were removed above).
df3['Surface'] = df3['Surface'].astype('float')

In [128]:
# Checkpoint: independent copy of df3 at this stage.
df4 = df3.copy()

In [129]:
# Rebuild df3 from the df4 checkpoint (presumably so the cells below can be
# re-run from the same starting state).
df3 = df4.copy()

In [171]:
# Keep only the target and the three candidate features.
df3 = df3.loc[:, ['metre carre', 'Valeur fonciere', 'total', 'Surface']]

In [172]:
# (rows, columns) before outlier removal.
df3.shape

(1698731, 4)

In [173]:
# Remove outliers column by column with the 1.5 * IQR rule.
# The strict comparisons are False for NaN, so rows where the column is missing
# are dropped by the filter itself and no imputation step is needed afterwards.
# The bounds are computed on the frame as already filtered by the previous
# columns, so the outcome depends on the column order.
# NOTE(review): 'Valeur fonciere' is the regression target and is trimmed here
# before the train/test split, so the test metrics are measured on
# outlier-free targets only — confirm this is intended.
for col in ['metre carre', 'Valeur fonciere', 'total', 'Surface']:
    q1 = df3[col].quantile(q=0.25)
    q3 = df3[col].quantile(q=0.75)

    IQR = q3 - q1

    borne_inf = q1 - 1.5 * IQR
    borne_sup = q3 + 1.5 * IQR

    df3 = df3[(df3[col] > borne_inf) & (df3[col] < borne_sup)]
df3.shape

(1695528, 4)

In [174]:
# Pairwise correlation matrix of the four remaining columns.
df3.corr()

Unnamed: 0,metre carre,Valeur fonciere,total,Surface
metre carre,1.0,0.359785,0.539765,-0.209273
Valeur fonciere,0.359785,1.0,0.226096,0.423052
total,0.539765,0.226096,1.0,-0.082886
Surface,-0.209273,0.423052,-0.082886,1.0


In [175]:
# Name of the column the models are trained to predict.
target = 'Valeur fonciere'

In [176]:
# Target vector, and feature matrix made of every other column of df3.
y = df3[target]
X = df3.drop(columns=target)

In [177]:
# Hold out 30 % of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [190]:
# Standardise the features; the scaler is fitted on the training rows only and
# then reused unchanged on the test rows.
scaler = StandardScaler()
scaler.fit(X_train)

In [191]:
# Wrap the scaled training array back into a DataFrame with the original column
# names. The frame gets a new 0..n-1 index, so it matches y_train by position,
# not by index label.
X_train_sc = pd.DataFrame(
    data=scaler.transform(X_train),
    columns=X_train.columns,
)
X_train_sc.head()

Unnamed: 0,metre carre,total,Surface
0,-0.860231,-0.995381,-0.691658
1,0.295576,-0.536461,-0.504521
2,-0.591146,-0.757545,-1.128311
3,1.229314,0.131858,-1.596154
4,-0.646328,-0.49893,-0.130246


In [192]:
# Scale the test rows with the scaler fitted on the training rows and keep the
# column names (new 0..n-1 index, positional match with y_test).
X_test_sc = pd.DataFrame(
    data=scaler.transform(X_test),
    columns=X_test.columns,
)

In [193]:
# Single regression tree with bounded depth and minimum node sizes; the seed
# keeps the fit repeatable.
arbre = DecisionTreeRegressor(
    max_depth=9,
    min_samples_split=200,
    min_samples_leaf=5,
    random_state=0,
)

In [194]:
# Train the tree on the scaled training features.
arbre.fit(X_train_sc, y_train)

In [195]:
# Predict on the scaled test set with the fitted decision tree.
ypred = arbre.predict(X_test_sc)

# Score the predictions: R² and root mean squared error (RMSE is in the same
# unit as the target).
r2Score = r2_score(y_test, ypred)
rmse = math.sqrt(mean_squared_error(y_test, ypred))
print(rmse, r2Score)

68040.28362137037 0.43456688065240245


In [196]:
# with open("../assets/models/random-forest-regressor-tree-all.pkl", "wb") as f:
#     pickle.dump({'model': arbre, 'scaler' : scaler }, f)

In [189]:
# Display the predictions currently held in `ypred`.
ypred

array([219553.37476267, 203520.56566552, 225766.71029598, ...,
       142716.40993038, 264339.92641906,  60318.50722436])

In [184]:
# Feature names seen by the tree during fit (same order as the importances).
arbre.feature_names_in_

array(['metre carre', 'total', 'Surface'], dtype=object)

In [185]:
# Importance of each feature in the fitted tree, ordered like feature_names_in_.
arbre.feature_importances_

array([0.48855075, 0.02161394, 0.48983531])

In [186]:
# Random forest of 150 trees with the same depth and leaf limits as the single
# tree, and a larger minimum split size; trained on the scaled training features.
rndF = RandomForestRegressor(
    n_estimators=150,
    max_depth=9,
    min_samples_split=400,
    min_samples_leaf=5,
    random_state=0,
)
rndF.fit(X_train_sc, y_train)

In [187]:
# Predict on the scaled test set with the fitted random forest.
ypred = rndF.predict(X_test_sc)

# Score the predictions: R² and root mean squared error (RMSE is in the same
# unit as the target).
r2Score = r2_score(y_test, ypred)
rmse = math.sqrt(mean_squared_error(y_test, ypred))
print(rmse, r2Score)

67869.11718761426 0.43740818079091104


In [198]:
# Feature names seen by the forest during fit (same order as the importances).
rndF.feature_names_in_

array(['metre carre', 'total', 'Surface'], dtype=object)

In [197]:
# Importance of each feature in the fitted forest, ordered like feature_names_in_.
rndF.feature_importances_

array([0.48594445, 0.02436163, 0.48969391])

In [188]:
# with open("../assets/models/random-forest-regressor-all.pkl", "wb") as f:
#     pickle.dump({'model': rndF, 'scaler' : scaler }, f)