# Modèle

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from statsmodels.tools.tools import add_constant
from statsmodels.robust.robust_linear_model import RLM

## Sans enrichissement

In [2]:
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
# Read the pickled frame and renumber its rows from 0, discarding the previous index.
data = pd.read_pickle('store/after_analyze.pkl.xz').reset_index(drop=True)
data.head()

Unnamed: 0,date_mutation,valeur_fonciere,adresse_numero,adresse_suffixe,adresse_nom_voie,adresse_code_voie,code_postal,nom_commune,lot1_surface_carrez,type_local,surface_reelle_bati,nombre_pieces_principales,surface_terrain,longitude,latitude
0,2014-01-02,194400.0,23,,BD EMILE ROMANET,2794,44100,Nantes,,Appartement,84.0,4,,-1.603261,47.209692
1,2014-01-02,107000.0,11,,RUE DU DOCTEUR ALFRED CORLAY,882,44800,Saint-Herblain,45.8,Appartement,46.0,2,,-1.644254,47.207462
2,2014-01-09,208154.0,38,,RUE DE LA PLANCHE AU GUE,5555,44300,Nantes,103.64,Appartement,103.0,5,,-1.519753,47.272364
3,2014-01-06,79000.0,8,,RUE DES CARMELITES,1436,44000,Nantes,26.45,Appartement,25.0,1,,-1.551322,47.216626
4,2014-01-02,335000.0,23,,RUE DES CANARIS,1390,44300,Nantes,,Maison,118.0,5,562.0,-1.506824,47.231122


On supprime les colonnes portant des informations géographiques :
* adresse (n° de voie, nom de voie, code postal, commune, etc.)
* coordonnées

On supprime la colonne : **lot1_surface_carrez**

In [3]:
# Columns removed before modelling: address parts, coordinates and lot1_surface_carrez.
l_droppedColumns = [
    'adresse_numero',
    'adresse_suffixe',
    'adresse_nom_voie',
    'adresse_code_voie',
    'code_postal',
    'nom_commune',
    'longitude',
    'latitude',
    'lot1_surface_carrez',
]
data = data.drop(columns=l_droppedColumns)
data.head()

Unnamed: 0,date_mutation,valeur_fonciere,type_local,surface_reelle_bati,nombre_pieces_principales,surface_terrain
0,2014-01-02,194400.0,Appartement,84.0,4,
1,2014-01-02,107000.0,Appartement,46.0,2,
2,2014-01-09,208154.0,Appartement,103.0,5,
3,2014-01-06,79000.0,Appartement,25.0,1,
4,2014-01-02,335000.0,Maison,118.0,5,562.0


On récupère l'année et le mois de la vente.

In [4]:
# Extract the sale year and month from the 'YYYY-MM-DD' strings with vectorised string
# slicing; a row-wise DataFrame.apply(axis=1) makes one Python call per row for the same result.
s_dates = data['date_mutation']
data['annee'] = s_dates.str[:4].astype(int)
data['mois'] = s_dates.str[5:7].astype(int)
# The raw date string is no longer needed once both parts are extracted.
data = data.drop(columns='date_mutation')
data.head()

Unnamed: 0,valeur_fonciere,type_local,surface_reelle_bati,nombre_pieces_principales,surface_terrain,annee,mois
0,194400.0,Appartement,84.0,4,,2014,1
1,107000.0,Appartement,46.0,2,,2014,1
2,208154.0,Appartement,103.0,5,,2014,1
3,79000.0,Appartement,25.0,1,,2014,1
4,335000.0,Maison,118.0,5,562.0,2014,1


On remplace les NaN de la colonne **surface_terrain** par la valeur correspondante de la colonne **surface_reelle_bati**.

In [5]:
# Rows with a missing surface_terrain fall back to their surface_reelle_bati value.
# The result is assigned back to the column: calling fillna(inplace=True) on the Series
# returned by data['surface_terrain'] is chained assignment, which pandas warns about and
# which leaves `data` unchanged once Copy-on-Write is enabled.
data['surface_terrain'] = data['surface_terrain'].fillna(data['surface_reelle_bati'])
data.head()

Unnamed: 0,valeur_fonciere,type_local,surface_reelle_bati,nombre_pieces_principales,surface_terrain,annee,mois
0,194400.0,Appartement,84.0,4,84.0,2014,1
1,107000.0,Appartement,46.0,2,46.0,2014,1
2,208154.0,Appartement,103.0,5,103.0,2014,1
3,79000.0,Appartement,25.0,1,25.0,2014,1
4,335000.0,Maison,118.0,5,562.0,2014,1


On encode la colonne **type_local**.

In [6]:
# One-hot encode type_local: one indicator column per category.
# dtype=int keeps the indicators numeric on every pandas version. pandas >= 2.0 defaults to
# bool, and a frame mixing bool and float columns converts to an object array, which breaks
# the numeric design matrix handed to statsmodels later on.
one_hot = pd.get_dummies(data['type_local'], dtype=int)
data = data.drop(columns='type_local').join(one_hot)
data.head()

Unnamed: 0,valeur_fonciere,surface_reelle_bati,nombre_pieces_principales,surface_terrain,annee,mois,Appartement,Maison
0,194400.0,84.0,4,84.0,2014,1,1,0
1,107000.0,46.0,2,46.0,2014,1,1,0
2,208154.0,103.0,5,103.0,2014,1,1,0
3,79000.0,25.0,1,25.0,2014,1,1,0
4,335000.0,118.0,5,562.0,2014,1,0,1


Création des jeux d'entrainement et des jeux de test.

In [7]:
# Hold out 25 % of the rows for testing; the fixed random_state makes the split repeatable.
l_featureColumns = ['surface_reelle_bati', 'nombre_pieces_principales', 'surface_terrain', 'annee', 'mois', 'Maison']
s_targetColumn = 'valeur_fonciere'
data_train, data_test = train_test_split(data, test_size=0.25, random_state=42)
X_train, y_train = data_train[l_featureColumns], data_train[s_targetColumn]
X_test, y_test = data_test[l_featureColumns], data_test[s_targetColumn]

On génère des variables polynomiales (puissances 1 à 3 de chaque variable numérique) à partir de X_train et X_test.

In [8]:
def build_polynomial(df_x, l_columns, i_degree):
    """Expand selected columns of ``df_x`` into their successive powers.

    For every column name ``n`` in ``l_columns`` and every degree ``d`` from 1 to
    ``i_degree``, the result holds a column named ``n^d`` containing ``df_x[n] ** d``
    computed with ``np.float_power``. The columns of ``df_x`` that are not listed in
    ``l_columns`` are appended unchanged after the power columns.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Input features.
    l_columns : list of str
        Names of the columns to expand.
    i_degree : int
        Highest power to generate.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df_x``; ``df_x`` itself is not modified.
    """
    # Seed the result with df_x's index so that every row survives the final join even when
    # no power column is generated (empty l_columns or i_degree < 1). Starting from a bare
    # pd.DataFrame() would left-join on an empty index and return zero rows in that case.
    df_polynomial = pd.DataFrame(index=df_x.index)
    for n in l_columns:
        for d in range(1, i_degree + 1):
            df_polynomial[n + '^' + str(d)] = np.float_power(df_x[n], d)
    l_passThrough = [c for c in df_x.columns if c not in l_columns]
    return df_polynomial.join(df_x[l_passThrough])

# Expand every numeric predictor to powers 1..3 on both splits; columns not listed here
# (the 'Maison' indicator) are passed through unchanged by build_polynomial.
l_polynomialColumns = ['surface_reelle_bati', 'nombre_pieces_principales', 'surface_terrain', 'annee', 'mois']
i_polynomialDegree = 3
X_train = build_polynomial(X_train, l_polynomialColumns, i_polynomialDegree)
X_test = build_polynomial(X_test, l_polynomialColumns, i_polynomialDegree)

X_train.head()

Unnamed: 0,surface_reelle_bati^1,surface_reelle_bati^2,surface_reelle_bati^3,nombre_pieces_principales^1,nombre_pieces_principales^2,nombre_pieces_principales^3,surface_terrain^1,surface_terrain^2,surface_terrain^3,annee^1,annee^2,annee^3,mois^1,mois^2,mois^3,Maison
31956,20.0,400.0,8000.0,1.0,1.0,1.0,20.0,400.0,8000.0,2018.0,4072324.0,8217950000.0,5.0,25.0,125.0,0
31565,37.0,1369.0,50653.0,2.0,4.0,8.0,37.0,1369.0,50653.0,2018.0,4072324.0,8217950000.0,4.0,16.0,64.0,0
5665,70.0,4900.0,343000.0,5.0,25.0,125.0,70.0,4900.0,343000.0,2014.0,4056196.0,8169179000.0,10.0,100.0,1000.0,1
3984,259.0,67081.0,17373979.0,4.0,16.0,64.0,4102.0,16826404.0,69021910000.0,2014.0,4056196.0,8169179000.0,1.0,1.0,1.0,1
7185,90.0,8100.0,729000.0,6.0,36.0,216.0,315.0,99225.0,31255880.0,2015.0,4060225.0,8181353000.0,3.0,9.0,27.0,1


Sélection avant-arrière des variables explicatives.

In [9]:
def compute_rlm(df_x, df_y):
    """Fit a statsmodels robust linear model (RLM) of ``df_y`` on ``df_x``.

    The regressors are passed through ``add_constant`` before fitting; ``df_x`` itself
    is not rebound for the caller.

    Parameters
    ----------
    df_x : regressors, one column per explanatory variable.
    df_y : response values.

    Returns
    -------
    The object returned by ``RLM(df_y, <regressors with constant>).fit()``.
    """
    df_design = add_constant(df_x)
    return RLM(df_y, df_design).fit()

def stepwise_selection(df_x, df_y, l_initialFeatures=[], f_thresholdIn=0.01, f_thresholdOut=0.05):
    """Forward-backward (stepwise) selection of explanatory variables by p-value.

    Each iteration performs:

    * a forward step: every column not yet selected is tried in turn by fitting
      ``compute_rlm`` on the selected columns plus that candidate; the candidate with the
      smallest p-value is added if that p-value is below ``f_thresholdIn``;
    * a backward step: the model is refitted on the selected columns and the one with the
      largest p-value is removed if that p-value exceeds ``f_thresholdOut``.

    The loop stops once an iteration neither adds nor removes a column.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Candidate explanatory variables.
    df_y : array-like
        Response passed to ``compute_rlm``.
    l_initialFeatures : list of str, optional
        Columns selected at start. The list is copied, never mutated.
    f_thresholdIn : float, optional
        A candidate enters when its p-value is strictly below this value.
    f_thresholdOut : float, optional
        A selected column leaves when its p-value is strictly above this value.

    Returns
    -------
    list of str
        Selected column names, in the order they were added.
    """
    l_selectedFeatures = list(l_initialFeatures)
    b_hasChanged = True
    while b_hasChanged:
        b_hasChanged = False
        # Forward step
        # Candidates are listed in df_x column order rather than through a set, so that
        # ties between equal p-values (e.g. several exact 0.0) are always broken the same
        # way instead of depending on string hash order.
        l_excludedFeatures = [c for c in df_x.columns if c not in l_selectedFeatures]
        # Explicit float dtype: without it recent pandas builds an object Series here.
        df_pval = pd.Series(index=l_excludedFeatures, dtype=float)
        for s_column in l_excludedFeatures:
            sm_model = compute_rlm(df_x[l_selectedFeatures + [s_column]], df_y)
            df_pval[s_column] = sm_model.pvalues[s_column]
        # On an empty or all-NaN Series min() is NaN and the comparison below is False.
        f_bestPval = df_pval.min()
        if f_bestPval < f_thresholdIn:
            l_selectedFeatures.append(df_pval.idxmin())
            b_hasChanged = True
        # Backward step (nothing to remove while no feature is selected)
        if l_selectedFeatures:
            sm_model = compute_rlm(df_x[l_selectedFeatures], df_y)
            # Select the features' p-values by label; dropping the first entry by position
            # only excludes the intercept if it is actually present and comes first.
            df_pval = sm_model.pvalues[l_selectedFeatures]
            f_worstPval = df_pval.max()
            if f_worstPval > f_thresholdOut:
                l_selectedFeatures.remove(df_pval.idxmax())
                b_hasChanged = True
    return l_selectedFeatures

# Run the stepwise search on the training split, show the retained columns,
# then refit the robust model on those columns only.
features = stepwise_selection(df_x=X_train, df_y=y_train)
print(features)
model = compute_rlm(df_x=X_train[features], df_y=y_train)

  return ptp(axis=axis, out=out, **kwargs)


['surface_reelle_bati^2', 'nombre_pieces_principales^1', 'surface_terrain^1', 'surface_reelle_bati^1', 'surface_terrain^2', 'surface_reelle_bati^3', 'surface_terrain^3', 'annee^1', 'annee^3', 'annee^2', 'Maison', 'mois^1']


In [10]:
# Display the statsmodels summary (coefficients, standard errors, p-values) of the fitted model.
model.summary()

0,1,2,3
Dep. Variable:,valeur_fonciere,No. Observations:,29154.0
Model:,RLM,Df Residuals:,29147.0
Method:,IRLS,Df Model:,6.0
Norm:,HuberT,,
Scale Est.:,mad,,
Cov Type:,H1,,
Date:,"Tue, 14 May 2019",,
Time:,10:36:19,,
No. Iterations:,11,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0001,1.87e-05,7.411,0.000,0.000,0.000
surface_reelle_bati^2,6.3526,0.271,23.466,0.000,5.822,6.883
nombre_pieces_principales^1,2448.4179,493.580,4.961,0.000,1481.019,3415.817
surface_terrain^1,50.6954,1.227,41.314,0.000,48.290,53.101
surface_reelle_bati^1,1154.9828,50.562,22.843,0.000,1055.883,1254.082
surface_terrain^2,-0.0020,0.000,-19.489,0.000,-0.002,-0.002
surface_reelle_bati^3,-0.0113,0.000,-26.051,0.000,-0.012,-0.010
surface_terrain^3,1.495e-08,9.12e-10,16.390,0.000,1.32e-08,1.67e-08
annee^1,0.0919,0.013,7.308,0.000,0.067,0.117


Racine de l'erreur quadratique moyenne (RMSE) :

In [11]:
# Evaluate on the held-out split, using only the retained features plus the constant column.
X_test = add_constant(X_test[features])
# np.asarray (rather than .values) keeps this cell re-runnable: on a second execution
# y_test is already an ndarray, which has no .values attribute.
y_test = np.asarray(y_test)
y_pred = np.asarray(model.predict(X_test))

# Root-mean-square error of the predictions on the test split.
np.sqrt(np.mean((y_test - y_pred) ** 2))

586610.4487001512