In [50]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, ShuffleSplit

import numpy as np

In [23]:
years = [18,19,20,21]
frames = []
for year in years:
    frames.append(
        pd.read_csv(f'./data/valeursfoncieres-20{year}.txt', sep='|', decimal=',', nrows=1000000, low_memory=False)
    )
df = pd.concat(frames)
sDF = df.copy()
df.shape

(4000000, 44)

In [24]:
dfAddress = pd.read_csv('./data/communes-departement-region.csv', sep=',')

In [25]:
df['key'] = df['Date mutation'].astype('str') + df['Type de voie'].astype('str') + df['Voie'].astype('str') + df['Code postal'].astype('str')
df = df.groupby('key').filter(lambda x : len(x) == 1)
df.shape

(957585, 45)

In [26]:
df = df[df['Valeur fonciere'] > 1]
df.shape

(945755, 45)

In [27]:
missingValueRate = (df.isna().sum() / df.shape[0]) * 100

In [28]:
df = df[df.columns[missingValueRate < 65]]
df.shape

(945755, 22)

In [29]:
dfAddress = dfAddress.drop_duplicates(subset=['code_departement'])
dfAddress.shape

(105, 15)

In [30]:
dfAddress.code_departement = dfAddress.code_departement.str.pad(2, side='left', fillchar='0')

In [31]:
df['Code departement'] = df['Code departement'].astype('str').str.pad(2, side='left', fillchar='0')

In [32]:
pd.set_option('display.max_columns', None)
dfAll = df.merge(dfAddress.loc[:, ['code_departement', 'code_region', 'nom_region']], how='left', left_on='Code departement', right_on='code_departement')
dfAll.head()

Unnamed: 0,No disposition,Date mutation,Nature mutation,Valeur fonciere,No voie,Type de voie,Code voie,Voie,Code postal,Commune,Code departement,Code commune,Section,No plan,Nombre de lots,Code type local,Type local,Surface reelle bati,Nombre pieces principales,Nature culture,Surface terrain,key,code_departement,code_region,nom_region
0,2,10/01/2018,Vente,3150.0,,,B077,PONT D AIN,1160.0,PONT-D AIN,1,304,AM,461,0,,,,,S,126.0,10/01/2018nanPONT D AIN1160.0,1,84.0,Auvergne-Rhône-Alpes
1,2,12/01/2018,Vente,2100.0,,,B135,SOUS LE BOIS GIROUD,1250.0,JASSERON,1,195,C,2066,0,,,,,BT,197.0,12/01/2018nanSOUS LE BOIS GIROUD1250.0,1,84.0,Auvergne-Rhône-Alpes
2,1,04/01/2018,Vente,67000.0,12.0,ALL,3044,DE LA PETITE REYSSOUZE,1000.0,BOURG-EN-BRESSE,1,53,BD,227,1,2.0,Appartement,45.0,1.0,,,04/01/2018ALLDE LA PETITE REYSSOUZE1000.0,1,84.0,Auvergne-Rhône-Alpes
3,1,11/01/2018,Vente,76200.0,5.0,RUE,2690,MOLIERE,1000.0,BOURG-EN-BRESSE,1,53,AP,152,2,2.0,Appartement,68.0,3.0,,,11/01/2018RUEMOLIERE1000.0,1,84.0,Auvergne-Rhône-Alpes
4,1,17/01/2018,Vente,1000.0,,,B112,VACAGNOLE,1340.0,ATTIGNAT,1,24,AL,106,0,,,,,AB,5093.0,17/01/2018nanVACAGNOLE1340.0,1,84.0,Auvergne-Rhône-Alpes


In [33]:
dataset = dfAll[['Date mutation', 'Nature mutation', 'Valeur fonciere', 'Nombre de lots', 'Code commune',
           'Code type local', 'Type local', 'Surface reelle bati', 'Nombre pieces principales', 'Nature culture',
           'Surface terrain', 'nom_region', 'code_region']]
dataset.head()

Unnamed: 0,Date mutation,Nature mutation,Valeur fonciere,Nombre de lots,Code commune,Code type local,Type local,Surface reelle bati,Nombre pieces principales,Nature culture,Surface terrain,nom_region,code_region
0,10/01/2018,Vente,3150.0,0,304,,,,,S,126.0,Auvergne-Rhône-Alpes,84.0
1,12/01/2018,Vente,2100.0,0,195,,,,,BT,197.0,Auvergne-Rhône-Alpes,84.0
2,04/01/2018,Vente,67000.0,1,53,2.0,Appartement,45.0,1.0,,,Auvergne-Rhône-Alpes,84.0
3,11/01/2018,Vente,76200.0,2,53,2.0,Appartement,68.0,3.0,,,Auvergne-Rhône-Alpes,84.0
4,17/01/2018,Vente,1000.0,0,24,,,,,AB,5093.0,Auvergne-Rhône-Alpes,84.0


In [34]:
target = 'Type local'
# target = 'Code type local'

In [35]:
classSet = dataset[dataset[target].notna()]
classSet.shape

(560031, 13)

In [36]:
classSet = classSet[['Nombre de lots', 'Nombre pieces principales', target]]

In [57]:
classSet.describe()

Unnamed: 0,Nombre de lots,Nombre pieces principales
count,560031.0,560031.0
mean,0.607549,3.035246
std,0.975432,1.898442
min,0.0,0.0
25%,0.0,2.0
50%,0.0,3.0
75%,1.0,4.0
max,141.0,67.0


In [38]:
Xtrain, Xtest, ytrain, ytest = train_test_split(classSet.drop(target, axis=1), classSet[target], stratify=classSet[target], test_size=0.3 ,random_state=123)

In [56]:
Xtrain['Nombre pieces principales'].fillna(Xtrain['Nombre pieces principales'].median(), inplace=True)
Xtest['Nombre pieces principales'].fillna(Xtest['Nombre pieces principales'].median(), inplace=True)


In [58]:
def evaluate(model):
    model.fit(Xtrain, ytrain)
    ypred = model.predict(Xtest)
    print(confusion_matrix(ypred, ytest))
    print(classification_report(ypred, ytest))
    # N, trainScore, valScore = learning_curve(model, Xtrain, ytrain,cv=3,train_sizes=np.linspace(0.1, 1, 10), scoring='f1_macro')
    # plt.figure(figsize=(16, 8))
    # plt.plot(N, trainScore.mean(axis=1), label='train score', color='red')
    # plt.plot(N, valScore.mean(axis=1), label='validation score')
    # plt.show()


In [88]:
treeParams = {
    'max_depth': np.arange(2,10,1),
    'min_samples_leaf': np.arange(5,500,100),
    'min_samples_split': np.arange(100,500,50),
    'random_state': [123]
}

In [90]:
arbre_ = DecisionTreeClassifier()

In [91]:
gsArbre = GridSearchCV(arbre_, param_grid=treeParams, cv=ShuffleSplit(test_size=0.3, n_splits=1, random_state=0), verbose=1)
gsArbre.fit(Xtrain, ytrain)

Fitting 1 folds for each of 320 candidates, totalling 320 fits


In [92]:
# DecisionTreeClassifier(max_depth=7, min_samples_leaf=5, min_samples_split=100, random_state=123)
gsArbre.best_estimator_

In [93]:
evaluate(gsArbre.best_estimator_)

[[45711     0     4  3873]
 [   51 12789  3812     7]
 [   14  2889  5952    99]
 [  874     0   160 91775]]
                                          precision    recall  f1-score   support

                             Appartement       0.98      0.92      0.95     49588
                              Dépendance       0.82      0.77      0.79     16659
Local industriel. commercial ou assimilé       0.60      0.66      0.63      8954
                                  Maison       0.96      0.99      0.97     92809

                                accuracy                           0.93    168010
                               macro avg       0.84      0.84      0.84    168010
                            weighted avg       0.93      0.93      0.93    168010



In [96]:
params = {
    'n_estimators': np.arange(5, 50, 10),
    'max_depth': np.arange(5,10,2),
    'min_samples_leaf': np.arange(5,500,100),
    'min_samples_split': np.arange(100,1000,100),
    'random_state': [0]
}

In [97]:
rndf_ = RandomForestClassifier()

In [85]:
gs1 = GridSearchCV(rndf_, param_grid=params, cv=ShuffleSplit(test_size=0.3, n_splits=1, random_state=0), verbose=1)
gs1.fit(Xtrain, ytrain)

Fitting 1 folds for each of 675 candidates, totalling 675 fits


In [98]:
# RandomForestClassifier(max_depth=5, min_samples_leaf=5, min_samples_split=100, n_estimators=5, random_state=0)
gs1.best_estimator_

In [87]:
evaluate(gs1.best_estimator_)

[[45708     0     4  3876]
 [   51 12789  3812     7]
 [   14  2889  5952    99]
 [  877     0   160 91772]]
                                          precision    recall  f1-score   support

                             Appartement       0.98      0.92      0.95     49588
                              Dépendance       0.82      0.77      0.79     16659
Local industriel. commercial ou assimilé       0.60      0.66      0.63      8954
                                  Maison       0.96      0.99      0.97     92809

                                accuracy                           0.93    168010
                               macro avg       0.84      0.84      0.84    168010
                            weighted avg       0.93      0.93      0.93    168010



In [65]:
knn_ = KNeighborsClassifier()

In [72]:
params = {
    'n_neighbors': np.arange(2,10,1)
}

In [73]:
gs = GridSearchCV(knn_, param_grid=params, scoring='f1_macro', cv=ShuffleSplit(test_size=0.3, n_splits=1, random_state=0), verbose=1)
gs.fit(Xtrain, ytrain)

Fitting 1 folds for each of 8 candidates, totalling 8 fits


In [74]:
# KNeighborsClassifier(n_neighbors=8)
gs.best_estimator_

In [77]:
evaluate(gs.best_estimator_)

[[46165     0     4  4591]
 [   51 12789  3812     7]
 [   14  2889  5952    99]
 [  420     0   160 91057]]
                                          precision    recall  f1-score   support

                             Appartement       0.99      0.91      0.95     50760
                              Dépendance       0.82      0.77      0.79     16659
Local industriel. commercial ou assimilé       0.60      0.66      0.63      8954
                                  Maison       0.95      0.99      0.97     91637

                                accuracy                           0.93    168010
                               macro avg       0.84      0.83      0.84    168010
                            weighted avg       0.93      0.93      0.93    168010



In [99]:
import pickle