In [25]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, ShuffleSplit

import numpy as np

In [26]:
years = [18,19,20,21]
frames = []
for year in years:
    frames.append(
        pd.read_csv(f'../assets/data/valeursfoncieres-20{year}.txt', sep='|', decimal=',', nrows=10000, low_memory=False)
    )
df = pd.concat(frames)
sDF = df.copy()
df.shape

(40000, 43)

In [27]:
dfAddress = pd.read_csv('../assets/data/communes-departement-region.csv', sep=',')

In [28]:
df['key'] = df['Date mutation'].astype('str') + df['Type de voie'].astype('str') + df['Voie'].astype('str') + df['Code postal'].astype('str')
df = df.groupby('key').filter(lambda x : len(x) == 1)
df.shape

(10379, 44)

In [29]:
df = df[df['Valeur fonciere'] > 1]
df.shape

(10194, 44)

In [30]:
missingValueRate = (df.isna().sum() / df.shape[0]) * 100

In [31]:
df = df[df.columns[missingValueRate < 65]]
df.shape

(10194, 22)

In [32]:
dfAddress = dfAddress.drop_duplicates(subset=['code_departement'])
dfAddress.shape

(105, 15)

In [33]:
dfAddress.code_departement = dfAddress.code_departement.str.pad(2, side='left', fillchar='0')

In [34]:
df['Code departement'] = df['Code departement'].astype('str').str.pad(2, side='left', fillchar='0')

In [35]:
pd.set_option('display.max_columns', None)
dfAll = df.merge(dfAddress.loc[:, ['code_departement', 'code_region', 'nom_region']], how='left', left_on='Code departement', right_on='code_departement')
dfAll.head()

Unnamed: 0,No disposition,Date mutation,Nature mutation,Valeur fonciere,No voie,Type de voie,Code voie,Voie,Code postal,Commune,Code departement,Code commune,Section,No plan,Nombre de lots,Code type local,Type local,Surface reelle bati,Nombre pieces principales,Nature culture,Surface terrain,key,code_departement,code_region,nom_region
0,2,10/01/2018,Vente,3150.0,,,B077,PONT D AIN,1160.0,PONT-D AIN,1,304,AM,461,0,,,,,S,126.0,10/01/2018nanPONT D AIN1160.0,1,84.0,Auvergne-Rhône-Alpes
1,2,12/01/2018,Vente,2100.0,,,B135,SOUS LE BOIS GIROUD,1250.0,JASSERON,1,195,C,2066,0,,,,,BT,197.0,12/01/2018nanSOUS LE BOIS GIROUD1250.0,1,84.0,Auvergne-Rhône-Alpes
2,1,04/01/2018,Vente,67000.0,12.0,ALL,3044,DE LA PETITE REYSSOUZE,1000.0,BOURG-EN-BRESSE,1,53,BD,227,1,2.0,Appartement,45.0,1.0,,,04/01/2018ALLDE LA PETITE REYSSOUZE1000.0,1,84.0,Auvergne-Rhône-Alpes
3,1,11/01/2018,Vente,76200.0,5.0,RUE,2690,MOLIERE,1000.0,BOURG-EN-BRESSE,1,53,AP,152,2,2.0,Appartement,68.0,3.0,,,11/01/2018RUEMOLIERE1000.0,1,84.0,Auvergne-Rhône-Alpes
4,1,17/01/2018,Vente,1000.0,,,B112,VACAGNOLE,1340.0,ATTIGNAT,1,24,AL,106,0,,,,,AB,5093.0,17/01/2018nanVACAGNOLE1340.0,1,84.0,Auvergne-Rhône-Alpes


In [36]:
dataset = dfAll[['Date mutation', 'Nature mutation', 'Valeur fonciere', 'Nombre de lots', 'Code commune',
           'Code type local', 'Type local', 'Surface reelle bati', 'Nombre pieces principales', 'Nature culture',
           'Surface terrain', 'nom_region', 'code_region']]
dataset.head()

Unnamed: 0,Date mutation,Nature mutation,Valeur fonciere,Nombre de lots,Code commune,Code type local,Type local,Surface reelle bati,Nombre pieces principales,Nature culture,Surface terrain,nom_region,code_region
0,10/01/2018,Vente,3150.0,0,304,,,,,S,126.0,Auvergne-Rhône-Alpes,84.0
1,12/01/2018,Vente,2100.0,0,195,,,,,BT,197.0,Auvergne-Rhône-Alpes,84.0
2,04/01/2018,Vente,67000.0,1,53,2.0,Appartement,45.0,1.0,,,Auvergne-Rhône-Alpes,84.0
3,11/01/2018,Vente,76200.0,2,53,2.0,Appartement,68.0,3.0,,,Auvergne-Rhône-Alpes,84.0
4,17/01/2018,Vente,1000.0,0,24,,,,,AB,5093.0,Auvergne-Rhône-Alpes,84.0


In [37]:
target = 'Type local'
# target = 'Code type local'

In [38]:
classSet = dataset[dataset[target].notna()]
classSet.shape

(5148, 13)

In [39]:
classSet = classSet[['Nombre de lots', 'Nombre pieces principales', target]]

In [40]:
classSet.describe()

Unnamed: 0,Nombre de lots,Nombre pieces principales
count,5148.0,5141.0
mean,0.466977,3.425015
std,1.201271,1.826967
min,0.0,0.0
25%,0.0,2.0
50%,0.0,4.0
75%,1.0,5.0
max,41.0,15.0


In [41]:
Xtrain, Xtest, ytrain, ytest = train_test_split(classSet.drop(target, axis=1), classSet[target], stratify=classSet[target], test_size=0.3 ,random_state=123)

In [42]:
Xtrain['Nombre pieces principales'].fillna(Xtrain['Nombre pieces principales'].median(), inplace=True)
Xtest['Nombre pieces principales'].fillna(Xtest['Nombre pieces principales'].median(), inplace=True)


In [43]:
def evaluate(model):
    model.fit(Xtrain, ytrain)
    ypred = model.predict(Xtest)
    print(confusion_matrix(ypred, ytest))
    print(classification_report(ypred, ytest))
    # N, trainScore, valScore = learning_curve(model, Xtrain, ytrain,cv=3,train_sizes=np.linspace(0.1, 1, 10), scoring='f1_macro')
    # plt.figure(figsize=(16, 8))
    # plt.plot(N, trainScore.mean(axis=1), label='train score', color='red')
    # plt.plot(N, valScore.mean(axis=1), label='validation score')
    # plt.show()


In [44]:
arbre = DecisionTreeClassifier(max_depth=7, min_samples_leaf=5, min_samples_split=100, random_state=123)
evaluate(arbre)

[[ 273    0    0   35]
 [   0   63   21    0]
 [   0   20   80    1]
 [   0    0    3 1049]]
                                          precision    recall  f1-score   support

                             Appartement       1.00      0.89      0.94       308
                              Dépendance       0.76      0.75      0.75        84
Local industriel. commercial ou assimilé       0.77      0.79      0.78       101
                                  Maison       0.97      1.00      0.98      1052

                                accuracy                           0.95      1545
                               macro avg       0.87      0.86      0.86      1545
                            weighted avg       0.95      0.95      0.95      1545



In [45]:
# import pickle

In [46]:
# with open("models/decision-tree.pkl", "wb") as f:
#     pickle.dump({'model': arbre}, f)

In [47]:
# ypred = arbre.predict(Xtest)
# confusion_matrix(ypred, ytest)