In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from datetime import date

from sklearn.preprocessing import PolynomialFeatures, RobustScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeClassifier

from imblearn.ensemble import BalancedRandomForestClassifier
import dataset_fonctions as dtf


In [None]:
## Load the train / test GeoJSON layers
# `index_col` is a pandas.read_csv option; geopandas.read_file does not define it and
# forwards unknown keyword arguments to the underlying I/O engine, so it is not passed here.
train = gpd.read_file('train.geojson')
test = gpd.read_file('test.geojson')

# Work on copies so the frames as loaded stay available untouched.
train_df = train.copy()
test_df = test.copy()

In [None]:
# Integer code assigned to each change_type label.
change_type_map = {
    'Demolition': 0,
    'Road': 1,
    'Residential': 2,
    'Commercial': 3,
    'Industrial': 4,
    'Mega Projects': 5,
}

# Replace every label by its code; a label missing from the map raises KeyError.
train_df['change_type'] = train_df['change_type'].apply(change_type_map.__getitem__)


In [None]:
# Shorten the status column names: 'change_status_dateN' -> 'statusN' for N = 5..1.
columns = {
    'change_status_date{}'.format(n): 'status{}'.format(n)
    for n in range(5, 0, -1)
}

# Apply the same renaming to the train and test frames.
train_df, test_df = (frame.rename(columns=columns) for frame in (train_df, test_df))

In [None]:
# Numeric value assigned to each status label.
dico = {
    'Prior Construction': 0,
    'Greenland': 2,
    'Excavation': 1,
    'Land Cleared': 3,
    'Materials Dumped': 4,
    'Construction Started': 5,
    'Construction Midway': 6,
    'Construction Done': 7,
    'Operational': 8,
}

# The return value is discarded, so the helper presumably encodes both frames
# in place -- confirm in dataset_fonctions.
dtf.mapping(train_df, test_df, dico)

In [None]:
# Convert the date columns of each frame to a date format
# (helper from dataset_fonctions; its return value is not used).
for frame in (train_df, test_df):
    dtf.formater_dates_df(frame)

# Then add columns holding the number of days between two consecutive dates.
for frame in (train_df, test_df):
    dtf.rajout_timedelta(frame)

In [None]:
# Keep only the training rows in which none of the five status columns is missing.
status_cols = ['status5', 'status4', 'status3', 'status2', 'status1']
train_df = train_df.dropna(subset=status_cols)


In [None]:
# Reproject the geometry column, then derive the 'area' and 'length' features.
# The two frames are independent, so each one is processed completely before the next.
# NOTE(review): area/length units depend on the target CRS -- confirm EPSG 9834 is the intended one.
target_epsg = 9834
for frame in (train_df, test_df):
    frame[['geometry']] = frame[['geometry']].to_crs(epsg=target_epsg)
    frame['area'] = frame[['geometry']].area
    frame['length'] = frame[['geometry']].length

In [None]:
# Inverse of the geometry length, added as an extra feature column on both frames.
for frame in (test_df, train_df):
    frame['1/l'] = 1 / frame['length']


In [None]:
# Undefined area and length values are replaced by the 50% quantile.
# Only test_df is cleaned in this cell.

# Infinite values become NaN so that they are treated as missing below.
test_df = test_df.replace(np.inf, np.nan)

for measure in ('area', 'length'):
    test_df[measure] = test_df[measure].fillna(test_df[measure].quantile())

# Remaining gaps in the numeric features are filled by a 10-nearest-neighbours
# imputer fitted on the test frame itself.
numeric_cols = [
    'status5', 'status4', 'status3', 'status2', 'status1',
    'time_delta_21', 'time_delta_32', 'time_delta_43', 'time_delta_54',
    'area', 'length', '1/l',
]
imputer = KNNImputer(n_neighbors=10)
test_df[numeric_cols] = imputer.fit_transform(test_df[numeric_cols])

In [None]:
# One training subset per change_type code, in code order 0..5.
(
    demolition_df,
    road_df,
    residential_df,
    commercial_df,
    industrial_df,
    mega_projects_df,
) = (train_df[train_df['change_type'] == code] for code in range(6))

In [None]:
# Per-class subsets, ordered by change_type code (0..5); the legend labels are those codes.
list_df = [demolition_df, road_df, residential_df, commercial_df, industrial_df, mega_projects_df]

# Some of the requested columns are not created by the cells above this one, and
# indexing a missing column raises KeyError. Plot only what is actually present
# and report the rest instead of failing.
candidate_cols = ['area', 'length', 'a/l', 'a/ll', 'sa/l']
plot_cols = [col for col in candidate_cols if col in train_df.columns]
missing_cols = [col for col in candidate_cols if col not in train_df.columns]
if missing_cols:
    print('Skipping columns not present in train_df:', missing_cols)

for col in plot_cols:
    # One figure per feature, with the six class distributions overlaid.
    fig, ax = plt.subplots()
    for i, df in enumerate(list_df):
        # Restrict to small shapes (area < 800 or length < 0.1).
        subset = df[(df['area'] < 800) | (df['length'] < 0.1)]
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases.
        sns.distplot(subset[col], label=i, ax=ax)
    ax.set_title(col)
    ax.legend()

In [None]:
# Degree-3 polynomial expansion of the three geometric features.
poly = PolynomialFeatures(3)
geometry_cols = ['area', 'length', '1/l']

# Fit on the training frame only ...
geometry_ppt = poly.fit_transform(train_df[geometry_cols])
# ... and reuse the fitted transformer on the test frame instead of refitting on test data.
geometry_ppt_test = poly.transform(test_df[geometry_cols])

In [None]:
# Split the nominal urban and geography values so that they can be handled
# properly with a MultiLabelBinarizer.
dtf.split_nominal(train_df, test_df)

# The helper returns four values: urban train / test, then geography train / test.
(
    urban_types_mlb,
    urban_types_mlb_test,
    geography_types_mlb,
    geography_types_mlb_test,
) = dtf.mlb_urban_geography(train_df, test_df)

In [None]:
## Build the inputs fed to the ML algorithm

# Status codes and day deltas, shared by the train and test matrices.
temporal_cols = [
    'status5', 'status4', 'status3', 'status2', 'status1',
    'time_delta_21', 'time_delta_32', 'time_delta_43', 'time_delta_54',
]

# Training matrix: temporal, polynomial geometry, urban and geography blocks side by side.
A = np.asarray(train_df[temporal_cols])
train_blocks = (A, geometry_ppt, urban_types_mlb, geography_types_mlb)
train_x = np.concatenate(train_blocks, axis=1)
print(train_x.shape)
train_y = train_df['change_type']

# Test matrix, assembled in the same block order.
B = np.asarray(test_df[temporal_cols])
test_blocks = (B, geometry_ppt_test, urban_types_mlb_test, geography_types_mlb_test)
test_x = np.concatenate(test_blocks, axis=1)

In [None]:
# Scaling + feature selection pipeline; it is built here but is not a step of `model` below.
preproceseur = make_pipeline(RobustScaler(), SelectKBest(k=20))

# The searched estimator is a pipeline holding only the seeded random forest.
model = make_pipeline(RandomForestClassifier(random_state=42))

# A single candidate per hyper-parameter, i.e. one configuration scored with 5-fold CV.
param_grid = {
    'randomforestclassifier__n_estimators': [190],
    'randomforestclassifier__max_depth': [9],
    'randomforestclassifier__max_features': [11],
    'randomforestclassifier__min_samples_split': [202],
}
grid = RandomizedSearchCV(model, param_grid, n_iter=1, cv=5, scoring='f1_micro')
grid.fit(train_x, train_y)

print(grid.best_score_)
grid.best_params_

In [None]:
# Base tree referenced only by the AdaBoost variant that is commented out below.
DTC = DecisionTreeClassifier(max_depth=5, max_features=18, min_samples_split=3860)

# Final model: robust scaling followed by a seeded random forest.
preproceseur = make_pipeline(RobustScaler())
model = make_pipeline(
    preproceseur,
    RandomForestClassifier(
        n_estimators=200,
        max_depth=9,
        max_features=11,
        min_samples_split=500,
        random_state=42,
    ),
)
#model = make_pipeline(AdaBoostClassifier(random_state=42,base_estimator=DTC,n_estimators=8))

# Fit on the training matrix and predict the change_type code of every test row.
model.fit(train_x, train_y)
pred_y = model.predict(test_x)

## Write the predictions to the submission file, with the row position as 'Id'
pred_df = pd.DataFrame({'change_type': pred_y})
pred_df.to_csv("knn_sample_submission.csv", index_label='Id')

In [None]:
# Scaling pipeline; it is built here but is not a step of `model` below.
preproceseur = make_pipeline(RobustScaler())

# The tree samples `max_features` candidate features at each split, so it is seeded
# to make every fit repeatable.
DTC = DecisionTreeClassifier(random_state=42)
model = make_pipeline(DTC)

# Search space for the decision tree hyper-parameters.
param_grid = {
    'decisiontreeclassifier__max_depth': np.arange(1, 6),
    'decisiontreeclassifier__max_features': np.arange(10, 20),
    'decisiontreeclassifier__min_samples_split': np.arange(10, 5000, 50),
}

# 50 candidates are drawn at random from the grid; the search is seeded so that the
# same candidates (and therefore the same best score) come out on every run.
grid = RandomizedSearchCV(
    model,
    param_grid,
    n_iter=50,
    cv=4,
    scoring='f1_micro',
    random_state=42,
)
grid.fit(train_x, train_y)

print(grid.best_score_)
grid.best_params_

In [None]:
# Robust scaling followed by a seeded gradient boosting classifier.
preproceseur = make_pipeline(RobustScaler())
# This tree is defined here but is not used by the pipeline below.
DTC = DecisionTreeClassifier(max_depth=5, max_features=18, min_samples_split=3860)
model = make_pipeline(preproceseur, GradientBoostingClassifier(random_state=42))

# Search space for the gradient boosting hyper-parameters.
param_grid = {
    'gradientboostingclassifier__n_estimators': [50, 100, 200],
    'gradientboostingclassifier__max_depth': [3, 6, 9],
    'gradientboostingclassifier__max_features': [7, 6, 8],
    'gradientboostingclassifier__min_samples_split': [10, 100, 500],
}

# Only 3 candidates are drawn at random from the grid; the search is seeded so that
# the same candidates are evaluated on every run.
grid = RandomizedSearchCV(
    model,
    param_grid,
    n_iter=3,
    cv=4,
    scoring='f1_micro',
    random_state=42,
)
grid.fit(train_x, train_y)

print(grid.best_score_)
grid.best_params_