In [1]:
import pandas as pd
import geopandas as gpd
import xgboost
import sys
from sklearn.preprocessing import LabelEncoder
import numpy as np
from math import * 
from tqdm import tqdm
from sklearn import model_selection
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.cluster import DBSCAN, OPTICS

import json
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 250
plt.style.use('dark_background')

from utils import plot_cluster, load_list, save_list

In [2]:
n_data_max = 9999999999


In [3]:
train_df = pd.read_csv('train_df.csv',index_col=0)[:n_data_max]
test_df = pd.read_csv('test_df.csv',index_col=0)[:n_data_max]
train_fourier=pd.read_csv('fourier_coefficients_train.csv',index_col=0)[:n_data_max]
test_fourier=pd.read_csv('fourier_coefficients_test.csv',index_col=0)[:n_data_max]

train_df=pd.concat([train_df,train_fourier],axis=1)
test_df=pd.concat([test_df,test_fourier],axis=1)

train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)




In [4]:
train_df_origin = gpd.read_file("train.geojson", index_col=0)[:n_data_max]
test_df_origin = gpd.read_file("test.geojson", index_col=0)[:n_data_max]

In [5]:
#Keys are columns that will be augmented. Values tell us whether they are numerical or not (categorical).
cols_is_num = {'area':True,
            'length':True, 
            'area/length**2':True,
            'elongation':True, 
            'centroid_x':True,      #Irrelevant, let to see if augmentation works (knn_centroid_x should be about same value as centroid_x)
            'height':True, 
            'width':True, 
            'nb_points':True, 
            'centroid_dist':True, 
            'length/width':True, 
            
            'Dense Urban':True, 
            'Industrial':True, 
            'None':True,
            'Rural':True, 
            'Sparse Urban':True, 
            'Urban Slum':True, 
            'Barren Land':True, 
            'Coastal':True,
            'Dense Forest':True, 
            'Desert':True, 
            'Farms':True, 
            'Grass Land':True, 
            'Hills':True, 
            'Lakes':True,
            'None.1':True, 
            'River':True, 
            'Snow':True, 
            'Sparse Forest':True,
            
            'diff1':True, 
            'diff2':True, 
            'diff3':True,
            'diff4':True,
            'year_date1':True, 
            'year_date2':True, 
            'year_date3':True, 
            'year_date4':True,
            'year_date5':True,
            'coeff1':True,
            'coeff2':True,
            'coeff3':True,
            'coeff4':True,
            'power':True}

In [6]:
print(cols_is_num)

{'area': True, 'length': True, 'area/length**2': True, 'elongation': True, 'centroid_x': True, 'height': True, 'width': True, 'nb_points': True, 'centroid_dist': True, 'length/width': True, 'Dense Urban': True, 'Industrial': True, 'None': True, 'Rural': True, 'Sparse Urban': True, 'Urban Slum': True, 'Barren Land': True, 'Coastal': True, 'Dense Forest': True, 'Desert': True, 'Farms': True, 'Grass Land': True, 'Hills': True, 'Lakes': True, 'None.1': True, 'River': True, 'Snow': True, 'Sparse Forest': True, 'diff1': True, 'diff2': True, 'diff3': True, 'diff4': True, 'year_date1': True, 'year_date2': True, 'year_date3': True, 'year_date4': True, 'year_date5': True, 'coeff1': True, 'coeff2': True, 'coeff3': True, 'coeff4': True, 'power': True}


In [10]:
from sklearn.neighbors import NearestNeighbors

def find_knn(i, knn_object, df_aug):
    '''Return a dataframe composed of the k nearest neighbors in df_aug of the i-th data of df_aug.
    i: index of data
    knn_object : a NearestNeighbors object fitted on df_aug
    df_aug : dataframe augmented
    '''
    data = np.array(df_aug[['centroid_x', 'centroid_y']].iloc[i])
    distances, indices = knn_object.kneighbors(data.reshape(-1,2), return_distance = True)      #We return the k+1 nearest neighbors, except the first (the data point himself)
    return df_aug.iloc[indices[0, 1:]], distances[0, 1:]
    

def nearest_buildings_augmentation_mean(df_aug, k = 10, cols_is_num = cols_is_num, method = 'mean'):
    '''Return a dataframe whose columns are mean or most frequent value of k nearest neighbors features. The columns thus augmented are those in 'cols_is_num'.
    df_aug : an already augmented dataframe
    k : the number of nearest neighbors
    cols : a dictionnary whose keys are name of columns to be copied and values are True if feature is numerical, False if categorical.
    '''
    df = list()
    knn_object = NearestNeighbors(n_neighbors=k+1).fit(np.array(df_aug[['centroid_x', 'centroid_y']]))
        
    for i in tqdm(range(len(df_aug))):
        dic = dict()
        df_k_nearest_neightbors, distances = find_knn(i, knn_object, df_aug)      #Find the k nearest neighbors.
        
        if method == 'weighted':
            m = sum(distances)/k
            poids = [exp(-x/m) for x in distances]
            poids = [x/sum(poids) for x in poids]
            for col, is_num in cols_is_num.items():
                if is_num:
                    dic[f"knn_mean_{col}"] = (df_k_nearest_neightbors[col] * poids).sum()
                else:
                    raise
        elif method == 'mean':
            for col, is_num in cols_is_num.items():
                if is_num:
                    dic[f"knn_mean_{col}"] = df_k_nearest_neightbors[col].mean()              
                else:
                    raise
        df.append(dic)
    return pd.DataFrame(df)

k = 15
df = nearest_buildings_augmentation_mean(train_df, k = k, method = 'weighted')
df.to_csv('train_df_knn_mean.csv')
df.head()

  0%|          | 964/310006 [00:13<57:11, 90.07it/s]  

In [8]:
from sklearn.neighbors import NearestNeighbors

def nearest_buildings_augmentation_concatenate(df_aug, k = 1, cols = cols_is_num):
    '''Return a dataframe whose columns are features of nearest neighbors. If cols contains n feature names, this will return a n*k feature dataframe.
    df_aug : an already augmented dataframe
    k : the number of nearest neighbors
    cols : a dictionnary whose keys are name of columns to be copied
    '''
    df = list()
    data = np.array(df_aug[['centroid_x', 'centroid_y']])
    knn_object = NearestNeighbors(n_neighbors=k+1).fit(data)     #knn object to find neighbors
        
    for i in tqdm(range(len(df_aug))):
        dic = dict()
        data = np.array(df_aug[['centroid_x', 'centroid_y']].iloc[i])
        indices = knn_object.kneighbors(data.reshape(-1,2), return_distance = False)[0, 1:] 
        df_knn = df_aug.iloc[indices]
        
        for col in cols.keys():
            feature = df_knn[col]
            for n in range(k):
                dic[f"{n+1}th_nn_{col}"] = feature.iloc[n]                  
                
        df.append(dic)
    return pd.DataFrame(df)

df = nearest_buildings_augmentation_concatenate(train_df, k = 1)
df.to_csv('train_df_knn_concat.csv')
df.head()

100%|██████████| 310006/310006 [27:06<00:00, 190.63it/s]


Unnamed: 0,1th_nn_area,1th_nn_length,1th_nn_area/length**2,1th_nn_elongation,1th_nn_centroid_x,1th_nn_height,1th_nn_width,1th_nn_nb_points,1th_nn_centroid_dist,1th_nn_length/width,...,1th_nn_year_date1,1th_nn_year_date2,1th_nn_year_date3,1th_nn_year_date4,1th_nn_year_date5,1th_nn_coeff1,1th_nn_coeff2,1th_nn_coeff3,1th_nn_coeff4,1th_nn_power
0,4.701495e-07,0.002917,0.055257,0.001024,116.975303,0.001024,0.000603,5,0.002188,1.956201,...,2014,2015,2017,2018,2020,0.326314,0.003547,0.006136,0.242237,3.0
1,1.237159e-06,0.005545,0.040231,0.002244,116.976067,0.002244,0.000888,6,0.004642,3.847414,...,2014,2015,2017,2018,2020,0.357556,-9.2e-05,0.000336,0.137474,3.0
2,1.237159e-06,0.005545,0.040231,0.002244,116.976067,0.002244,0.000888,6,0.004642,3.847414,...,2014,2015,2017,2018,2020,0.357556,-9.2e-05,0.000336,0.137474,3.0
3,3.516945e-07,0.002763,0.046052,0.00111,116.976916,0.000547,0.00111,5,0.002232,3.034208,...,2014,2015,2017,2018,2020,0.346511,0.001723,0.00366,0.175387,3.0
4,3.655977e-07,0.002749,0.04838,0.001063,116.978161,0.000547,0.001063,5,0.002182,2.661171,...,2014,2015,2017,2018,2020,0.347925,0.000703,0.002241,0.182717,3.0


In [17]:

df = nearest_buildings_augmentation_mean(test_df, k = k,methohd='weighted')
df.to_csv('test_df_knn_mean.csv')
df.head()

100%|██████████| 121704/121704 [09:22<00:00, 216.54it/s]


Unnamed: 0,knn_mean_area,knn_mean_length,knn_mean_area/length**2,knn_mean_elongation,knn_mean_centroid_x,knn_mean_height,knn_mean_width,knn_mean_nb_points,knn_mean_centroid_dist,knn_mean_length/width,...,knn_mean_year_date1,knn_mean_year_date2,knn_mean_year_date3,knn_mean_year_date4,knn_mean_year_date5,knn_mean_coeff1,knn_mean_coeff2,knn_mean_coeff3,knn_mean_coeff4,knn_mean_power
0,1.317257e-07,0.001399,0.050097,0.00052,103.973952,0.000352,0.000511,5.133333,0.001108,2.401372,...,2014.0,2015.0,2017.0,2018.0,2020.0,0.343977,-0.004888,-0.001488,0.073261,2.133333
1,1.274085e-07,0.00136,0.050694,0.000498,103.97395,0.000349,0.000487,5.133333,0.001074,2.300134,...,2014.0,2015.0,2017.0,2018.0,2020.0,0.345149,-0.002901,-0.001724,0.059903,2.133333
2,1.375248e-07,0.001458,0.050539,0.000539,103.974156,0.000359,0.000528,5.133333,0.001157,2.318764,...,2014.0,2015.0,2017.0,2018.0,2020.0,0.34524,-0.002689,-0.001296,0.038559,2.133333
3,1.315355e-07,0.001395,0.050578,0.000512,103.973967,0.000357,0.000501,5.133333,0.001105,2.289779,...,2014.0,2015.0,2017.0,2018.0,2020.0,0.344986,-0.002959,-0.001511,0.059891,2.133333
4,1.321328e-07,0.001419,0.049144,0.000529,103.973974,0.000351,0.000518,5.133333,0.001131,2.488415,...,2014.0,2015.0,2017.0,2018.0,2020.0,0.345963,-0.003921,-0.003559,0.034687,2.133333


In [9]:
df = nearest_buildings_augmentation_concatenate(test_df, k = 1)
df.to_csv('test_df_knn_concat.csv')
df.head()

100%|██████████| 121704/121704 [08:01<00:00, 252.63it/s]


Unnamed: 0,1th_nn_area,1th_nn_length,1th_nn_area/length**2,1th_nn_elongation,1th_nn_centroid_x,1th_nn_height,1th_nn_width,1th_nn_nb_points,1th_nn_centroid_dist,1th_nn_length/width,...,1th_nn_year_date1,1th_nn_year_date2,1th_nn_year_date3,1th_nn_year_date4,1th_nn_year_date5,1th_nn_coeff1,1th_nn_coeff2,1th_nn_coeff3,1th_nn_coeff4,1th_nn_power
0,8.06871e-08,0.001467,0.037502,0.000625,103.974741,0.000275,0.000625,5,0.00125,4.434898,...,2014,2015,2017,2018,2020,0.358904,0.000971,-0.003821,-0.127318,2.0
1,4.833666e-08,0.001253,0.030788,0.000531,103.975182,0.000236,0.000531,5,0.001116,5.037163,...,2014,2015,2017,2018,2020,0.366204,-6.4e-05,-0.000221,-0.092527,2.0
2,1.425909e-07,0.001997,0.035759,0.000838,103.975004,0.000385,0.000838,5,0.001711,4.279575,...,2014,2015,2017,2018,2020,0.356471,9.2e-05,-0.000615,-0.127495,2.0
3,7.172681e-08,0.001103,0.059005,0.000362,103.974644,0.00036,0.000362,5,0.000852,1.455363,...,2014,2015,2017,2018,2020,0.34425,0.015399,0.0269,0.250742,2.0
4,8.06871e-08,0.001467,0.037502,0.000625,103.974741,0.000275,0.000625,5,0.00125,4.434898,...,2014,2015,2017,2018,2020,0.358904,0.000971,-0.003821,-0.127318,2.0
