In [1]:

import pandas as pd
import geopandas as gpd
import xgboost
import sys
from sklearn.preprocessing import LabelEncoder
import numpy as np
from math import * 
from tqdm import tqdm
from sklearn import model_selection
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.cluster import DBSCAN, OPTICS

import json
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 250
plt.style.use('dark_background')

from utils import plot_cluster, load_list, save_list

In [2]:
train_df = pd.read_csv('train_df.csv')
test_df = pd.read_csv('test_df.csv')

In [3]:
train_df_origin = gpd.read_file("train.geojson", index_col=0)
test_df_origin = gpd.read_file("test.geojson", index_col=0)

In [4]:
#Keys are columns that will be augmented. Values tell us whether they are numerical or not (categorical).
cols_is_num = {'area':True,
            'length':True, 
            'area/length**2':True,
            'elongation':True, 
            'centroid_x':True,      #Irrelevant, let to see if augmentation works (knn_centroid_x should be about same value as centroid_x)
            'height':True, 
            'width':True, 
            'nb_points':True, 
            'centroid_dist':True, 
            'length/width':True, 
            
            'Dense Urban':True, 
            'Industrial':True, 
            'None':True,
            'Rural':True, 
            'Sparse Urban':True, 
            'Urban Slum':True, 
            'Barren Land':True, 
            'Coastal':True,
            'Dense Forest':True, 
            'Desert':True, 
            'Farms':True, 
            'Grass Land':True, 
            'Hills':True, 
            'Lakes':True,
            'None.1':True, 
            'River':True, 
            'Snow':True, 
            'Sparse Forest':True,
            
            'diff1':True, 
            'diff2':True, 
            'diff3':True,
            'diff4':True,
            'year_date1':True, 
            'year_date2':True, 
            'year_date3':True, 
            'year_date4':True,
            'year_date5':True,
            'season_date1':True, 
            'season_date2':True, 
            'season_date3':True, 
            'season_date4':True,
            'season_date5':True,
            }

In [53]:
from sklearn.neighbors import NearestNeighbors

def find_knn(i, knn_object, df_aug):
    '''Return a dataframe composed of the k nearest neighbors in df_aug of the i-th data of df_aug.
    i: index of data
    knn_object : a NearestNeighbors object fitted on df_aug
    df_aug : dataframe augmented
    '''
    data = np.array(df_aug[['centroid_x', 'centroid_y']].iloc[i])
    distances, indices = knn_object.kneighbors(data.reshape(-1,2), return_distance = True)      #We return the k+1 nearest neighbors, except the first (the data point himself)
    return df_aug.iloc[indices[0, 1:]], distances[0, 1:]
    

def nearest_buildings_augmentation_mean(df_aug, k = 10, cols_is_num = cols_is_num, method = 'weighted'):
    '''Return a dataframe whose columns are mean or most frequent value of k nearest neighbors features. The columns thus augmented are those in 'cols_is_num'.
    df_aug : an already augmented dataframe
    k : the number of nearest neighbors
    cols : a dictionnary whose keys are name of columns to be copied and values are True if feature is numerical, False if categorical.
    '''
    df = list()
    knn_object = NearestNeighbors(n_neighbors=k+1).fit(np.array(df_aug[['centroid_x', 'centroid_y']]))
        
    for i in tqdm(range(len(df_aug))):
        dic = dict()
        df_k_nearest_neightbors, distances = find_knn(i, knn_object, df_aug)      #Find the k nearest neighbors.
        
        if method == 'weighted':
            m = sum(distances)/k
            poids = [exp(-x/m) for x in distances]
            poids = [x/sum(poids) for x in poids]
            for col, is_num in cols_is_num.items():
                if is_num:
                    dic[f"knn_mean_{col}"] = (df_k_nearest_neightbors[col] * poids).sum()
                else:
                    raise
        elif method == 'mean':
            for col, is_num in cols_is_num.items():
                if is_num:
                    dic[f"knn_mean_{col}"] = df_k_nearest_neightbors[col].mean()              
                else:
                    raise
        df.append(dic)
    return pd.DataFrame(df)

df = nearest_buildings_augmentation_mean(train_df, k = 15, method = 'mean')
df.to_csv('train_df_knn_mean.csv')
df.head()


100%|██████████| 309736/309736 [40:56<00:00, 126.07it/s] 


Unnamed: 0,knn_mean_area,knn_mean_length,knn_mean_area/length**2,knn_mean_elongation,knn_mean_centroid_x,knn_mean_height,knn_mean_width,knn_mean_nb_points,knn_mean_centroid_dist,knn_mean_length/width,...,knn_mean_year_date1,knn_mean_year_date2,knn_mean_year_date3,knn_mean_year_date4,knn_mean_year_date5,knn_mean_season_date1,knn_mean_season_date2,knn_mean_season_date3,knn_mean_season_date4,knn_mean_season_date5
0,3.713732e-07,0.00268,0.048954,0.001006,116.977251,0.000597,0.000948,5.533333,0.002136,2.698564,...,2014.0,2015.0,2017.0,2018.0,2020.0,3.0,3.0,3.0,3.0,3.0
1,4.237372e-07,0.002857,0.047974,0.001093,116.977149,0.000684,0.000971,5.6,0.0023,2.821322,...,2014.0,2015.0,2017.0,2018.0,2020.0,3.0,3.0,3.0,3.0,3.0
2,5.336697e-07,0.003084,0.049906,0.001128,116.976918,0.000796,0.000972,5.6,0.002434,2.577486,...,2014.0,2015.0,2017.0,2018.0,2020.0,3.0,3.0,3.0,3.0,3.0
3,3.76632e-07,0.002692,0.048654,0.001038,116.97733,0.000669,0.000888,5.4,0.00217,2.722069,...,2014.0,2015.0,2017.0,2018.0,2020.0,3.0,3.0,3.0,3.0,3.0
4,4.326081e-07,0.002768,0.048894,0.001063,116.97793,0.000677,0.00093,5.466667,0.002223,2.69558,...,2014.0,2015.0,2017.0,2018.0,2020.0,3.0,3.0,3.0,3.0,3.0


In [8]:
def nearest_buildings_augmentation_concatenate(df_aug, k = 1, cols = cols_is_num):
    '''Return a dataframe whose columns are features of nearest neighbors. If cols contains n feature names, this will return a n*k feature dataframe.
    df_aug : an already augmented dataframe
    k : the number of nearest neighbors
    cols : a dictionnary whose keys are name of columns to be copied
    '''
    df = list()
    data = np.array(df_aug[['centroid_x', 'centroid_y']])
    knn_object = NearestNeighbors(n_neighbors=k+1).fit(data)     #knn object to find neighbors
        
    for i in tqdm(range(len(df_aug))):
        dic = dict()
        data = np.array(df_aug[['centroid_x', 'centroid_y']].iloc[i])
        indices = knn_object.kneighbors(data.reshape(-1,2), return_distance = False)[0, 1:] 
        df_knn = df_aug.iloc[indices]
        
        for col in cols.keys():
            feature = df_knn[col]
            for n in range(k):
                dic[f"{n+1}th_nn_{col}"] = feature.iloc[n]                  
                
        df.append(dic)
    return pd.DataFrame(df)

df = nearest_buildings_augmentation_concatenate(train_df, k = 1)
df.to_csv('train_df_knn_concat.csv')
df.head()

100%|██████████| 309736/309736 [29:26<00:00, 175.34it/s] 


Unnamed: 0,1th_nn_area,1th_nn_length,1th_nn_area/length**2,1th_nn_elongation,1th_nn_centroid_x,1th_nn_height,1th_nn_width,1th_nn_nb_points,1th_nn_centroid_dist,1th_nn_length/width,...,1th_nn_Dense Forest,1th_nn_Desert,1th_nn_Farms,1th_nn_Grass Land,1th_nn_Hills,1th_nn_Lakes,1th_nn_None.1,1th_nn_River,1th_nn_Snow,1th_nn_Sparse Forest
0,4.701495e-07,0.002917,0.055257,0.001024,116.975303,0.001024,0.000603,5,0.002188,1.956201,...,0,0,0,1,0,0,0,0,0,1
1,1.237159e-06,0.005545,0.040231,0.002244,116.976067,0.002244,0.000888,6,0.004642,3.847414,...,0,0,0,1,0,0,0,1,0,1
2,1.237159e-06,0.005545,0.040231,0.002244,116.976067,0.002244,0.000888,6,0.004642,3.847414,...,0,0,0,1,0,0,0,1,0,1
3,3.516945e-07,0.002763,0.046052,0.00111,116.976916,0.000547,0.00111,5,0.002232,3.034208,...,0,0,0,1,0,0,0,1,0,1
4,3.655977e-07,0.002749,0.04838,0.001063,116.978161,0.000547,0.001063,5,0.002182,2.661171,...,0,0,0,1,0,0,0,1,0,1


In [7]:
df = nearest_buildings_augmentation_mean(test_df, k = 10)
df.to_csv('test_df_knn_mean.csv')
df.head()

100%|██████████| 121704/121704 [08:53<00:00, 228.18it/s]


Unnamed: 0,knn_mean_area,knn_mean_length,knn_mean_area/length**2,knn_mean_elongation,knn_mean_centroid_x,knn_mean_height,knn_mean_width,knn_mean_nb_points,knn_mean_centroid_dist,knn_mean_length/width,...,knn_mean_Dense Forest,knn_mean_Desert,knn_mean_Farms,knn_mean_Grass Land,knn_mean_Hills,knn_mean_Lakes,knn_mean_None.1,knn_mean_River,knn_mean_Snow,knn_mean_Sparse Forest
0,1.261485e-07,0.00143,0.049695,0.000541,103.974126,0.000342,0.000533,5.1,0.001151,2.613938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.146411e-07,0.001323,0.050888,0.000484,103.974145,0.000331,0.000476,5.1,0.001043,2.471598,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.240665e-07,0.001398,0.051385,0.000515,103.974127,0.000346,0.000507,5.1,0.001102,2.395839,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.208315e-07,0.001376,0.050714,0.000506,103.974171,0.000342,0.000497,5.1,0.001089,2.456066,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.217275e-07,0.001413,0.048563,0.000532,103.974181,0.000334,0.000524,5.1,0.001129,2.754019,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
df = nearest_buildings_augmentation_concatenate(test_df, k = 1)
df.to_csv('test_df_knn_concat.csv')
df.head()

100%|██████████| 121704/121704 [06:55<00:00, 293.23it/s]


Unnamed: 0,1th_nn_area,1th_nn_length,1th_nn_area/length**2,1th_nn_elongation,1th_nn_centroid_x,1th_nn_height,1th_nn_width,1th_nn_nb_points,1th_nn_centroid_dist,1th_nn_length/width,...,1th_nn_Dense Forest,1th_nn_Desert,1th_nn_Farms,1th_nn_Grass Land,1th_nn_Hills,1th_nn_Lakes,1th_nn_None.1,1th_nn_River,1th_nn_Snow,1th_nn_Sparse Forest
0,8.06871e-08,0.001467,0.037502,0.000625,103.974741,0.000275,0.000625,5,0.00125,4.434898,...,0,0,0,0,0,0,0,0,0,0
1,4.833666e-08,0.001253,0.030788,0.000531,103.975182,0.000236,0.000531,5,0.001116,5.037163,...,0,0,0,0,0,0,0,0,0,0
2,1.425909e-07,0.001997,0.035759,0.000838,103.975004,0.000385,0.000838,5,0.001711,4.279575,...,0,0,0,0,0,0,0,0,0,0
3,7.172681e-08,0.001103,0.059005,0.000362,103.974644,0.00036,0.000362,5,0.000852,1.455363,...,0,0,0,0,0,0,0,0,0,0
4,8.06871e-08,0.001467,0.037502,0.000625,103.974741,0.000275,0.000625,5,0.00125,4.434898,...,0,0,0,0,0,0,0,0,0,0
