In [1]:

import pandas as pd
import geopandas as gpd
import xgboost
import sys
from sklearn.preprocessing import LabelEncoder
import numpy as np
from math import * 
from tqdm import tqdm
from sklearn import model_selection
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.cluster import DBSCAN, OPTICS

import json
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 250
plt.style.use('dark_background')

from utils import plot_cluster, load_list, save_list

In [2]:
train_df = pd.read_csv('train_df.csv')
test_df = pd.read_csv('test_df.csv')

In [None]:
train_df_origin = gpd.read_file("train.geojson", index_col=0)
test_df_origin = gpd.read_file("test.geojson", index_col=0)

In [4]:
#Keys are columns that will be augmented. Values tell us whether they are numerical or not (categorical).
cols_is_num = {'area':True,
            'length':True, 
            'area/length**2':True,
            'elongation':True, 
            'centroid_x':True,      #Irrelevant, let to see if augmentation works (knn_centroid_x should be about same value as centroid_x)
            'height':True, 
            'width':True, 
            'nb_points':True, 
            'centroid_dist':True, 
            'length/width':True, 
            'Dense Urban':True, 
            'Industrial':True, 
            'None':True,
            'Rural':True, 
            'Sparse Urban':True, 
            'Urban Slum':True, 
            'Barren Land':True, 
            'Coastal':True,
            'Dense Forest':True, 
            'Desert':True, 
            'Farms':True, 
            'Grass Land':True, 
            'Hills':True, 
            'Lakes':True,
            'None.1':True, 
            'River':True, 
            'Snow':True, 
            'Sparse Forest':True}

In [6]:
from sklearn.neighbors import NearestNeighbors

def find_knn(i, knn_object, df_aug):
    '''Return a dataframe composed of the k nearest neighbors in df_aug of the i-th data of df_aug.
    i: index of data
    knn_object : a NearestNeighbors object fitted on df_aug
    df_aug : dataframe augmented
    '''
    data = np.array(df_aug[['centroid_x', 'centroid_y']].iloc[i])
    indices = knn_object.kneighbors(data.reshape(-1,2), return_distance = False)[0, 1:]      #We return the k+1 nearest neighbors, except the first (the data point himself)
    return df_aug.iloc[indices]
    

def nearest_buildings_augmentation_mean(df_aug, k = 10, cols_is_num = cols_is_num):
    '''Return a dataframe whose columns are mean or most frequent value of k nearest neighbors features. The columns thus augmented are those in 'cols_is_num'.
    df_aug : an already augmented dataframe
    k : the number of nearest neighbors
    cols : a dictionnary whose keys are name of columns to be copied and values are True if feature is numerical, False if categorical.
    '''
    df = list()
    knn_object = NearestNeighbors(n_neighbors=k+1).fit(np.array(df_aug[['centroid_x', 'centroid_y']]))
        
    for i in tqdm(range(len(df_aug))):
        dic = dict()
        df_k_nearest_neightbors = find_knn(i, knn_object, df_aug)      #Find the k nearest neighbors.

        for col, is_num in cols_is_num.items():
            if is_num:
                dic[f"knn_mean_{col}"] = df_k_nearest_neightbors[col].mean()                     #Return mean 
            else:
                raise
        df.append(dic)
    return pd.DataFrame(df)

df = nearest_buildings_augmentation_mean(train_df, k = 10)
df.to_csv('train_df_knn_mean.csv')
df.head()


100%|██████████| 309736/309736 [29:42<00:00, 173.81it/s]


Unnamed: 0,knn_mean_area,knn_mean_length,knn_mean_area/length**2,knn_mean_elongation,knn_mean_centroid_x,knn_mean_height,knn_mean_width,knn_mean_nb_points,knn_mean_centroid_dist,knn_mean_length/width,...,knn_mean_Dense Forest,knn_mean_Desert,knn_mean_Farms,knn_mean_Grass Land,knn_mean_Hills,knn_mean_Lakes,knn_mean_None.1,knn_mean_River,knn_mean_Snow,knn_mean_Sparse Forest
0,3.277266e-07,0.002548,0.048818,0.000974,116.976885,0.000586,0.000891,5.6,0.002039,2.718245,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.8,0.0,1.0
1,4.051505e-07,0.002815,0.047251,0.001097,116.976954,0.00071,0.000921,5.6,0.002284,2.890926,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.9,0.0,1.0
2,6.534302e-07,0.003386,0.051704,0.001203,116.976414,0.000955,0.000967,5.6,0.002639,2.365985,...,0.0,0.0,0.2,0.8,0.0,0.1,0.0,0.7,0.0,0.7
3,4.498397e-07,0.003032,0.046904,0.00119,116.977219,0.000745,0.001012,5.6,0.002455,2.962178,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.9,0.0,1.0
4,4.251699e-07,0.002711,0.049871,0.001027,116.978213,0.000614,0.000963,5.4,0.002156,2.617975,...,0.0,0.0,0.3,0.9,0.0,0.0,0.0,1.0,0.0,1.0


In [7]:
df = nearest_buildings_augmentation_mean(test_df, k = 10)
df.to_csv('test_df_knn_mean.csv')
df.head()

100%|██████████| 121704/121704 [08:53<00:00, 228.18it/s]


Unnamed: 0,knn_mean_area,knn_mean_length,knn_mean_area/length**2,knn_mean_elongation,knn_mean_centroid_x,knn_mean_height,knn_mean_width,knn_mean_nb_points,knn_mean_centroid_dist,knn_mean_length/width,...,knn_mean_Dense Forest,knn_mean_Desert,knn_mean_Farms,knn_mean_Grass Land,knn_mean_Hills,knn_mean_Lakes,knn_mean_None.1,knn_mean_River,knn_mean_Snow,knn_mean_Sparse Forest
0,1.261485e-07,0.00143,0.049695,0.000541,103.974126,0.000342,0.000533,5.1,0.001151,2.613938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.146411e-07,0.001323,0.050888,0.000484,103.974145,0.000331,0.000476,5.1,0.001043,2.471598,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.240665e-07,0.001398,0.051385,0.000515,103.974127,0.000346,0.000507,5.1,0.001102,2.395839,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.208315e-07,0.001376,0.050714,0.000506,103.974171,0.000342,0.000497,5.1,0.001089,2.456066,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.217275e-07,0.001413,0.048563,0.000532,103.974181,0.000334,0.000524,5.1,0.001129,2.754019,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
def nearest_buildings_augmentation_concatenate(df_aug, k = 1, cols = cols_is_num):
    '''Return a dataframe whose columns are features of nearest neighbors. If cols contains n feature names, this will return a n*k feature dataframe.
    df_aug : an already augmented dataframe
    k : the number of nearest neighbors
    cols : a dictionnary whose keys are name of columns to be copied
    '''
    df = list()
    data = np.array(df_aug[['centroid_x', 'centroid_y']])
    knn_object = NearestNeighbors(n_neighbors=k+1).fit(data)     #knn object to find neighbors
        
    for i in tqdm(range(len(df_aug))):
        dic = dict()
        data = np.array(df_aug[['centroid_x', 'centroid_y']].iloc[i])
        indices = knn_object.kneighbors(data.reshape(-1,2), return_distance = False)[0, 1:] 
        df_knn = df_aug.iloc[indices]
        
        for col in cols.keys():
            feature = df_knn[col]
            for n in range(k):
                dic[f"{n+1}th_nn_{col}"] = feature.iloc[n]                  
                
        df.append(dic)
    return pd.DataFrame(df)

df = nearest_buildings_augmentation_concatenate(train_df, k = 1)
df.to_csv('train_df_knn_concat.csv')
df.head()

 34%|███▍      | 106785/309736 [08:15<3:01:04, 18.68it/s]

In [None]:
df = nearest_buildings_augmentation_concatenate(test_df, k = 1)
df.to_csv('test_df_knn_concat.csv')
df.head()