In [1]:

import pandas as pd
import geopandas as gpd
import xgboost
import sys
from sklearn.preprocessing import LabelEncoder
import numpy as np
from math import * 
from tqdm import tqdm
from sklearn import model_selection
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.cluster import DBSCAN, OPTICS

import json
import matplotlib.pyplot as plt
import matplotlib as mpl
# Plot configuration: high-resolution figures drawn with the dark background style.
mpl.rc('figure', dpi=250)
plt.style.use('dark_background')

from utils import plot_cluster, load_list, save_list

In [2]:
# Read the train and test feature tables from the CSV files in the working directory.
train_df = pd.read_csv(filepath_or_buffer='train_df.csv')
test_df = pd.read_csv(filepath_or_buffer='test_df.csv')

In [3]:
# Read the original GeoJSON files.
# `index_col` is a pandas.read_csv argument, not a geopandas.read_file one: geopandas only
# forwards unknown keywords to its I/O engine, so it is not passed here.
train_df_origin = gpd.read_file("train.geojson")
test_df_origin = gpd.read_file("test.geojson")

In [4]:
# Columns to augment with neighbourhood information.
# Each key is a column name; its value is True when the column is numerical and False when it is categorical.
# Every column listed here is flagged as numerical.
cols_is_num = dict.fromkeys(
    (
        'area',
        'length',
        'area/length**2',
        'elongation',
        'centroid_x',      # Irrelevant as a feature; left in to check that the augmentation works (the neighbours' value should be about the same as centroid_x)
        'height',
        'width',
        'nb_points',
        'centroid_dist',
        'length/width',
        'Dense Urban',
        'Industrial',
        'None',
        'Rural',
        'Sparse Urban',
        'Urban Slum',
        'Barren Land',
        'Coastal',
        'Dense Forest',
        'Desert',
        'Farms',
        'Grass Land',
        'Hills',
        'Lakes',
        'None.1',
        'River',
        'Snow',
        'Sparse Forest',
    ),
    True,
)

In [5]:
from sklearn.neighbors import NearestNeighbors

def find_knn(i, knn_object, df_aug):
    '''Return a dataframe composed of the k nearest neighbors in df_aug of the i-th data of df_aug.
    i : positional index (as used by .iloc) of the data in df_aug
    knn_object : a NearestNeighbors object fitted on the 'centroid_x'/'centroid_y' columns of df_aug,
                 configured to return k+1 neighbors
    df_aug : dataframe augmented

    The queried row itself is never part of the result. It is removed by position rather than by
    assuming it comes first: when other rows share exactly the same centroid, one of them may be
    returned ahead of the queried row.
    '''
    query = np.array(df_aug[['centroid_x', 'centroid_y']].iloc[i]).reshape(1, 2)
    indices = np.asarray(knn_object.kneighbors(query, return_distance = False))[0]

    self_pos = i if i >= 0 else i + len(df_aug)      # .iloc accepts negative positions, kneighbors returns non-negative ones
    is_self = indices == self_pos
    if is_self.any():
        indices = indices[~is_self]      #Drop the data point itself, wherever it was returned
    else:
        indices = indices[:-1]           #Data point not returned at all: drop the last neighbor so that k rows remain
    return df_aug.iloc[indices]
    

def nearest_buildings_augmentation_mean(df_aug, k = 10, cols_is_num = cols_is_num):
    '''Return a dataframe whose columns are the mean or the most frequent value of the k nearest neighbors features.
    The columns thus augmented are those in 'cols_is_num'.
    df_aug : an already augmented dataframe, containing 'centroid_x' and 'centroid_y' columns
    k : the number of nearest neighbors
    cols_is_num : a dictionary whose keys are names of columns to be aggregated and whose values are True
                  if the feature is numerical, False if categorical.

    The result has one row per row of df_aug (same order), with a column "knn_mean_<col>" for each
    numerical column and "knn_<col>" for each categorical column.
    '''
    rows = list()
    #k+1 neighbors are requested because the queried point is itself among the neighbors returned.
    knn_object = NearestNeighbors(n_neighbors=k+1).fit(np.array(df_aug[['centroid_x', 'centroid_y']]))

    for i in tqdm(range(len(df_aug))):
        neighbors = find_knn(i, knn_object, df_aug)      #Find the k nearest neighbors.
        row = dict()

        for col, is_num in cols_is_num.items():
            if is_num:
                row[f"knn_mean_{col}"] = neighbors[col].mean()
            else:
                #idxmax() returns the most frequent value itself; argmax() would return
                #its position inside the counts, not the value.
                row[f"knn_{col}"] = neighbors[col].value_counts().idxmax()
        rows.append(row)
    return pd.DataFrame(rows)

# Aggregate the 50 nearest neighbors of every training row, save the result as CSV and preview it.
df = nearest_buildings_augmentation_mean(df_aug=train_df, k=50)
df.to_csv(path_or_buf='train_df_knn_mean.csv')
df.head(5)


  1%|▏         | 4485/309736 [00:23<25:24, 200.24it/s]

In [None]:
def nearest_buildings_augmentation_concatenate(df_aug, k = 1, cols = cols_is_num):
    '''Return a dataframe whose columns are features of nearest neighbors. If cols contains n feature names, this will return a n*k feature dataframe.
    df_aug : an already augmented dataframe, containing 'centroid_x' and 'centroid_y' columns
    k : the number of nearest neighbors
    cols : the names of the columns to be copied; either a dictionary whose keys are the names,
           or any other iterable of names

    The result has one row per row of df_aug (same order). For each column and each n in 1..k it
    holds a column "<n>th_nn_<col>" with the value of <col> for the n-th neighbor, neighbors being
    taken in the order returned by find_knn.
    '''
    rows = list()
    #k+1 neighbors are requested because the queried point is itself among the neighbors returned.
    knn_object = NearestNeighbors(n_neighbors=k+1).fit(np.array(df_aug[['centroid_x', 'centroid_y']]))
    col_names = list(cols)      #Iterating over a dictionary yields its keys

    for i in tqdm(range(len(df_aug))):
        df_knn = find_knn(i, knn_object, df_aug)      #Same neighbor lookup as nearest_buildings_augmentation_mean
        row = dict()

        for col in col_names:
            feature = df_knn[col]
            for n in range(k):
                row[f"{n+1}th_nn_{col}"] = feature.iloc[n]

        rows.append(row)
    return pd.DataFrame(rows)

# Copy the features of the 3 nearest neighbors of every training row, save the result as CSV and preview it.
df = nearest_buildings_augmentation_concatenate(df_aug=train_df, k=3)
df.to_csv(path_or_buf='train_df_knn_concat.csv')
df.head(5)

100%|██████████| 309736/309736 [35:16<00:00, 146.37it/s]   


Unnamed: 0,1th_nn_area,2th_nn_area,3th_nn_area,1th_nn_length,2th_nn_length,3th_nn_length,1th_nn_area/length**2,2th_nn_area/length**2,3th_nn_area/length**2,1th_nn_elongation,...,3th_nn_None.1,1th_nn_River,2th_nn_River,3th_nn_River,1th_nn_Snow,2th_nn_Snow,3th_nn_Snow,1th_nn_Sparse Forest,2th_nn_Sparse Forest,3th_nn_Sparse Forest
0,4.701495e-07,3.475792e-07,3.585905e-07,0.002917,0.002729,0.002763,0.055257,0.046674,0.04697,0.001024,...,0,0,1,1,0,0,0,1,1,1
1,1.237159e-06,4.536881e-07,3.516945e-07,0.005545,0.002875,0.002763,0.040231,0.054875,0.046052,0.002244,...,0,1,0,1,0,0,0,1,1,1
2,1.237159e-06,4.701495e-07,1.494868e-07,0.005545,0.002917,0.001682,0.040231,0.055257,0.052813,0.002244,...,0,1,0,1,0,0,0,1,1,1
3,3.516945e-07,3.475792e-07,2.905206e-07,0.002763,0.002729,0.002606,0.046052,0.046674,0.042784,0.00111,...,0,1,1,1,0,0,0,1,1,1
4,3.655977e-07,3.58368e-07,7.963734e-08,0.002749,0.002704,0.001148,0.04838,0.049023,0.060472,0.001063,...,0,1,1,1,0,0,0,1,1,1
