In [120]:

import pandas as pd
import geopandas as gpd
import xgboost
import sys
from sklearn.preprocessing import LabelEncoder
import numpy as np
from math import * 
from tqdm import tqdm
from sklearn import model_selection
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.cluster import DBSCAN, OPTICS

import json
import matplotlib.pyplot as plt
import matplotlib as mpl
# Render every figure at 250 DPI.
mpl.rcParams['figure.dpi'] = 250
# Apply matplotlib's 'dark_background' style sheet to all subsequent plots.
plt.style.use('dark_background')

from utils import plot_cluster, load_list, save_list

In [18]:
# Load the train / test feature tables from their CSV files.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train_df.csv', 'test_df.csv')
)

In [3]:
# Load the original GeoJSON datasets with geopandas.
# NOTE(review): `index_col` is a pandas.read_csv option and is not a documented
# geopandas.read_file parameter; extra keywords are forwarded to the underlying
# I/O engine — confirm that it has any effect here (or drop it).
train_df_origin = gpd.read_file("train.geojson", index_col=0)
test_df_origin = gpd.read_file("test.geojson", index_col=0)

In [149]:
# Columns that will be augmented with nearest-neighbour statistics.
# Keys are column names; each value says whether the column is numerical (True)
# or categorical (False). Every column listed here is currently treated as numerical;
# set cols_is_num[name] = False to switch one to the categorical aggregation.
cols_is_num = dict.fromkeys(
    (
        'area',
        'length',
        'area/length**2',
        'elongation',
        # Irrelevant as a feature; kept as a sanity check of the augmentation
        # (knn_mean_centroid_x should be about the same value as centroid_x).
        'centroid_x',
        'height',
        'width',
        'nb_points',
        'centroid_dist',
        'length/width',
        'Dense Urban',
        'Industrial',
        'None',
        'Rural',
        'Sparse Urban',
        'Urban Slum',
        'Barren Land',
        'Coastal',
        'Dense Forest',
        'Desert',
        'Farms',
        'Grass Land',
        'Hills',
        'Lakes',
        'None.1',
        'River',
        'Snow',
        'Sparse Forest',
    ),
    True,
)

In [142]:
from sklearn.neighbors import NearestNeighbors

def find_knn(i, knn_object, df_aug):
    '''Return the rows of df_aug that are the nearest neighbours of its i-th row.

    i : positional index (as used by .iloc) of the query row in df_aug
    knn_object : a fitted NearestNeighbors-like object exposing
                 kneighbors(X, return_distance=False)
    df_aug : dataframe with 'centroid_x' and 'centroid_y' columns

    The first neighbour returned by knn_object is discarded, on the assumption
    that it is the query point itself, so a knn_object configured with k+1
    neighbours yields k rows.
    '''
    query_point = df_aug[['centroid_x', 'centroid_y']].iloc[i].to_numpy().reshape(-1, 2)
    neighbour_positions = knn_object.kneighbors(query_point, return_distance=False)
    # Single query row -> take row 0; skip column 0 (the point itself).
    return df_aug.iloc[neighbour_positions[0, 1:]]
    

def nearest_buildings_augmentation_mean(df_aug, k = 10, columns = None):
    '''Aggregate, for every row of df_aug, the features of its k nearest neighbours.

    Neighbours are found by Euclidean distance on ('centroid_x', 'centroid_y').
    For each augmented column the result holds
      - 'knn_mean_<col>' : the mean over the k neighbours, for numerical columns;
      - 'knn_<col>'      : the most frequent value among the k neighbours, for
                           categorical columns (NaN if every neighbour value is missing;
                           ties resolve to the smallest value, as returned by Series.mode).

    df_aug : an already augmented dataframe containing 'centroid_x', 'centroid_y'
             and every column to aggregate
    k : the number of nearest neighbours
    columns : optional mapping {column name: is_numerical}; defaults to the
              notebook-level 'cols_is_num'

    Returns a new dataframe with one row per row of df_aug (in the same order, with a
    fresh default index).
    '''
    if columns is None:
        columns = cols_is_num

    # Extract the coordinates once instead of re-slicing the dataframe for every row.
    coords = df_aug[['centroid_x', 'centroid_y']].to_numpy()
    knn_object = NearestNeighbors(n_neighbors=k+1).fit(coords)

    # One batched query for all rows. k+1 neighbours are requested and the first one is
    # dropped, on the assumption that it is the query point itself.
    neighbor_positions = knn_object.kneighbors(coords, return_distance=False)[:, 1:]

    # Only the columns being aggregated are needed inside the loop.
    features = df_aug[list(columns)]

    rows = list()
    for positions in tqdm(neighbor_positions):
        neighbors = features.iloc[positions]
        row = dict()
        for col, is_num in columns.items():
            if is_num:
                row[f"knn_mean_{col}"] = neighbors[col].mean()
            else:
                # value_counts().argmax() would return an integer position (always 0,
                # since value_counts sorts descending), not the value itself.
                modes = neighbors[col].mode()
                row[f"knn_{col}"] = modes.iloc[0] if len(modes) else np.nan
        rows.append(row)
    return pd.DataFrame(rows)

# Build the k=10 neighbour-aggregated features for the training set and persist them.
# This is slow: the recorded run below took about 47 minutes for 309,736 rows.
df = nearest_buildings_augmentation_mean(train_df, k = 10)
# Saved with the default index, so the CSV gets a leading unnamed index column.
df.to_csv('train_df_knn_mean.csv')
df.head()


100%|██████████| 309736/309736 [47:00<00:00, 109.81it/s]   


In [None]:
# def nearest_buildings_augmentation_concatenate(df_aug, k = 1):
#     '''Return a dataframe whose columns are mean or most frequent value of k nearest neighbors features. The columns thus augmented are those in 'cols_is_num'.
#     df_aug : an already augmented dataframe
#     k : the number of nearest neighbors
#     '''
#     df = list()
#     knn_object = NearestNeighbors(n_neighbors=k+1).fit(np.array(df_aug[['centroid_x', 'centroid_y']]))
        
#     for i in tqdm(range(len(df_aug))):
#         dic = dict()
#         arr = np.array(df_aug[['centroid_x', 'centroid_y']].iloc[i])
#         indices = knn_object.kneighbors(arr.reshape(-1,2), return_distance = False)[0, 1:]
#         print(indices)
#         print(df_aug.iloc[indices][cols_is_num.keys()])
        

#         for col in cols_is_num.keys():
#             for n in range(1,k+1):
#                 dic[f"{n}th_nn_{col}"] = None                  #Return mean 
#                 raise
#         df.append(dic)
#     return pd.DataFrame(df)

# df = nearest_buildings_augmentation_concatenate(train_df[:100], k = 2)
# df.to_csv('train_df_knn_concat.csv')
# df.head()