In [119]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [120]:
import numpy as np
import sklearn
import sklearn.metrics
import sklearn.neighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the training, test and submission (masked test) data.
train_df_org = pd.read_csv("train_final_v2.csv")
test_df_org = pd.read_csv("test_final_v2.csv")
submission_org = pd.read_csv("test_masked_v2.csv")

# Encode the 'urban' flag as a number: 'R' -> 0, 'U' -> 1. Any other value
# becomes NaN and is filled by the median imputation below.
for frame in (train_df_org, test_df_org, submission_org):
    frame['urban_bi'] = frame['urban'].map({'R': 0, 'U': 1})

feature_cols = [
    'BLUE', 'RED', 'NIR', 'THERMAL', 'PANCH',
    'lat', 'lon', 'asset_index', 'urban_bi'
]

label_cols = ['water_index']

# Replace missing feature values with the median of that column.
# Each frame is imputed with its OWN medians (train with train, test with
# test, submission with submission).
# fillna is used instead of replace(np.NaN, ...) because the np.NaN alias was
# removed in NumPy 2.0.
for frame in (train_df_org, test_df_org, submission_org):
    for col in feature_cols:
        frame[col] = frame[col].fillna(frame[col].median())

# Keep only the model inputs and the target for the labelled frames.
train_df = train_df_org[(feature_cols + label_cols)]
test_df = test_df_org[(feature_cols + label_cols)]

# Split each labelled frame into its feature matrix and its target column(s);
# the submission frame only contributes features.
X_train, y_train = train_df[feature_cols], train_df[label_cols]
X_test, y_test = test_df[feature_cols], test_df[label_cols]
X_sub = submission_org[feature_cols]

# Pool the provided test and train rows (test rows first) so they can be
# re-split at a ratio of our choosing.
X_total = pd.concat([X_test, X_train])
y_total = pd.concat([y_test, y_train])

test_ratio = 0.2

X_train, X_test, y_train, y_test = train_test_split(
    X_total,
    y_total,
    test_size=test_ratio,
    random_state=42,
)

# Standardise the features: the scaler is fitted on the training split only
# and the same transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

# Search over k and four feature/weighting configurations, scoring each fit by
# its MSE on the held-out split:
#   Config 1: scaled X     + weights='distance'   (logged as "distance")
#   Config 2: scaled X     + weights='uniform'    (logged as "uniform")
#   Config 3: non-scaled X + weights='distance'   (logged as "NS - distance")
#   Config 4: non-scaled X + weights='uniform'    (logged as "NS - uniform")
# NOTE(review): the winning configuration is picked on the same split that
# produces the reported MSE, so min_mse is an optimistic estimate.

# (log label, features to fit on, features to evaluate on, KNN weights).
# The order is significant: it fixes the order of the log lines, and on an
# exact MSE tie the earlier configuration is kept.
configurations = [
    ('distance', X_train_scaled, X_test_scaled, 'distance'),
    ('uniform', X_train_scaled, X_test_scaled, 'uniform'),
    ('NS - distance', X_train, X_test, 'distance'),
    ('NS - uniform', X_train, X_test, 'uniform'),
]

# Start from +inf rather than a hard-coded 1: with a finite sentinel, a target
# whose MSE never drops below it would leave `config` empty and `min_mse`
# holding a value that was never actually measured.
min_mse = float('inf')
config = ""

for k in range(5, 50):
    for label, X_fit, X_eval, weights in configurations:
        knn = sklearn.neighbors.KNeighborsRegressor(n_neighbors=k, weights=weights, algorithm='auto')
        knn.fit(X_fit, y_train)
        preds = knn.predict(X_eval)
        mse = sklearn.metrics.mean_squared_error(y_test, preds)
        summary = f'k={k} - {label} - {mse}'
        print(summary)

        # Strict '<' so the first configuration to reach a given MSE wins.
        if mse < min_mse:
            min_mse = mse
            config = summary

k=5 - distance - 0.28815685140253455
k=5 - uniform - 0.2938289534043634
k=5 - NS - distance - 0.22953972409083162
k=5 - NS - uniform - 0.23172818041426152
k=6 - distance - 0.2864069920583599
k=6 - uniform - 0.2933459169119626
k=6 - NS - distance - 0.22468100656524662
k=6 - NS - uniform - 0.2273008830878213
k=7 - distance - 0.28252992474088956
k=7 - uniform - 0.2897413184367966
k=7 - NS - distance - 0.22284310819697406
k=7 - NS - uniform - 0.22670003425121973
k=8 - distance - 0.28114200372902964
k=8 - uniform - 0.28908148760406505
k=8 - NS - distance - 0.21854090397062362
k=8 - NS - uniform - 0.22238439984994485
k=9 - distance - 0.27973525783140685
k=9 - uniform - 0.28801674666671795
k=9 - NS - distance - 0.2178200174744992
k=9 - NS - uniform - 0.22220543119730127
k=10 - distance - 0.2795706225067337
k=10 - uniform - 0.28753687482074297
k=10 - NS - distance - 0.2174875939957364
k=10 - NS - uniform - 0.22220588518827683
k=11 - distance - 0.27789143230812924
k=11 - uniform - 0.28560089937

In [121]:
# Display the lowest MSE recorded by the configuration search.
min_mse

0.21633387219125957

In [125]:
"""
After trying different configurations, the optimal(lowest MSE) configuration was:
k=14 - Not Scaled - weights=distance - MSE=0.22897112891895058
"""

#Now fit knn using X_total+y_total(test and train combined) for best k
k = 14
knn = sklearn.neighbors.KNeighborsRegressor(n_neighbors=k, weights='distance', algorithm='auto')
knn.fit(X_total, y_total)
preds = knn.predict(X_test)
mse = sklearn.metrics.mean_squared_error(y_test, preds)

#Predicting and building submission file
X_sub = submission_org[feature_cols]
solution_df = submission_org.copy()
solution_df["water_index"] = knn.predict(X_sub)
filtered_solution_df = solution_df[["DHSID_EA", "water_index"]]

In [126]:
# Display the submission frame (DHSID_EA plus the predicted water_index).
filtered_solution_df

Unnamed: 0,DHSID_EA,water_index
0,IA-2015-7-00010009,5.000000
1,IA-2015-7-00010011,4.949983
2,IA-2015-7-00010017,4.644996
3,IA-2015-7-00010044,4.895529
4,IA-2015-7-00010060,5.000000
...,...,...
2670,IA-2015-7-00360403,4.966319
2671,IA-2015-7-00360454,4.941686
2672,IA-2015-7-00360474,4.835530
2673,IA-2015-7-00360476,4.657228


In [124]:
# Write the submission frame to disk.
# NOTE(review): to_csv is called with its default index setting, so the row
# index is written as an extra unnamed first column -- confirm the expected
# submission format allows that (otherwise pass index=False).
filtered_solution_df.to_csv("solution_21.csv")