### Appending Earth Observation Data:

For each soil sample location (lat/lon in Train/Test), add one feature per earth-observation variable: the mean of the nearby observations, weighted by their distance from the sample location.

In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsRegressor
import os
from tqdm import tqdm
# Show every column when a DataFrame is displayed (no truncation with "...").
pd.options.display.max_columns = None

In [3]:
# Load the soil-sample tables; both are read with pandas' default CSV settings.
train, test = map(pd.read_csv, ("Train.csv", "Test.csv"))

#### KNN model:
For each file of earth observation data:

1. Train a KNN regression model that predicts the earth observations from location (lat, lon).

2. Use the model to compute, for each train and test location, a distance-weighted average of the observations closest to it.

In [58]:
def fit_model(data: pd.DataFrame, target_columns: list, n_neighbors=50, weights="distance"):
    """Fit a k-nearest-neighbours regressor that maps (lat, lon) to observations.

    Parameters
    ----------
    data : pd.DataFrame
        Observation table. Must contain "lat" and "lon" columns plus every
        column listed in ``target_columns``.
    target_columns : list
        Columns of ``data`` to predict from location (fitted jointly as a
        multi-output regression).
    n_neighbors : int, default 50
        Maximum number of neighbours averaged per prediction. When ``data``
        has fewer rows than this, the number of rows is used instead.
    weights : str or callable, default "distance"
        Passed unchanged to ``KNeighborsRegressor``.

    Returns
    -------
    KNeighborsRegressor
        The fitted model.
    """
    coordinates = data[["lat", "lon"]]
    observations = data[target_columns]

    # scikit-learn accepts n_neighbors > n_samples at fit time but then raises
    # ValueError on predict. Clamp so small observation files still work.
    # Empty input is left alone so the usual fit-time error is unchanged.
    sample_count = len(data)
    effective_neighbors = n_neighbors
    if 0 < sample_count < n_neighbors:
        effective_neighbors = sample_count

    knn = KNeighborsRegressor(
        n_jobs=-1, n_neighbors=effective_neighbors, weights=weights)
    return knn.fit(coordinates, observations)

In [59]:
def add_features(knn: KNeighborsRegressor, data: pd.DataFrame, target_columns: list):
    """Append KNN-predicted observation columns to ``data``.

    Parameters
    ----------
    knn : KNeighborsRegressor
        Model already fitted on (lat, lon) features.
    data : pd.DataFrame
        Table with "lat" and "lon" columns; it is not modified.
    target_columns : list
        Names given to the predicted columns, in the same order as the
        targets the model was fitted on.

    Returns
    -------
    pd.DataFrame
        A new frame: ``data`` with one extra column per entry of
        ``target_columns``.
    """
    predicted = knn.predict(data[["lat", "lon"]])
    # Reuse data's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows (and introduce NaN padding) whenever
    # ``data`` has been filtered, shuffled, or otherwise re-indexed.
    predicted = pd.DataFrame(
        data=predicted, columns=target_columns, index=data.index)
    return pd.concat([data, predicted], axis=1)

In [None]:
def _unique_column_names(candidates, taken):
    """Rename ``candidates`` so none collides with ``taken`` or with each other."""
    used = set(taken)
    unique = []
    for name in candidates:
        new_name, suffix = name, 0
        # Same "name.1", "name.2", ... scheme pandas applies to duplicate
        # headers in read_csv, so a CSV round trip yields identical names.
        while new_name in used:
            suffix += 1
            new_name = f"{name}.{suffix}"
        used.add(new_name)
        unique.append(new_name)
    return unique


def add_earth_data(path: str, train: pd.DataFrame, test: pd.DataFrame):
    """Add location-averaged features from one earth-observation CSV.

    Parameters
    ----------
    path : str
        CSV file with "lat" and "lon" columns plus observation columns.
    train, test : pd.DataFrame
        Sample tables with "lat" and "lon" columns; neither is modified.

    Returns
    -------
    tuple of pd.DataFrame
        ``(new_train, new_test)``, each with one extra column per numeric
        observation column in the file. A column whose name already exists
        in ``train`` or ``test`` is appended as ``"<name>.1"``,
        ``"<name>.2"``, ... instead of creating a duplicate label.
    """
    df = pd.read_csv(path)
    # average observations of the same location
    aggregated = df.groupby(["lat", "lon"]).mean(
        numeric_only=True).reset_index()
    # only using numerical columns as target
    # lat lon are excluded since they represent features
    target_columns = df.select_dtypes(
        "number").columns.difference(["lat", "lon"])
    # get a knn model trained on data
    knn = fit_model(aggregated, target_columns)
    # Different files can share column names; keep the output labels unique
    # and identical between train and test.
    existing_columns = set(train.columns) | set(test.columns)
    feature_names = _unique_column_names(target_columns, existing_columns)
    # use the knn model to add aggregated observations to train and test sets
    new_train = add_features(knn, train, feature_names)
    new_test = add_features(knn, test, feature_names)
    return new_train, new_test

In [61]:
# Work on copies so the frames loaded from disk stay untouched.
new_train, new_test = train.copy(), test.copy()

In [62]:
# Start from the untouched train/test frames so re-running this cell does not
# append the same earth-observation columns a second time.
new_train, new_test = train.copy(), test.copy()

# os.walk yields (dirpath, dirnames, filenames); only top-level files are used.
_, _, filenames = next(os.walk("earth data"))
# os.walk returns files in arbitrary order: sort for a reproducible column
# order, and skip non-CSV entries (e.g. .DS_Store) that would break read_csv.
csv_filenames = sorted(
    name for name in filenames if name.lower().endswith(".csv"))

for filename in tqdm(csv_filenames):
    full_path = os.path.join("earth data", filename)
    new_train, new_test = add_earth_data(full_path, new_train, new_test)

100%|██████████| 8/8 [00:35<00:00,  4.47s/it]


In [65]:
# Preview the first five rows of the enriched training table.
new_train.head(n=5)

Unnamed: 0,site,PID,lon,lat,pH,alb,bio1,bio12,bio15,bio7,bp,cec20,dows,ecec20,hp20,ls,lstd,lstn,mb1,mb2,mb3,mb7,mdem,para,parv,ph20,slope,snd20,soc20,tim,wp,xhp20,BulkDensity,N,P,K,Ca,Mg,S,Fe,Mn,Zn,Cu,B,QA_PIXEL,QA_RADSAT,SR_B1,SR_B2,SR_B3,SR_B4,SR_B5,SR_B6,SR_B7,ST_B10,Nadir_Reflectance_Band1,Nadir_Reflectance_Band2,Nadir_Reflectance_Band3,Nadir_Reflectance_Band4,sur_refl_b01,sur_refl_b02,sur_refl_b03,sur_refl_b04,sur_refl_b05,sur_refl_b06,sur_refl_b07,LST_Day_1km,LST_Night_1km,EVI,NDVI,RelativeAzimuth,SolarZenith,ViewZenith,sur_refl_b01.1,sur_refl_b02.1,sur_refl_b03.1,sur_refl_b07.1,ET,PET,VH,VV,relativeOrbitNumber_start,B1,B11,B12,B2,B3,B4,B5,B6,B7,B8,B8A,B9,CLOUDY_PIXEL_PERCENTAGE,MEAN_SOLAR_ZENITH_ANGLE,NODATA_PIXEL_PERCENTAGE,SENSING_ORBIT_NUMBER
0,site_id_bIEHwl,ID_I5RGjv,70.603761,46.173798,7.75,176,248,920,108,190,0.581573,22.0,21.500278,11.00779,0.00779,0.03,44.908058,18.967873,2006.000488,3182.000732,855.000244,2363.000732,1097,20.544283,126.83548,7.05,1.962921,39.0,9.75,7.962668,0.016853,0.000708,1.46,1300,0.34,147,6830,2310,5.66,75.2,85.0,0.82,2.98,0.24,23502.396285,0.0,0.103484,0.125341,0.178989,0.221824,0.345591,0.360247,0.231166,1.079782,0.175649,0.27713,0.073566,0.125795,0.164881,0.282337,0.077741,0.125586,0.328673,0.306303,0.206862,311.756413,294.736595,0.145803,0.240233,0.492149,0.376486,0.104273,0.167284,0.266877,0.070816,0.220612,4.161121,62.746545,-14.718547,-9.103329,101.1994,565.704488,3117.167817,2568.74082,842.630403,1240.325753,1722.981748,1978.262093,2082.247405,2246.557609,2312.296706,2404.171201,2386.804356,0.000693,42.97793,0.0,135.0
1,site_id_nGvnKc,ID_8jWzJ5,70.590479,46.078924,7.1,181,250,1080,113,191,0.707011,24.0,21.389599,14.0235,0.0235,0.03,44.985626,19.730261,1637.000122,2839.000488,707.000061,2039.000488,1060,18.869566,109.835541,6.975,0.162065,40.0,8.0,8.4395,0.018321,0.001676,1.52,1400,11.7,151,1180,235,19.4,96.2,409.0,2.57,4.32,0.1,23952.685033,0.0,0.08704,0.105585,0.154613,0.196782,0.306137,0.30221,0.217227,1.10295,0.175078,0.278078,0.070546,0.119835,0.150602,0.297883,0.065749,0.113957,0.337259,0.287786,0.189664,310.862183,295.054691,0.151675,0.245489,0.379467,0.378336,0.107244,0.165992,0.271381,0.06656,0.214069,4.327898,61.16566,-14.924336,-7.25569,112.899699,563.432869,3131.473373,2572.937125,848.115291,1247.859081,1733.147803,1979.056495,2082.592956,2249.906196,2318.776269,2409.569568,2389.669003,0.000693,42.97793,0.0,135.0
2,site_id_nGvnKc,ID_UgzkN8,70.582553,46.04882,6.95,188,250,1109,111,191,0.362439,15.25,18.900057,16.062401,0.0624,0.03,44.167717,19.413284,1639.999634,2903.0,758.999939,2003.999878,1074,24.719807,214.385269,6.725,0.744845,46.0,9.25,8.289246,0.020588,0.003885,1.46,3500,21.8,151,1890,344,11.0,76.7,65.0,1.95,1.24,0.22,24043.556552,0.0,0.088952,0.10837,0.163193,0.205124,0.323409,0.306908,0.211123,1.086854,0.169608,0.297481,0.069501,0.124005,0.146294,0.299898,0.072557,0.122429,0.329579,0.275775,0.178247,310.578822,294.994168,0.191293,0.298147,0.257761,0.306571,0.11412,0.159394,0.289121,0.06584,0.207483,4.409809,60.979414,-16.580811,-9.478419,108.215299,563.471151,3134.475328,2575.014153,848.77651,1248.915717,1734.118925,1979.983578,2083.868605,2251.39686,2320.227658,2411.244651,2391.298958,0.000693,42.97793,0.0,135.0
3,site_id_nGvnKc,ID_DLLHM9,70.573267,46.02191,7.83,174,250,1149,112,191,0.531739,22.0,17.022963,18.030899,0.0309,0.03,43.281063,19.539835,1325.000122,2413.000244,631.999939,1961.0,1044,27.230274,255.713043,6.625,0.708708,43.75,10.0,8.666523,0.016913,0.001714,1.48,2300,39.9,201,6660,719,14.9,81.9,73.0,4.9,3.08,0.87,25096.652353,0.0,0.076779,0.097103,0.146538,0.180543,0.29257,0.292733,0.217778,1.102182,0.170392,0.290503,0.069566,0.12085,0.145131,0.290767,0.066968,0.112929,0.314163,0.284687,0.18585,310.750589,295.351577,0.177989,0.281924,0.397397,0.346017,0.108479,0.159408,0.280944,0.065713,0.215598,4.42487,60.917168,-16.145875,-9.228245,116.637933,556.111839,3165.061045,2604.906615,839.326546,1240.048023,1747.173415,1988.210012,2095.012023,2263.391749,2332.407245,2425.934685,2405.772316,0.000693,42.97793,0.0,135.0
4,site_id_7SA9rO,ID_d009mj,70.58533,46.204336,8.07,188,250,869,114,191,0.039202,14.75,23.103102,11.0,0.0,0.155324,45.654484,18.69072,1628.999512,2685.999023,732.999939,2427.0,1055,20.434782,86.220909,6.7,0.634153,49.25,7.0,15.139549,0.019791,0.0,1.43,940,1.0,90,7340,1160,8.66,69.4,149.0,0.55,3.03,0.31,24274.606705,0.0,0.098575,0.115409,0.171405,0.21046,0.303917,0.326277,0.250107,1.110129,0.170933,0.270334,0.072729,0.122528,0.170885,0.284847,0.083263,0.130978,0.337144,0.305781,0.224352,311.896381,294.718634,0.145673,0.237336,0.466241,0.36091,0.110338,0.161796,0.260427,0.069448,0.232596,3.571936,61.852686,-20.022234,-14.306397,110.300858,559.256845,3124.102694,2580.607567,830.440584,1228.194182,1715.826995,1972.981647,2081.568431,2245.97243,2315.278765,2405.964814,2396.027275,0.000693,42.97793,0.0,135.0


In [66]:
# Persist the enriched tables; the index is omitted from the files.
for frame, out_path in ((new_train, "train_earth.csv"), (new_test, "test_earth.csv")):
    frame.to_csv(out_path, index=False)