In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale, PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor

from radio_snr import *

In [7]:
df = pd.read_csv('wspr_sample.csv', index_col=0)
df.head()

Unnamed: 0,spot,timestamp,reporter,reporter_grid,snr,frequency,tx_call,tx_grid,power,drift,distance,azimuth,band,version,code
4541493,1093722523,1520245320,DC5AL-R,JO31lk,9,7.040113,G0NJS,IO91vs,37,0,496,91,7,,0
4692526,1093866340,1520259000,KA3JIJ,EM84cj,-27,10.140175,WB0KSL,EM28nu,37,0,1105,113,10,,0
17907988,1107180087,1521372240,PI9ESA,JO22ff,-5,10.140179,IQ6KX,JN63so,20,0,1172,328,10,,0
10270295,1099476317,1520727720,PA0EHG,JO22hb,-17,3.594176,PA7MDJ,JO21is,23,0,33,350,3,,0
29854808,1119235300,1522494360,AG5OV,EL09,-10,7.040137,WA4KFZ,FM18gv,37,0,2236,249,7,1.9.0-rc3,0


In [8]:
df = preprocess_data(df)
df.head()

Unnamed: 0,spot,reporter,snr,frequency,tx_call,power,drift,distance,azimuth,band,version,code,rx_lat,rx_long,tx_lat,tx_long,day,hour
4541493,1093722523,DC5AL-R,9,7.040113,G0NJS,37,0,496,91,7,,0,51.4375,6.958333,51.770833,-0.208333,17595,10
4692526,1093866340,KA3JIJ,-27,10.140175,WB0KSL,37,0,1105,113,10,,0,34.395833,-83.791667,38.854167,-94.875,17595,14
17907988,1107180087,PI9ESA,-5,10.140179,IQ6KX,20,0,1172,328,10,,0,52.229167,4.458333,43.604167,13.541667,17608,11
10270295,1099476317,PA0EHG,-17,3.594176,PA7MDJ,23,0,33,350,3,,0,52.0625,4.625,51.770833,4.708333,17601,0
29854808,1119235300,AG5OV,-10,7.040137,WA4KFZ,37,0,2236,249,7,1.9.0-rc3,0,29.0,-100.0,38.895833,-77.458333,17621,11


In [9]:
targets = df['snr']
features = df[['power', 'frequency', 'drift', 'distance', 'azimuth', 'band', 'rx_lat', 'rx_long', 'tx_lat', 'tx_long', 'day', 'hour']]
features.head()

Unnamed: 0,power,frequency,drift,distance,azimuth,band,rx_lat,rx_long,tx_lat,tx_long,day,hour
4541493,37,7.040113,0,496,91,7,51.4375,6.958333,51.770833,-0.208333,17595,10
4692526,37,10.140175,0,1105,113,10,34.395833,-83.791667,38.854167,-94.875,17595,14
17907988,20,10.140179,0,1172,328,10,52.229167,4.458333,43.604167,13.541667,17608,11
10270295,23,3.594176,0,33,350,3,52.0625,4.625,51.770833,4.708333,17601,0
29854808,37,7.040137,0,2236,249,7,29.0,-100.0,38.895833,-77.458333,17621,11


In [5]:
# features.loc[:,:] = scale(features)
features = (features - features.mean())/features.std()
features.describe().round(5)

Unnamed: 0,power,frequency,drift,distance,azimuth,band,rx_lat,rx_long,tx_lat,tx_long,day,hour
count,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0
mean,-0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-4.1368,-0.69992,-5.52646,-0.79161,-1.56107,-0.76682,-7.23618,-2.41661,-9.01892,-2.43087,-1.61357,-1.85304
25%,-0.96196,-0.22268,0.18716,-0.49886,-0.93534,-0.2357,-0.29843,-1.00946,-0.31605,-1.02755,-0.83651,-0.79488
50%,0.00429,-0.07377,0.18716,-0.30355,-0.06474,-0.05866,0.29263,0.50918,0.22137,0.4197,-0.05946,0.11212
75%,0.97055,0.20201,0.18716,0.04482,0.98722,0.2069,0.51329,0.61464,0.53559,0.58631,0.82861,0.86795
max,3.73128,41.55586,5.90078,7.26487,1.70365,41.36862,1.57457,3.58487,2.53988,3.5929,1.71668,1.62378


In [6]:
mse = []
r2 = []
max_n = 100

for n in range(1, max_n):
    if n % 10 == 0:
        print(n)
    knn = KNeighborsRegressor(n_neighbors=n)
    tmp_mse = np.sqrt(-1*cross_val_score(knn, features, targets, scoring='neg_mean_squared_error').mean())
    mse.append(tmp_mse)
    knn.fit(features, targets)
    tmp_r2 = knn.score(features, targets)
    r2.append(tmp_r2)
    
err = pd.DataFrame()
err['num_neighbors'] = list(range(1,max_n))
err['test_rmse'] = mse
err['r^2'] = r2
err.sort_values('test_rmse').head()

10
20
30
40
50
60
70
80
90


Unnamed: 0,num_neighbors,test_rmse,r^2
52,53,9.13537,0.124779
53,54,9.136414,0.124768
54,55,9.136869,0.123761
51,52,9.136992,0.125185
49,50,9.137162,0.126633


In [7]:
dist_mse = []
dist_r2 = []
max_n = 100
for n in range(1, max_n):
    if n % 10 == 0:
        print(n)
    knn = KNeighborsRegressor(n_neighbors=n, n_jobs=-1, weights='distance')
    tmp_mse = np.sqrt(-1*cross_val_score(knn, features, targets, scoring='neg_mean_squared_error').mean())
    dist_mse.append(tmp_mse)
    knn.fit(features, targets)
    tmp_r2 = knn.score(features, targets)
    dist_r2.append(tmp_r2)
    
err = pd.DataFrame()
err['num_neighbors'] = list(range(1,max_n))
err['test_rmse'] = dist_mse
err['r^2'] = dist_r2
err.sort_values('test_rmse').head()

10
20
30
40
50
60
70
80
90


Unnamed: 0,num_neighbors,test_rmse,r^2
52,53,9.088586,0.999957
53,54,9.089461,0.999957
49,50,9.089666,0.999957
51,52,9.089725,0.999957
54,55,9.090295,0.999957


In [8]:
dist_sq_mse = []
dist_sq_r2 = []
max_n = 100
step = 5
for n in range(1, max_n, step):
    print(n)
    knn = KNeighborsRegressor(n_neighbors=n, weights=lambda x: 1.0/np.power(x+0.0001, 2))
    tmp_mse = np.sqrt(-1*cross_val_score(knn, features, targets, scoring='neg_mean_squared_error').mean())
    dist_sq_mse.append(tmp_mse)
    knn.fit(features, targets)
    tmp_r2 = knn.score(features, targets)
    dist_sq_r2.append(tmp_r2)
    
err = pd.DataFrame()
err['num_neighbors'] = list(range(1,max_n, step))
err['test_rmse'] = dist_sq_mse
err['r^2'] = dist_sq_r2
err.sort_values('test_rmse').head()

1
6
11
16
21
26
31
36
41
46
51
56
61
66
71
76
81
86
91
96


Unnamed: 0,num_neighbors,test_rmse,r^2
17,86,9.064719,0.999915
18,91,9.065375,0.999915
19,96,9.065831,0.999915
16,81,9.065974,0.999915
14,71,9.066815,0.999915


In [9]:
from itertools import combinations
feature_names = ['power', 'frequency', 'drift', 'distance', 'azimuth', 'band', 'rx_lat', 'rx_long', 'tx_lat', 'tx_long', 'day', 'hour']
best = pd.DataFrame(columns=['features', 'n_neighbors', 'test_rmse', 'r^2'])
pos = 0
for feat_num in range(1, 3):#len(feature_names)):
    for combo in combinations(feature_names, feat_num):
        max_n = 100
        step = 10
        tmp_mse = []
        tmp_r2 = []
        tmp_features = features[list(combo)]
        for n in range(1, max_n, step):
            knn = KNeighborsRegressor(n_neighbors=n, weights=lambda x: 1.0/np.power(x+0.0001, 2))
            tmp_mse.append(np.sqrt(-1*cross_val_score(knn, tmp_features, targets, scoring='neg_mean_squared_error').mean()))
            knn.fit(tmp_features, targets)
            tmp_r2.append(knn.score(tmp_features, targets))
        nparr = np.array(tmp_mse)
        maxidx = nparr.argmin()
        min_mse = nparr[maxidx]
        min_r2 = np.array(tmp_r2)[maxidx]
        min_n = list(range(1, max_n, step))[maxidx]
        best.loc[pos] = [combo, min_n, min_mse, min_r2]
        if pos<100 or pos % 100 == 0:
            print(combo, min_n, min_mse, min_r2)
        pos += 1
best.sort_values('test_rmse')

('power',) 91 9.478758085667753 0.02741525783678922
('frequency',) 71 9.432845388510598 0.05608333968583168
('drift',) 91 9.549850393303366 -0.011419753212440709
('distance',) 91 10.137734221714776 0.29568270411380104
('azimuth',) 61 9.68183088128469 0.03515747207891906
('band',) 71 9.522317362259901 0.000682746196057793
('rx_lat',) 91 9.58041098195236 0.0911003556545571
('rx_long',) 91 9.595529892518856 0.0929434893206037
('tx_lat',) 81 9.431884757703061 0.11088072822942174
('tx_long',) 91 9.488028988579407 0.1197346474700134
('day',) 91 9.603247334400981 -0.007707235760880726
('hour',) 91 9.58935608572616 -0.007857877167433136
('power', 'frequency') 51 9.271541703826328 0.11131505396974328
('power', 'drift') 91 9.472638692660158 0.03219489241052986
('power', 'distance') 91 10.428985209485298 0.6433292989745983
('power', 'azimuth') 91 10.257550417502603 0.2388357354473889
('power', 'band') 91 9.346475810636367 0.061575198068507746
('power', 'rx_lat') 91 9.981975018836332 0.27666142789

Unnamed: 0,features,n_neighbors,test_rmse,r^2
12,"(power, frequency)",51,9.271542,0.111315
16,"(power, band)",91,9.346476,0.061575
8,"(tx_lat,)",81,9.431885,0.110881
26,"(frequency, band)",71,9.432237,0.055610
1,"(frequency,)",71,9.432845,0.056083
23,"(frequency, drift)",81,9.440790,0.060633
13,"(power, drift)",91,9.472639,0.032195
0,"(power,)",91,9.478758,0.027415
72,"(tx_lat, tx_long)",91,9.483893,0.216197
9,"(tx_long,)",91,9.488029,0.119735


In [10]:
dist_sq_mse = []
dist_sq_r2 = []
max_n = 100
step = 1

select_feats = features[['tx_lat', 'frequency', 'power', 'tx_long']]
for n in range(1, max_n, step):
    print(n)
    knn = KNeighborsRegressor(n_neighbors=n, weights=lambda x: 1.0/np.power(x+0.0001, 2))
    tmp_mse = np.sqrt(-1*cross_val_score(knn, select_feats, targets, scoring='neg_mean_squared_error').mean())
    dist_sq_mse.append(tmp_mse)
    knn.fit(select_feats, targets)
    tmp_r2 = knn.score(select_feats, targets)
    dist_sq_r2.append(tmp_r2)
    
err = pd.DataFrame()
err['num_neighbors'] = list(range(1,max_n, step))
err['test_rmse'] = dist_sq_mse
err['r^2'] = dist_sq_r2
err.sort_values('test_rmse')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


Unnamed: 0,num_neighbors,test_rmse,r^2
98,99,9.629360,0.316406
97,98,9.629659,0.316954
96,97,9.629685,0.316635
95,96,9.629827,0.316827
94,95,9.629952,0.316892
93,94,9.630237,0.316818
59,60,9.630467,0.319521
92,93,9.630694,0.317056
60,61,9.630878,0.319480
85,86,9.630884,0.317657


In [11]:
dist_sq_mse = []
dist_sq_r2 = []
max_n = 100
step = 5

select_feats = features[['tx_lat', 'tx_long', 'distance', 'frequency']]
for n in range(1, max_n, step):
    print(n)
    knn = KNeighborsRegressor(n_neighbors=n, weights=lambda x: 1.0/np.power(x+0.0001, 2))
    tmp_mse = np.sqrt(-1*cross_val_score(knn, select_feats, targets, scoring='neg_mean_squared_error').mean())
    dist_sq_mse.append(tmp_mse)
    knn.fit(select_feats, targets)
    tmp_r2 = knn.score(select_feats, targets)
    dist_sq_r2.append(tmp_r2)
    
err = pd.DataFrame()
err['num_neighbors'] = list(range(1,max_n, step))
err['test_rmse'] = dist_sq_mse
err['r^2'] = dist_sq_r2
err.sort_values('test_rmse')

1
6
11
16
21
26
31
36
41
46
51
56
61
66
71
76
81
86
91
96


Unnamed: 0,num_neighbors,test_rmse,r^2
19,96,9.561921,0.902099
18,91,9.56588,0.902099
17,86,9.569608,0.902099
16,81,9.573312,0.902099
15,76,9.577797,0.902099
14,71,9.583213,0.902099
13,66,9.587326,0.902099
12,61,9.592763,0.902099
11,56,9.59931,0.902099
10,51,9.60685,0.902099


In [12]:
dist_sq_mse = []
dist_sq_r2 = []
max_n = 100
step = 1

select_feats = features[['tx_lat', 'tx_long', 'distance', 'frequency']]
log_targets = np.exp(targets)
for n in range(1, max_n, step):
    print(n)
    knn = KNeighborsRegressor(n_neighbors=n, weights='distance')#lambda x: 1.0/np.power(x+0.0001, 2))
    tmp_mse = np.sqrt(-1*cross_val_score(knn, select_feats, log_targets, scoring='neg_mean_squared_error').mean())
    dist_sq_mse.append(tmp_mse)
    knn.fit(select_feats, log_targets)
    tmp_r2 = knn.score(select_feats, log_targets)
    dist_sq_r2.append(tmp_r2)
    
err = pd.DataFrame()
err['num_neighbors'] = list(range(1,max_n, step))
err['test_rmse'] = dist_sq_mse
err['r^2'] = dist_sq_r2
err.sort_values('test_rmse')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


Unnamed: 0,num_neighbors,test_rmse,r^2
0,1,3.834376e+16,1.0
98,99,4.190542e+16,1.0
97,98,4.190923e+16,1.0
96,97,4.191313e+16,1.0
95,96,4.191586e+16,1.0
94,95,4.191978e+16,1.0
93,94,4.192388e+16,1.0
92,93,4.192683e+16,1.0
91,92,4.193075e+16,1.0
90,91,4.193509e+16,1.0


In [16]:
from sklearn.ensemble import AdaBoostRegressor

In [None]:
dims = (100, 100) # n_neighbors, n_estimators
steps = (5, 50)

ensemble_mse = pd.DataFrame()
ensemble_r2 = pd.DataFrame()
for n_neighbors in range(steps[0], dims[0], steps[0]):
    knn = KNeighborsRegressor(n_neighbors=n_neighbors, weights=lambda x: 1.0/np.power(x+0.0001, 2))
    test_rmse = []
    r2 = []
    for n_estimators in range(steps[1], dims[1], steps[1]):
        print((n_neighbors, n_estimators))
        boost = AdaBoostRegressor(base_estimator=knn, n_estimators=n_estimators)
        test_rmse.append(np.sqrt(-1 * cross_val_score(boost, features, targets, scoring='neg_mean_squared_error').mean()))
        print(test_rmse[len(test_rmse)-1])
        boost.fit(features, targets)
        r2.append(boost.score(features, targets))
    ensemble_mse[n_neighbors] = test_rmse
    ensemble_r2[n_neighbors] = r2
ensemble_mse

(5, 50)
10.303455976839789
(10, 50)
9.728717997261686
(15, 50)
9.480199488541022
(20, 50)
9.388813353602131
(25, 50)
9.31429736806715
(30, 50)
9.273559617082222
(35, 50)
9.234801447121647
(40, 50)
9.223316924017615
(45, 50)
9.197911961875253
(50, 50)
9.18737162211913
(55, 50)
9.194979537582983
(60, 50)
9.1787072961684
(65, 50)
9.181251423749174
(70, 50)
9.170268432817005


In [None]:
from sklearn.ensemble import BaggingRegressor
dims = (100, 100) # n_neighbors, n_estimators
steps = (5, 50)

ensemble_mse = pd.DataFrame()
ensemble_r2 = pd.DataFrame()
for n_neighbors in range(steps[0], dims[0], steps[0]):
    knn = KNeighborsRegressor(n_neighbors=n_neighbors, weights=lambda x: 1.0/np.power(x+0.0001, 2))
    test_rmse = []
    r2 = []
    for n_estimators in range(steps[1], dims[1], steps[1]):
        print((n_neighbors, n_estimators))
        boost = BaggingRegressor(base_estimator=knn, n_estimators=n_estimators)
        test_rmse.append(np.sqrt(-1 * cross_val_score(boost, features, targets, scoring='neg_mean_squared_error').mean()))
        print(test_rmse[len(test_rmse)-1])
        boost.fit(features, targets)
        r2.append(boost.score(features, targets))
    ensemble_mse[n_neighbors] = test_rmse
    ensemble_r2[n_neighbors] = r2
ensemble_mse

In [None]:
boost_large_mse = pd.DataFrame()
boost_large_r2 = pd.DataFrame()

knn = KNeighborsRegressor(n_neighbors=n_neighbors, weights=lambda x: 1.0/np.power(x+0.0001, 2))
test_rmse = []
r2 = []
for n_estimators in range(steps[1], dims[1], steps[1]):
    print((n_neighbors, n_estimators))
    boost = AdaBoostRegressor(base_estimator=knn, n_estimators=500)
    test_rmse.append(np.sqrt(-1 * cross_val_score(boost, features, targets, scoring='neg_mean_squared_error').mean()))
    print(test_rmse[len(test_rmse)-1])
    boost.fit(features, targets)
    r2.append(boost.score(features, targets))
boost_large_mse[n_neighbors] = test_rmse
boost_large_r2[n_neighbors] = r2

In [None]:
bagging_large_mse = pd.DataFrame()
bagging_large_r2 = pd.DataFrame()

knn = KNeighborsRegressor(n_neighbors=n_neighbors, weights=lambda x: 1.0/np.power(x+0.0001, 2))
test_rmse = []
r2 = []
for n_estimators in range(steps[1], dims[1], steps[1]):
    print((n_neighbors, n_estimators))
    boost = BaggingRegressor(base_estimator=knn, n_estimators=500)
    test_rmse.append(np.sqrt(-1 * cross_val_score(boost, features, targets, scoring='neg_mean_squared_error').mean()))
    print(test_rmse[len(test_rmse)-1])
    boost.fit(features, targets)
    r2.append(boost.score(features, targets))
bagging_large_mse[n_neighbors] = test_rmse
bagging_large_r2[n_neighbors] = r2

## Testing with PCA

In [3]:
from sklearn.decomposition import PCA

In [10]:
feats_std = (features - features.mean())/features.std()
feats_std.describe().round(5)

Unnamed: 0,power,frequency,drift,distance,azimuth,band,rx_lat,rx_long,tx_lat,tx_long,day,hour
count,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0
mean,-0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-4.1368,-0.69992,-5.52646,-0.79161,-1.56107,-0.76682,-7.23618,-2.41661,-9.01892,-2.43087,-1.61357,-1.85304
25%,-0.96196,-0.22268,0.18716,-0.49886,-0.93534,-0.2357,-0.29843,-1.00946,-0.31605,-1.02755,-0.83651,-0.79488
50%,0.00429,-0.07377,0.18716,-0.30355,-0.06474,-0.05866,0.29263,0.50918,0.22137,0.4197,-0.05946,0.11212
75%,0.97055,0.20201,0.18716,0.04482,0.98722,0.2069,0.51329,0.61464,0.53559,0.58631,0.82861,0.86795
max,3.73128,41.55586,5.90078,7.26487,1.70365,41.36862,1.57457,3.58487,2.53988,3.5929,1.71668,1.62378


In [14]:
pca = PCA(n_components=4)
pca.fit(feats_std)
pca_feats = pca.fit_transform(feats_std)
pca_feats = pd.DataFrame(pca_feats, columns=["PC"+str(i) for i in range(pca_feats.shape[1])])
pca_feats.head()

Unnamed: 0,PC0,PC1,PC2,PC3
0,-0.769986,0.628255,-0.014707,0.454797
1,1.002528,-1.175098,0.885207,0.508071
2,-0.425838,1.033223,-0.017978,-0.975244
3,-1.36827,0.418646,0.093124,-1.05126
4,1.11331,-1.732891,0.427119,-0.817783


In [15]:
pca_mse = []
pca_r2 = []
max_n = 100
step = 5
for n in range(1, max_n, step):
    print(n)
    knn = KNeighborsRegressor(n_neighbors=n) #weights=lambda x: 1.0/np.power(x+0.0001, 2))
    tmp_mse = np.sqrt(-1*cross_val_score(knn, pca_feats, targets, scoring='neg_mean_squared_error').mean())
    pca_mse.append(tmp_mse)
    knn.fit(features, targets)
    tmp_r2 = knn.score(features, targets)
    pca_r2.append(tmp_r2)
    
err = pd.DataFrame()
err['num_neighbors'] = list(range(1,max_n, step))
err['test_rmse'] = pca_mse
err['r^2'] = pca_r2
err.sort_values('test_rmse').head()

1
6
11
16
21
26
31
36
41
46
51
56
61
66
71
76
81
86
91
96


Unnamed: 0,num_neighbors,test_rmse,r^2
8,41,9.251347,0.101079
7,36,9.254017,0.105466
10,51,9.25603,0.092385
14,71,9.257373,0.081702
9,46,9.25747,0.095185


In [16]:
# location_feats = df[['rx_lat', 'rx_long', 'tx_lat', 'tx_long', 'distance', 'power']]
location_feats = df[['rx_lat', 'tx_lat', 'distance', 'power', 'frequency']]
location_feats = (location_feats - location_feats.mean())/location_feats.std()
# location_feats["band"] = location_feats["band"] * 1.48
targets = df['snr']

In [17]:
location_feats.describe()

Unnamed: 0,rx_lat,tx_lat,distance,power,frequency
count,15241.0,15241.0,15241.0,15241.0,15241.0
mean,2.261074e-14,1.957769e-15,-1.2776930000000001e-17,-7.639676e-16,3.670197e-16
std,1.0,1.0,1.0,1.0,1.0
min,-7.236183,-9.018916,-0.7916091,-4.136801,-0.6999214
25%,-0.2984347,-0.3160459,-0.498858,-0.9619623,-0.222681
50%,0.2926267,0.2213677,-0.3035468,0.004292979,-0.0737708
75%,0.5132897,0.5355931,0.04482279,0.9705483,0.2020107
max,1.574573,2.539881,7.264869,3.731278,41.55586


In [18]:
loc_mse = []
loc_r2 = []
loc_test_r2 = []
max_n = 100
step = 1

for n in range(1, max_n, step):
    print(n)
    knn = KNeighborsRegressor(n_neighbors=n)
    tmp_mse = np.sqrt(-1*cross_val_score(knn, location_feats, targets, scoring='neg_mean_squared_error').mean())
    loc_mse.append(tmp_mse)
    tmp_test_r2 = cross_val_score(knn, location_feats, targets, scoring='r2').mean()
    loc_test_r2.append(tmp_test_r2)
    knn.fit(location_feats, targets)
    tmp_r2 = knn.score(location_feats, targets)
    loc_r2.append(tmp_r2)
    
err = pd.DataFrame()
err['num_neighbors'] = list(range(1,max_n, step))
err['test_rmse'] = loc_mse
err['train r^2'] = loc_r2
err['test r^2'] = loc_test_r2
err.sort_values('test_rmse')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


Unnamed: 0,num_neighbors,test_rmse,train r^2,test r^2
22,23,8.916277,0.210824,0.126572
24,25,8.916791,0.206499,0.126476
25,26,8.916979,0.202886,0.126442
23,24,8.918217,0.208687,0.126204
20,21,8.918425,0.217442,0.126177
39,40,8.919244,0.179471,0.126092
30,31,8.919581,0.192803,0.125983
21,22,8.919627,0.213145,0.125916
31,32,8.919650,0.190448,0.125975
35,36,8.920372,0.185252,0.125867
