In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale, PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor

from radio_snr import *

In [3]:
df = pd.read_csv('wspr_sample.csv', index_col=0)
df.head()

Unnamed: 0,spot,timestamp,reporter,reporter_grid,snr,frequency,tx_call,tx_grid,power,drift,distance,azimuth,band,version,code
4541493,1093722523,1520245320,DC5AL-R,JO31lk,9,7.040113,G0NJS,IO91vs,37,0,496,91,7,,0
4692526,1093866340,1520259000,KA3JIJ,EM84cj,-27,10.140175,WB0KSL,EM28nu,37,0,1105,113,10,,0
17907988,1107180087,1521372240,PI9ESA,JO22ff,-5,10.140179,IQ6KX,JN63so,20,0,1172,328,10,,0
10270295,1099476317,1520727720,PA0EHG,JO22hb,-17,3.594176,PA7MDJ,JO21is,23,0,33,350,3,,0
29854808,1119235300,1522494360,AG5OV,EL09,-10,7.040137,WA4KFZ,FM18gv,37,0,2236,249,7,1.9.0-rc3,0


In [4]:
df = preprocess_data(df)
df.head()

Unnamed: 0,spot,reporter,snr,frequency,tx_call,power,drift,distance,azimuth,band,version,code,rx_lat,rx_long,tx_lat,tx_long,day,hour
4541493,1093722523,DC5AL-R,9,7.040113,G0NJS,37,0,496,91,7,,0,51.4375,6.958333,51.770833,-0.208333,17595,10
4692526,1093866340,KA3JIJ,-27,10.140175,WB0KSL,37,0,1105,113,10,,0,34.395833,-83.791667,38.854167,-94.875,17595,14
17907988,1107180087,PI9ESA,-5,10.140179,IQ6KX,20,0,1172,328,10,,0,52.229167,4.458333,43.604167,13.541667,17608,11
10270295,1099476317,PA0EHG,-17,3.594176,PA7MDJ,23,0,33,350,3,,0,52.0625,4.625,51.770833,4.708333,17601,0
29854808,1119235300,AG5OV,-10,7.040137,WA4KFZ,37,0,2236,249,7,1.9.0-rc3,0,29.0,-100.0,38.895833,-77.458333,17621,11


In [86]:
targets = df['snr']
features = df[['power', 'frequency', 'drift', 'distance', 'azimuth', 'band', 'rx_lat', 'rx_long', 'tx_lat', 'tx_long', 'day', 'hour']]
features.head()

Unnamed: 0,power,frequency,drift,distance,azimuth,band,rx_lat,rx_long,tx_lat,tx_long,day,hour
4541493,37,7.040113,0,496,91,7,51.4375,6.958333,51.770833,-0.208333,17595,10
4692526,37,10.140175,0,1105,113,10,34.395833,-83.791667,38.854167,-94.875,17595,14
17907988,20,10.140179,0,1172,328,10,52.229167,4.458333,43.604167,13.541667,17608,11
10270295,23,3.594176,0,33,350,3,52.0625,4.625,51.770833,4.708333,17601,0
29854808,37,7.040137,0,2236,249,7,29.0,-100.0,38.895833,-77.458333,17621,11


In [87]:
features.loc[:,:] = scale(features)
features.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,power,frequency,drift,distance,azimuth,band,rx_lat,rx_long,tx_lat,tx_long,day,hour
count,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0,15241.0
mean,-1.409687e-16,-5.608298000000001e-17,-6.134454e-16,-1.0984950000000001e-17,-5.812991e-17,-2.380194e-16,8.549759000000001e-17,8.884115000000001e-17,-6.110998e-16,4.401265e-17,1.635514e-13,-9.894104000000001e-17
std,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033
min,-4.136937,-0.6999444,-5.526641,-0.7916351,-1.561126,-0.7668448,-7.23642,-2.416686,-9.019212,-2.430953,-1.613624,-1.853105
25%,-0.9619939,-0.2226883,0.1871674,-0.4988743,-0.9353673,-0.2357083,-0.2984445,-1.009491,-0.3160563,-1.027582,-0.8365406,-0.7949063
50%,0.004293119,-0.07377322,0.1871674,-0.3035568,-0.06474661,-0.0586628,0.2926363,0.5091945,0.2213749,0.419715,-0.05945738,0.1121212
75%,0.9705801,0.2020174,0.1871674,0.04482426,0.9872533,0.2069054,0.5133065,0.6146588,0.5356107,0.5863322,0.8286377,0.8679775
max,3.7314,41.55722,5.900976,7.265107,1.703702,41.36998,1.574625,3.584985,2.539965,3.593015,1.716733,1.623834


In [30]:
mse = []
r2 = []
max_n = 100

for n in range(1, max_n):
    print(n)
    knn = KNeighborsRegressor(n_neighbors=n)
    tmp_mse = np.sqrt(-1*cross_val_score(knn, features, targets, scoring='neg_mean_squared_error').mean())
    mse.append(tmp_mse)
    knn.fit(features, targets)
    tmp_r2 = knn.score(features, targets)
    r2.append(tmp_r2)
    
err = pd.DataFrame()
err['num_neighbors'] = list(range(1,max_n))
err['test_rmse'] = mse
err['r^2'] = r2
err.sort_values('test_rmse')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


Unnamed: 0,num_neighbors,test_rmse,r^2
66,67,9.151579,0.115026
67,68,9.151620,0.114770
65,66,9.151727,0.115392
63,64,9.151919,0.116245
62,63,9.151957,0.116778
68,69,9.151961,0.114252
61,62,9.151965,0.117143
69,70,9.152620,0.113973
39,40,9.152756,0.134626
64,65,9.152921,0.115726


In [32]:
dist_mse = []
dist_r2 = []
max_n = 100
for n in range(1, max_n):
    print(n)
    knn = KNeighborsRegressor(n_neighbors=n, n_jobs=-1, weights='distance')
    tmp_mse = np.sqrt(-1*cross_val_score(knn, features, targets, scoring='neg_mean_squared_error').mean())
    dist_mse.append(tmp_mse)
    knn.fit(features, targets)
    tmp_r2 = knn.score(features, targets)
    dist_r2.append(tmp_r2)
    
err = pd.DataFrame()
err['num_neighbors'] = list(range(1,max_n))
err['test_rmse'] = dist_mse
err['r^2'] = dist_r2
err.sort_values('test_rmse')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


Unnamed: 0,num_neighbors,test_rmse,r^2
61,62,9.107438,0.999914
62,63,9.107650,0.999914
63,64,9.107689,0.999914
65,66,9.107711,0.999914
66,67,9.107766,0.999914
67,68,9.107830,0.999914
39,40,9.108141,0.999914
49,50,9.108150,0.999914
68,69,9.108188,0.999914
60,61,9.108411,0.999914


In [70]:
dist_sq_mse = []
dist_sq_r2 = []
max_n = 100
step = 5
for n in range(1, max_n, step):
    print(n)
    knn = KNeighborsRegressor(n_neighbors=n, weights=lambda x: 1.0/np.power(x+0.0001, 2))
    tmp_mse = np.sqrt(-1*cross_val_score(knn, features, targets, scoring='neg_mean_squared_error').mean())
    dist_sq_mse.append(tmp_mse)
    knn.fit(features, targets)
    tmp_r2 = knn.score(features, targets)
    dist_sq_r2.append(tmp_r2)
    
err = pd.DataFrame()
err['num_neighbors'] = list(range(1,max_n, step))
err['test_rmse'] = dist_sq_mse
err['r^2'] = dist_sq_r2
err.sort_values('test_rmse')

1
6
11
16
21
26
31
36
41
46
51
56
61
66
71
76
81
86
91
96


Unnamed: 0,num_neighbors,test_rmse,r^2
14,71,9.079061,0.999914
17,86,9.079363,0.999914
16,81,9.079411,0.999914
13,66,9.079896,0.999914
15,76,9.080299,0.999914
19,96,9.080321,0.999914
18,91,9.080447,0.999914
12,61,9.082759,0.999914
11,56,9.087751,0.999914
10,51,9.089681,0.999914


In [98]:
from itertools import combinations
feature_names = ['power', 'frequency', 'drift', 'distance', 'azimuth', 'band', 'rx_lat', 'rx_long', 'tx_lat', 'tx_long', 'day', 'hour']
best = pd.DataFrame(columns=['features', 'n_neighbors', 'test_rmse', 'r^2'])
pos = 0
for feat_num in range(1, 2):#len(feature_names)):
    for combo in combinations(feature_names, feat_num):
        max_n = 100
        step = 10
        tmp_mse = []
        tmp_r2 = []
        tmp_features = features[list(combo)]
        for n in range(1, max_n, step):
            knn = KNeighborsRegressor(n_neighbors=n, weights=lambda x: 1.0/np.power(x+0.0001, 2))
            tmp_mse.append(np.sqrt(-1*cross_val_score(knn, tmp_features, targets, scoring='neg_mean_squared_error').mean()))
            knn.fit(tmp_features, targets)
            tmp_r2.append(knn.score(tmp_features, targets))
        nparr = np.array(tmp_mse)
        maxidx = nparr.argmin()
        min_mse = nparr[maxidx]
        min_r2 = np.array(tmp_r2)[maxidx]
        min_n = list(range(1, max_n, step))[maxidx]
        best.loc[pos] = [combo, min_n, min_mse, min_r2]
        if pos<100 or pos % 100 == 0:
            print(combo, min_n, min_mse, min_r2)
        pos += 1
best.sort_values('test_rmse')

('power',) 91 9.478758085680653 0.02741525783678922
('frequency',) 71 9.432895514861354 0.056196173809713756
('drift',) 91 9.5498503933035 -0.011419753212440709
('distance',) 91 10.137755615401218 0.29568356556785036
('azimuth',) 61 9.68183069853553 0.03515747213593334
('band',) 71 9.522317362260113 0.000682746196057793
('rx_lat',) 91 9.58042232017845 0.09110042190079104
('rx_long',) 91 9.595534064159391 0.09294348562114818
('tx_lat',) 81 9.431848087194066 0.11088077779498273
('tx_long',) 91 9.487960764938306 0.11973300472404824
('day',) 91 9.603247334400981 -0.007707235760880726
('hour',) 91 9.58935608572616 -0.007857877167433136


Unnamed: 0,features,n_neighbors,test_rmse,r^2
8,"(tx_lat,)",81,9.431848,0.110881
1,"(frequency,)",71,9.432896,0.056196
0,"(power,)",91,9.478758,0.027415
9,"(tx_long,)",91,9.487961,0.119733
5,"(band,)",71,9.522317,0.000683
2,"(drift,)",91,9.54985,-0.01142
6,"(rx_lat,)",91,9.580422,0.0911
11,"(hour,)",91,9.589356,-0.007858
7,"(rx_long,)",91,9.595534,0.092943
10,"(day,)",91,9.603247,-0.007707


In [97]:
dist_sq_mse = []
dist_sq_r2 = []
max_n = 100
step = 1

select_feats = features[['tx_lat', 'frequency', 'power', 'tx_long']]
for n in range(1, max_n, step):
    print(n)
    knn = KNeighborsRegressor(n_neighbors=n, weights=lambda x: 1.0/np.power(x+0.0001, 2))
    tmp_mse = np.sqrt(-1*cross_val_score(knn, select_feats, targets, scoring='neg_mean_squared_error').mean())
    dist_sq_mse.append(tmp_mse)
    knn.fit(select_feats, targets)
    tmp_r2 = knn.score(select_feats, targets)
    dist_sq_r2.append(tmp_r2)
    
err = pd.DataFrame()
err['num_neighbors'] = list(range(1,max_n, step))
err['test_rmse'] = dist_sq_mse
err['r^2'] = dist_sq_r2
err.sort_values('test_rmse')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


Unnamed: 0,num_neighbors,test_rmse,r^2
98,99,9.629329,0.316352
97,98,9.629613,0.316854
96,97,9.629653,0.316551
95,96,9.629745,0.316752
94,95,9.629940,0.316803
93,94,9.630201,0.316680
92,93,9.630679,0.316902
59,60,9.630701,0.319606
90,91,9.630919,0.317003
87,88,9.630921,0.317176


In [101]:
dist_sq_mse = []
dist_sq_r2 = []
max_n = 100
step = 1

select_feats = features[['tx_lat', 'tx_long', 'distance', 'frequency']]
for n in range(1, max_n, step):
    print(n)
    knn = KNeighborsRegressor(n_neighbors=n, weights='distance')#lambda x: 1.0/np.power(x+0.0001, 2))
    tmp_mse = np.sqrt(-1*cross_val_score(knn, select_feats, targets, scoring='neg_mean_squared_error').mean())
    dist_sq_mse.append(tmp_mse)
    knn.fit(select_feats, targets)
    tmp_r2 = knn.score(select_feats, targets)
    dist_sq_r2.append(tmp_r2)
    
err = pd.DataFrame()
err['num_neighbors'] = list(range(1,max_n, step))
err['test_rmse'] = dist_sq_mse
err['r^2'] = dist_sq_r2
err.sort_values('test_rmse')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


Unnamed: 0,num_neighbors,test_rmse,r^2
98,99,9.227409,0.972175
97,98,9.227534,0.972175
96,97,9.227830,0.972175
95,96,9.228389,0.972175
94,95,9.228795,0.972175
92,93,9.229775,0.972175
93,94,9.229815,0.972175
91,92,9.230507,0.972175
90,91,9.230880,0.972175
88,89,9.230889,0.972175


In [104]:
dist_sq_mse = []
dist_sq_r2 = []
max_n = 100
step = 1

select_feats = features[['tx_lat', 'tx_long', 'distance', 'frequency']]
log_targets = np.exp(targets)
for n in range(1, max_n, step):
    print(n)
    knn = KNeighborsRegressor(n_neighbors=n, weights='distance')#lambda x: 1.0/np.power(x+0.0001, 2))
    tmp_mse = np.sqrt(-1*cross_val_score(knn, select_feats, log_targets, scoring='neg_mean_squared_error').mean())
    dist_sq_mse.append(tmp_mse)
    knn.fit(select_feats, log_targets)
    tmp_r2 = knn.score(select_feats, log_targets)
    dist_sq_r2.append(tmp_r2)
    
err = pd.DataFrame()
err['num_neighbors'] = list(range(1,max_n, step))
err['test_rmse'] = dist_sq_mse
err['r^2'] = dist_sq_r2
err.sort_values('test_rmse')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


Unnamed: 0,num_neighbors,test_rmse,r^2
0,1,3.834376e+16,1.0
98,99,4.190542e+16,1.0
97,98,4.190923e+16,1.0
96,97,4.191313e+16,1.0
95,96,4.191586e+16,1.0
94,95,4.191978e+16,1.0
93,94,4.192388e+16,1.0
92,93,4.192683e+16,1.0
91,92,4.193075e+16,1.0
90,91,4.193509e+16,1.0
