In [11]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from radio_snr import *
%matplotlib inline

In [2]:
# Load the WSPR spot sample from disk and preview its first three rows.
wspr_csv_path = 'wspr_sample.csv'
df = pd.read_csv(wspr_csv_path)
df.head(3)

Unnamed: 0.1,Unnamed: 0,spot,timestamp,reporter,reporter_grid,snr,frequency,tx_call,tx_grid,power,drift,distance,azimuth,band,version,code
0,4541493,1093722523,1520245320,DC5AL-R,JO31lk,9,7.040113,G0NJS,IO91vs,37,0,496,91,7,,0
1,4692526,1093866340,1520259000,KA3JIJ,EM84cj,-27,10.140175,WB0KSL,EM28nu,37,0,1105,113,10,,0
2,17907988,1107180087,1521372240,PI9ESA,JO22ff,-5,10.140179,IQ6KX,JN63so,20,0,1172,328,10,,0


In [3]:
# Run the project's preprocessing helper (star-imported from radio_snr) on the raw frame.
# Judging by the displayed output, it replaces timestamp / reporter_grid / tx_grid with
# rx_lat, rx_long, tx_lat, tx_long, day and hour columns.
# NOTE(review): `df` is overwritten with its own preprocessed version, so this cell is not
# idempotent -- running it a second time feeds already-preprocessed data back into
# preprocess_data. Re-run the load cell first if this cell has to be re-executed.
df = preprocess_data(df)
df.head()

Unnamed: 0.1,Unnamed: 0,spot,reporter,snr,frequency,tx_call,power,drift,distance,azimuth,band,version,code,rx_lat,rx_long,tx_lat,tx_long,day,hour
0,4541493,1093722523,DC5AL-R,9,7.040113,G0NJS,37,0,496,91,7,,0,51.4375,6.958333,51.770833,-0.208333,17595,10
1,4692526,1093866340,KA3JIJ,-27,10.140175,WB0KSL,37,0,1105,113,10,,0,34.395833,-83.791667,38.854167,-94.875,17595,14
2,17907988,1107180087,PI9ESA,-5,10.140179,IQ6KX,20,0,1172,328,10,,0,52.229167,4.458333,43.604167,13.541667,17608,11
3,10270295,1099476317,PA0EHG,-17,3.594176,PA7MDJ,23,0,33,350,3,,0,52.0625,4.625,51.770833,4.708333,17601,0
4,29854808,1119235300,AG5OV,-10,7.040137,WA4KFZ,37,0,2236,249,7,1.9.0-rc3,0,29.0,-100.0,38.895833,-77.458333,17621,11


In [4]:
# Hold out 20% of the rows as a test set.
# random_state pins the shuffle so that the same rows land in train/test on every
# Restart & Run All; without it the split (and everything derived from it) changes per run.
train, test = train_test_split(df, train_size=0.8, test_size=0.2, random_state=42)

In [5]:
# Predictor columns used by the model; `snr` is the target and is deliberately excluded.
feature_columns = [
    'frequency', 'drift', 'power', 'distance', 'azimuth', 'band',
    'rx_lat', 'rx_long', 'tx_lat', 'tx_long', 'day', 'hour',
]
# Apply the identical column selection to both halves of the split.
features_train = train[feature_columns]
features_test = test[feature_columns]
features_train.head()

Unnamed: 0,frequency,drift,power,distance,azimuth,band,rx_lat,rx_long,tx_lat,tx_long,day,hour
403,5.366212,1,23,437,249,5,49.520833,8.208333,51.0625,13.875,17597,15
8416,5.366263,0,30,13513,183,5,-70.6875,-8.291667,50.729167,-1.291667,17614,22
8186,7.040106,1,30,1666,216,7,43.6875,3.791667,56.604167,15.791667,17598,0
14343,7.040055,0,37,2389,290,7,44.104167,-103.291667,40.0625,-74.708333,17607,12
1072,10.140267,0,23,1321,261,10,42.0,-84.0,45.0,-68.0,17608,21


In [7]:
# Predictor matrix built from the full (unsplit) preprocessed frame.
features = df[[
    'frequency', 'drift', 'power', 'distance', 'azimuth', 'band',
    'rx_lat', 'rx_long', 'tx_lat', 'tx_long', 'day', 'hour',
]]
features.head()

Unnamed: 0,frequency,drift,power,distance,azimuth,band,rx_lat,rx_long,tx_lat,tx_long,day,hour
0,7.040113,0,37,496,91,7,51.4375,6.958333,51.770833,-0.208333,17595,10
1,10.140175,0,37,1105,113,10,34.395833,-83.791667,38.854167,-94.875,17595,14
2,10.140179,0,20,1172,328,10,52.229167,4.458333,43.604167,13.541667,17608,11
3,3.594176,0,23,33,350,3,52.0625,4.625,51.770833,4.708333,17601,0
4,7.040137,0,37,2236,249,7,29.0,-100.0,38.895833,-77.458333,17621,11


In [8]:
# Regression target: the reported signal-to-noise ratio of each spot.
target = df['snr']
target.head()

0     9
1   -27
2    -5
3   -17
4   -10
Name: snr, dtype: int64

In [9]:
# Linear regression estimator with default settings; this single instance is the
# estimator handed to cross_val_score in the evaluation loops below.
lr = LinearRegression()

In [22]:
# Repeated 10-fold cross-validation of the linear model, scored by mean squared error.
# The frame is reshuffled before every repetition so that each repetition sees a
# different fold assignment (an integer `cv` on a regressor yields unshuffled K-fold
# splits according to the scikit-learn documentation).
n = 25
err = np.zeros(n)
feature_columns = [
    'frequency', 'drift', 'power', 'distance', 'azimuth', 'band',
    'rx_lat', 'rx_long', 'tx_lat', 'tx_long', 'day', 'hour',
]
for k in range(n):
    # Seed the shuffle with the repetition index: the n shuffles still differ from one
    # another, but a fresh-kernel re-run reproduces exactly the same folds and score.
    # DataFrame.sample already returns a new object, so no extra .copy() is needed.
    df1 = df.sample(frac=1, replace=False, random_state=k)
    features = df1[feature_columns]
    target = df1.snr
    # The scorer returns *negative* MSE (higher is better); negate to get the MSE itself.
    err[k] = -cross_val_score(lr, features, target, cv=10, scoring='neg_mean_squared_error').mean()

In [24]:
# RMSE: square root of the MSE averaged over all cross-validation repetitions.
np.sqrt(np.mean(err))

9.121524610184592

In [27]:
# Repeated 10-fold cross-validation of the linear model, scored by R^2.
# The frame is reshuffled before every repetition so that each repetition sees a
# different fold assignment (an integer `cv` on a regressor yields unshuffled K-fold
# splits according to the scikit-learn documentation).
n = 25
r2 = np.zeros(n)
feature_columns = [
    'frequency', 'drift', 'power', 'distance', 'azimuth', 'band',
    'rx_lat', 'rx_long', 'tx_lat', 'tx_long', 'day', 'hour',
]
for k in range(n):
    # Seed the shuffle with the repetition index: the n shuffles still differ from one
    # another, but a fresh-kernel re-run reproduces exactly the same folds and score.
    # DataFrame.sample already returns a new object, so no extra .copy() is needed.
    df1 = df.sample(frac=1, replace=False, random_state=k)
    features = df1[feature_columns]
    target = df1.snr
    # Mean R^2 across the 10 folds of this repetition.
    r2[k] = cross_val_score(lr, features, target, cv=10, scoring='r2').mean()

In [28]:
# Average R^2 over all cross-validation repetitions.
np.mean(r2)

0.085661044374066

RMSE: 9.121524610184592

R^2: 0.085661044374066