In [41]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from radio_snr import *
%matplotlib inline

In [42]:
# Load the WSPR spot sample from a CSV (path relative to the working
# directory) and preview the first three rows.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns in the preview look
# like index columns written out by earlier to_csv calls — confirm.
df = pd.read_csv('wspr_sample.csv')
df.head(3)

Unnamed: 0.1,Unnamed: 0,spot,timestamp,reporter,reporter_grid,snr,frequency,tx_call,tx_grid,power,drift,distance,azimuth,band,version,code
0,4541493,1093722523,1520245320,DC5AL-R,JO31lk,9,7.040113,G0NJS,IO91vs,37,0,496,91,7,,0
1,4692526,1093866340,1520259000,KA3JIJ,EM84cj,-27,10.140175,WB0KSL,EM28nu,37,0,1105,113,10,,0
2,17907988,1107180087,1521372240,PI9ESA,JO22ff,-5,10.140179,IQ6KX,JN63so,20,0,1172,328,10,,0


In [43]:
# Run the raw spots through preprocess_data, which is not defined in this
# notebook (presumably it comes from the `from radio_snr import *`
# star-import — confirm).
# Judging by the displayed frames, the result no longer has the timestamp,
# reporter_grid and tx_grid columns and gains rx_lat, rx_long, tx_lat,
# tx_long, day and hour.
# NOTE(review): `df` is rebound to the processed frame, so re-running this
# cell feeds already-processed data back into preprocess_data — verify that
# is safe, or re-run the read cell first.
df = preprocess_data(df)
df.head()

Unnamed: 0.1,Unnamed: 0,spot,reporter,snr,frequency,tx_call,power,drift,distance,azimuth,band,version,code,rx_lat,rx_long,tx_lat,tx_long,day,hour
0,4541493,1093722523,DC5AL-R,9,7.040113,G0NJS,37,0,496,91,7,,0,51.4375,6.958333,51.770833,-0.208333,17595,10
1,4692526,1093866340,KA3JIJ,-27,10.140175,WB0KSL,37,0,1105,113,10,,0,34.395833,-83.791667,38.854167,-94.875,17595,14
2,17907988,1107180087,PI9ESA,-5,10.140179,IQ6KX,20,0,1172,328,10,,0,52.229167,4.458333,43.604167,13.541667,17608,11
3,10270295,1099476317,PA0EHG,-17,3.594176,PA7MDJ,23,0,33,350,3,,0,52.0625,4.625,51.770833,4.708333,17601,0
4,29854808,1119235300,AG5OV,-10,7.040137,WA4KFZ,37,0,2236,249,7,1.9.0-rc3,0,29.0,-100.0,38.895833,-77.458333,17621,11


In [44]:
# Pick the numeric predictor columns and z-score each of them
# (subtract the column mean, then divide by the column standard deviation).
# NOTE(review): the cross-validation cells further down rebuild `features`
# from the raw `df` columns, so this standardized frame is only displayed
# here and is not what the model is evaluated on.
features = df.loc[:, [
    'frequency', 'drift', 'power', 'distance', 'azimuth', 'band',
    'rx_lat', 'rx_long', 'tx_lat', 'tx_long', 'day', 'hour',
]]
features = features.sub(features.mean()).div(features.std())
features.head()

Unnamed: 0,frequency,drift,power,distance,azimuth,band,rx_lat,rx_long,tx_lat,tx_long,day,hour
0,-0.07377,0.187161,0.970548,-0.577759,-0.735826,-0.058661,0.463378,0.564921,0.550277,0.422731,-1.169538,-0.341381
1,0.20201,0.187161,0.970548,-0.315188,-0.536315,0.206899,-0.611041,-1.075748,-0.360096,-1.297914,-1.169538,0.263284
2,0.20201,0.187161,-1.376072,-0.286301,1.413448,0.206899,0.51329,0.519724,-0.025314,0.672648,0.273569,-0.190215
3,-0.380319,0.187161,-0.961962,-0.777381,1.612959,-0.41274,0.502782,0.522737,0.550277,0.512095,-0.503488,-1.853044
4,-0.073768,0.187161,0.970548,0.172443,0.697024,-0.058661,-0.951229,-1.368778,-0.35716,-0.981352,1.716676,-0.190215


In [45]:
# Regression target: the `snr` column of the preprocessed frame.
target = df['snr']
target.head()

0     9
1   -27
2    -5
3   -17
4   -10
Name: snr, dtype: int64

In [46]:
# Linear regression estimator with scikit-learn's default settings; this one
# object is reused by the cross-validation and fitting cells below.
lr = LinearRegression()

In [47]:
# Repeated 10-fold cross-validation of the linear model, scored by MSE.
# With an integer `cv`, cross_val_score cuts a regressor's data into
# consecutive, unshuffled folds, so the rows are re-shuffled before every
# repetition to obtain a different partition each time.
n = 25
err = np.zeros(n)
for k in range(n):
    # Seed the shuffle with the repetition index: the n partitions still
    # differ from one another, but a Restart & Run All reproduces exactly
    # the same numbers. sample() already returns a new frame, so no copy
    # is needed.
    df1 = df.sample(frac=1, replace=False, random_state=k)
    features = df1[['frequency', 'drift', 'power', 'distance', 'azimuth', 'band', 'rx_lat', 'rx_long', 'tx_lat', 'tx_long', 'day', 'hour']]
    target = df1.snr
    # The scorer reports negated MSE (higher is better), hence the sign flip;
    # err[k] is the mean MSE over the 10 folds of repetition k.
    err[k] = -cross_val_score(lr, features, target, cv=10, scoring='neg_mean_squared_error').mean()

In [48]:
# Square root of the MSE averaged over all repetitions, i.e. the overall
# cross-validated RMSE.
np.sqrt(np.mean(err))

9.121773449269025

In [49]:
# Repeated 10-fold cross-validation of the linear model, scored by R^2.
# With an integer `cv`, cross_val_score cuts a regressor's data into
# consecutive, unshuffled folds, so the rows are re-shuffled before every
# repetition to obtain a different partition each time.
n = 25
r2 = np.zeros(n)
for k in range(n):
    # Seed the shuffle with the repetition index so the n partitions differ
    # from one another yet are identical on every re-run of the notebook.
    # sample() already returns a new frame, so no copy is needed.
    df1 = df.sample(frac=1, replace=False, random_state=k)
    features = df1[['frequency', 'drift', 'power', 'distance', 'azimuth', 'band', 'rx_lat', 'rx_long', 'tx_lat', 'tx_long', 'day', 'hour']]
    target = df1.snr
    # r2[k] is the mean R^2 over the 10 folds of repetition k.
    r2[k] = cross_val_score(lr, features, target, cv=10, scoring='r2').mean()

In [50]:
# Cross-validated R^2 averaged over all repetitions.
np.mean(r2)

0.08574488719520551

RMSE: 9.121524610184592

R^2: 0.085661044374066

In [51]:
# Fit the final model on the complete dataset.
# The predictors and target are taken from `df` explicitly rather than from
# whatever `features` / `target` happen to hold after the last loop iteration
# of an earlier cell; that keeps X and y row-aligned regardless of which
# cells were (re-)run before this one.
lr.fit(
    df[['frequency', 'drift', 'power', 'distance', 'azimuth', 'band', 'rx_lat', 'rx_long', 'tx_lat', 'tx_long', 'day', 'hour']],
    df.snr,
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [52]:
# In-sample R^2: the fitted model is scored on the full dataset, the same
# rows it was trained on, so this is an optimistic figure compared with the
# cross-validated R^2 above.
# The data is read from `df` explicitly instead of relying on the
# `features` / `target` variables left over from an earlier loop.
lr.score(
    df[['frequency', 'drift', 'power', 'distance', 'azimuth', 'band', 'rx_lat', 'rx_long', 'tx_lat', 'tx_long', 'day', 'hour']],
    df.snr,
)

0.08792473022430003