In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from radio_snr import *
%matplotlib inline

In [2]:
# Load the spot sample from 'wspr_sample.csv' (the path is relative to the
# notebook's working directory) and preview the first three rows.
df = pd.read_csv('wspr_sample.csv')
df.head(3)

Unnamed: 0.1,Unnamed: 0,spot,timestamp,reporter,reporter_grid,snr,frequency,tx_call,tx_grid,power,drift,distance,azimuth,band,version,code
0,4541493,1093722523,1520245320,DC5AL-R,JO31lk,9,7.040113,G0NJS,IO91vs,37,0,496,91,7,,0
1,4692526,1093866340,1520259000,KA3JIJ,EM84cj,-27,10.140175,WB0KSL,EM28nu,37,0,1105,113,10,,0
2,17907988,1107180087,1521372240,PI9ESA,JO22ff,-5,10.140179,IQ6KX,JN63so,20,0,1172,328,10,,0


In [3]:
# Run the raw frame through preprocess_data, which is not defined in this
# notebook (presumably supplied by the star import from radio_snr -- TODO confirm).
# NOTE(review): `df` is rebound to the processed result, so re-running this cell
# without first re-reading the CSV passes already-processed data back in.
df = preprocess_data(df)
df.head()

Unnamed: 0.1,Unnamed: 0,spot,reporter,snr,frequency,tx_call,power,drift,distance,azimuth,band,version,code,rx_lat,rx_long,tx_lat,tx_long,day,hour
0,4541493,1093722523,DC5AL-R,9,7.040113,G0NJS,37,0,496,91,7,,0,51.4375,6.958333,51.770833,-0.208333,17595,10
1,4692526,1093866340,KA3JIJ,-27,10.140175,WB0KSL,37,0,1105,113,10,,0,34.395833,-83.791667,38.854167,-94.875,17595,14
2,17907988,1107180087,PI9ESA,-5,10.140179,IQ6KX,20,0,1172,328,10,,0,52.229167,4.458333,43.604167,13.541667,17608,11
3,10270295,1099476317,PA0EHG,-17,3.594176,PA7MDJ,23,0,33,350,3,,0,52.0625,4.625,51.770833,4.708333,17601,0
4,29854808,1119235300,AG5OV,-10,7.040137,WA4KFZ,37,0,2236,249,7,1.9.0-rc3,0,29.0,-100.0,38.895833,-77.458333,17621,11


In [5]:
def Polynomial(feature_names,target_name,df,num_splits, degree):
    """Score a polynomial linear regression with 10-fold cross-validation.

    The selected feature columns are expanded with ``PolynomialFeatures`` up
    to ``degree`` and a ``LinearRegression`` without a separate intercept is
    evaluated with ``cross_val_score(cv=10)``.

    Parameters
    ----------
    feature_names : str or sequence of column labels
        Predictor column(s) of ``df``. A bare string is treated as a single
        column name.
    target_name : column label or sequence of column labels
        Passed unchanged to ``df[...]`` to select the regression target.
    df : pandas.DataFrame
        Frame holding both the feature and the target columns.
    num_splits : int
        Kept only for backward compatibility and no longer used. The previous
        implementation re-ran the exact same cross-validation ``num_splits``
        times and averaged the results; with an integer ``cv`` the folds are
        not shuffled, so every repetition produced identical scores.
        (A value below 1 used to yield NaN; it is now simply ignored.)
    degree : int
        Maximum polynomial degree handed to ``PolynomialFeatures``.

    Returns
    -------
    tuple
        ``(rmse, r2)`` where ``rmse`` is the square root of the mean per-fold
        MSE and ``r2`` is the mean per-fold R^2.
    """
    # len('frequency') is 9, not 1: a string must be wrapped explicitly so that
    # df[...] below always yields a 2-D selection, whatever the name's length.
    if isinstance(feature_names, str):
        feature_names = [feature_names]
    features = df[list(feature_names)]
    target = df[target_name]

    poly = PolynomialFeatures(degree,interaction_only = False)
    # The expanded matrix is used as-is. The former DataFrame wrapper only
    # attached column labels (through get_feature_names, which newer
    # scikit-learn releases no longer provide) and had no effect on the scores.
    features_engineered = poly.fit_transform(features)

    # No separate intercept: the polynomial expansion is expected to supply the
    # constant term itself (bias column is on by default in PolynomialFeatures).
    lr = LinearRegression(fit_intercept=False)
    mse_per_fold = -cross_val_score(lr, features_engineered, target, cv=10, scoring='neg_mean_squared_error')
    r2_per_fold = cross_val_score(lr, features_engineered, target, cv=10, scoring='r2')
    return (np.sqrt(mse_per_fold.mean()), r2_per_fold.mean())

In [6]:
# Predictors and target are selected by name. The target used to be picked by
# position (column 3), which silently depends on which columns happen to have
# been dropped earlier; in the frame displayed after preprocessing, position 3
# is the string column 'reporter', not the SNR.
feature_cols = ['frequency', 'drift', 'power', 'distance', 'azimuth', 'band',
                'rx_lat', 'rx_long', 'tx_lat', 'tx_long', 'day', 'hour']
target_cols = ['snr']

# Polynomial degrees to compare; slot k of each result array holds degree k+1.
degrees = range(1, 4)
rmse = np.zeros(len(degrees))
r2_test = np.zeros(len(degrees))
for slot, i in enumerate(degrees):
    print('i: ',i)
    (rmse[slot],r2_test[slot]) = Polynomial(feature_cols, target_cols, df, 10, i)

i:  1
i:  2
i:  3


In [7]:
# Cross-validated RMSE per polynomial degree; position k holds degree k+1.
rmse

array([ 9.12188006,  9.06341906, 41.60757546])

In [9]:
# Mean cross-validated R^2 per polynomial degree; position k holds degree k+1.
r2_test

array([  0.0854291 ,   0.09694456, -17.8426143 ])

Best Degree: 2

Best RMSE: 9.06341906

Best R^2: 0.09694456