In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
csv_cols = [
    'spot',
    'timestamp',
    'reporter',
    'reporter_grid',
    'snr',
    'frequency',
    'tx_call',
    'tx_grid',
    'power',
    'drift',
    'distance',
    'azimuth',
    'band',
    'version',
    'code',
]
df = pd.read_csv('wspr_sample.csv', index_col=0)
# df = df.set_index('timestamp')
df.sort_values('timestamp').head()

Unnamed: 0,spot,timestamp,reporter,reporter_grid,snr,frequency,tx_call,tx_grid,power,drift,distance,azimuth,band,version,code
271,1089115381,1519862880,OE5AFP,JN78dh,-23,7.040001,F4TTR,JN18oj,33,0,818,86,7,1.2 Kiwi,0
1154,1089117275,1519863000,PI4THT,JO32kf,-13,1.8381,DL4XJ,JO42un,33,0,196,260,1,0.6,0
4127,1089124779,1519863240,DL0HT,JO43jb,4,3.594112,F5VNE,JN33,43,0,1074,7,3,,0
4018,1089123959,1519863240,DL1GCD/2,JN48ar,-9,7.040155,EA4URA,IN80ci,20,0,1319,41,7,,0
6278,1089130072,1519863480,DL1KAI,JO42vj,-3,1.838193,DK1BN/P,JO30tn,40,0,253,36,1,1.8.0,0


In [3]:
def get_long(ls):
    # Do you belive in magic?
    res = 20.0 * float(ord(ls[0]) - ord('a')) + 2.0 * float(int(ls[1])) - 180
    if len(ls) >= 3:
        res += (5.0/60.0) * float(ord(ls[2]) - ord('a') + 0.5)
    return res

def get_lat(ls):
    # More magic
    res = 10.0*(ord(ls[0])-ord('a'))+int(ls[1])-90
    if len(ls) >= 3:
        res += (1.0/60.0)*2.5*(ord(ls[2])-ord('a')+0.5)
    return res

def grid_to_coordinates(sq):
    if len(sq) == 6 or len(sq) == 4:
        long, lat= sq[::2].lower(), sq[1::2].lower()
        return (get_lat(lat), get_long(long))
    return None

In [4]:
r_coords = df['reporter_grid'].apply(grid_to_coordinates).apply(pd.Series)
df['rx_lat'] = r_coords.iloc[:,0]
df['rx_long'] = r_coords.iloc[:,1]

In [5]:
r_coords = df['tx_grid'].apply(grid_to_coordinates).apply(pd.Series)
df['tx_lat'] = r_coords.iloc[:,0]
df['tx_long'] = r_coords.iloc[:,1]

In [6]:
df = df.drop(['reporter_grid', 'tx_grid'], axis=1)

In [7]:
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')

In [8]:
df['day'] = (df['timestamp'] - pd.Timestamp(0)).dt.days

In [9]:
df['hour'] = df['timestamp'].dt.hour

In [10]:
df = df.drop(['timestamp'], axis=1)
df.head()

Unnamed: 0,spot,reporter,snr,frequency,tx_call,power,drift,distance,azimuth,band,version,code,rx_lat,rx_long,tx_lat,tx_long,day,hour
4541493,1093722523,DC5AL-R,9,7.040113,G0NJS,37,0,496,91,7,,0,51.4375,6.958333,51.770833,-0.208333,17595,10
4692526,1093866340,KA3JIJ,-27,10.140175,WB0KSL,37,0,1105,113,10,,0,34.395833,-83.791667,38.854167,-94.875,17595,14
17907988,1107180087,PI9ESA,-5,10.140179,IQ6KX,20,0,1172,328,10,,0,52.229167,4.458333,43.604167,13.541667,17608,11
10270295,1099476317,PA0EHG,-17,3.594176,PA7MDJ,23,0,33,350,3,,0,52.0625,4.625,51.770833,4.708333,17601,0
29854808,1119235300,AG5OV,-10,7.040137,WA4KFZ,37,0,2236,249,7,1.9.0-rc3,0,29.0,-100.0,38.895833,-77.458333,17621,11


In [11]:
# Combine all this processing into one function
def preprocess_data(df):
    r_coords = df['reporter_grid'].apply(grid_to_coordinates).apply(pd.Series)
    df['rx_lat'] = r_coords.iloc[:,0]
    df['rx_long'] = r_coords.iloc[:,1]
    t_coords = df['tx_grid'].apply(grid_to_coordinates).apply(pd.Series)
    df['tx_lat'] = t_coords.iloc[:,0]
    df['tx_long'] = t_coords.iloc[:,1]
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
    df['day'] = (df['timestamp'] - pd.Timestamp(0)).dt.days
    df['hour'] = df['timestamp'].dt.hour
    df = df.drop(['reporter_grid', 'tx_grid', 'timestamp'], axis=1)
    return df

In [12]:
test_df = pd.read_csv('wspr_sample.csv', index_col=0)
test_df = preprocess_data(test_df)
test_df.head()

Unnamed: 0,spot,reporter,snr,frequency,tx_call,power,drift,distance,azimuth,band,version,code,rx_lat,rx_long,tx_lat,tx_long,day,hour
4541493,1093722523,DC5AL-R,9,7.040113,G0NJS,37,0,496,91,7,,0,51.4375,6.958333,51.770833,-0.208333,17595,10
4692526,1093866340,KA3JIJ,-27,10.140175,WB0KSL,37,0,1105,113,10,,0,34.395833,-83.791667,38.854167,-94.875,17595,14
17907988,1107180087,PI9ESA,-5,10.140179,IQ6KX,20,0,1172,328,10,,0,52.229167,4.458333,43.604167,13.541667,17608,11
10270295,1099476317,PA0EHG,-17,3.594176,PA7MDJ,23,0,33,350,3,,0,52.0625,4.625,51.770833,4.708333,17601,0
29854808,1119235300,AG5OV,-10,7.040137,WA4KFZ,37,0,2236,249,7,1.9.0-rc3,0,29.0,-100.0,38.895833,-77.458333,17621,11


In [13]:
mean_snr = test_df['snr'].mean()
mean_snr

-14.984384226756774

In [21]:
# find a baseline RMSE
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [24]:
train_err = []
test_err = []
for i in range(100):
    (train, test) = train_test_split(test_df, train_size=0.8, test_size=0.2)
    train_target = test_df['snr']
    train_MSE = mean_squared_error(train_target, np.full(train_target.shape, train_target.mean()))
    test_target = test_df['snr']
    test_MSE = mean_squared_error(test_target, np.full(test_target.shape, test_target.mean()))
    train_err.append(train_MSE)
    test_err.append(test_MSE)
err = pd.DataFrame()
err['train_MSE'] = train_err
err['test_MSE'] = test_MSE
print("Baseline Train MSE: %f" % err['train_MSE'].mean())
print("Baseline Test MSE: %f" % err['test_MSE'].mean())

Baseline Train MSE: 91.067534
Baseline Test MSE: 91.067534
