# Leave-One-Device-Out and Publish

## Imports

In [1]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from geopy.distance import vincenty
from sklearn.ensemble import ExtraTreesRegressor
warnings.filterwarnings('ignore')

  from numpy.core.umath_tests import inner1d


In [2]:
# Directory holding the RSSI CSV exports. Set the RSSI_DATA_DIR environment
# variable to run the notebook on another machine; the default is the path
# the notebook was originally written against.
DATA_DIR = Path(os.environ.get('RSSI_DATA_DIR', '/Users/maelfabien/Desktop/LocalDB/RSSI'))

df_train = pd.read_csv(DATA_DIR / 'df_mess_train_3.csv')
df_test = pd.read_csv(DATA_DIR / 'df_mess_test_3.csv')

In [3]:
# 'Unnamed: 0' is presumably the row index that was saved into the CSV files.
# errors='ignore' keeps this cell idempotent: re-running it once the column is
# already gone no longer raises KeyError.
df_train = df_train.drop(columns=['Unnamed: 0'], errors='ignore')
df_test = df_test.drop(columns=['Unnamed: 0'], errors='ignore')

## Error Functions

In [4]:
def vincenty_vec(vec_coord):
    """Return the Vincenty distance, in meters, between the two points of each row.

    Parameters
    ----------
    vec_coord : array-like of shape (n, 4)
        Columns 0:2 of a row are passed to ``vincenty`` as the first point and
        columns 2:4 as the second point.

    Returns
    -------
    list
        One distance in meters per row (empty list for zero rows).

    Raises
    ------
    ValueError
        If ``vec_coord`` is not 2-D with exactly 4 columns.
    """
    # NOTE(review): recent geopy releases appear to have dropped `vincenty` in
    # favour of `geodesic` -- confirm against the installed geopy version.
    vec_coord = np.asarray(vec_coord)
    if vec_coord.ndim != 2 or vec_coord.shape[1] != 4:
        # Fail loudly: returning zeros here would be read downstream as a
        # perfect (zero-error) prediction.
        raise ValueError('ERROR: Bad number of columns (shall be = 4)')
    return [vincenty(row[0:2], row[2:]).meters for row in vec_coord]

In [5]:
# evaluate distance error for each predicted point
def Eval_geoloc(y_train_lat, y_train_lng, y_pred_lat, y_pred_lng):
    """Return the distance error between each true and predicted position.

    Parameters
    ----------
    y_train_lat, y_train_lng : array-like of length n
        True latitudes and longitudes.
    y_pred_lat, y_pred_lng : array-like of length n
        Predicted latitudes and longitudes.

    Returns
    -------
    The value returned by ``vincenty_vec`` for the n rows
    ``(true lat, true lng, predicted lat, predicted lng)``.

    Notes
    -----
    Predicted latitudes are clamped to [-90, 90] before the distances are
    computed. The clamp is applied to a copy, so the caller's prediction
    array is left unchanged.
    """
    true_lat = np.asarray(y_train_lat, dtype=float)
    true_lng = np.asarray(y_train_lng, dtype=float)
    # np.clip returns a new array; the caller's predictions are not modified.
    pred_lat = np.clip(np.asarray(y_pred_lat, dtype=float), -90, 90)
    pred_lng = np.asarray(y_pred_lng, dtype=float)

    # One row per point: (true lat, true lng, predicted lat, predicted lng).
    vec_coord = np.column_stack([true_lat, true_lng, pred_lat, pred_lng])
    err_vec = vincenty_vec(vec_coord)

    return err_vec

## Leave-One-Device-Out Cross Validation

In [6]:
# Distinct device ids, in order of first appearance in the training set.
pd.unique(df_train['did'])

array([ 473335.,  473953.,  476512.,  476286.,  473438.,  476185.,
        476285.,  476314.,  476306.,  476317.,  476320.,  473502.,
        476318.,  476197.,  476312.,  476316.,  476251.,  476308.,
        473683.,  473796.,  476323.,  476329.,  476321.,  476324.,
        476332.,  476322.,  473864.,  473512.,  473805.,  476327.,
        476325.,  476515.,  476505.,  476517.,  476212.,  476503.,
        476507.,  476611.,  476610.,  476606.,  476604.,  476607.,
        476602.,  476609.,  476600.,  476598.,  476615.,  476521.,
        476525.,  476523.,  476830.,  474181.,  476828.,  474126.,
        476826.,  476853.,  476315.,  474176.,  476852.,  476987.,
        476307.,  476868.,  476833.,  476861.,  473897.,  476276.,
        473902.,  476275.,  476888.,  476891.,  476274.,  476280.,
        476257.,  476210.,  476231.,  474192.,  476225.,  476256.,
        476161.,  476228.,  476835.,  473368.,  473892.,  476883.,
        476889.,  476884.,  476887.,  473899.,  476885.,  4762

Independent models (latitude and longitude are predicted separately):

In [None]:
error_independent = []

# Fixed seed: ExtraTrees is randomized, so without it the per-device errors
# (and the comparison between strategies) change on every run.
RANDOM_STATE = 42
# Identifier and target columns that must not be used as features.
NON_FEATURE_COLS = ['messid', 'lat', 'lng', 'did']

for device in df_train['did'].unique():

    # Leave-one-device-out split: train on every other device, evaluate on
    # the messages of the held-out device.
    df_train_lov = df_train[df_train.did != device]
    df_test_lov = df_train[df_train.did == device]

    X_train = df_train_lov.drop(columns=NON_FEATURE_COLS)
    y_lat_train = df_train_lov['lat']
    y_lng_train = df_train_lov['lng']

    X_test = df_test_lov.drop(columns=NON_FEATURE_COLS)
    y_lat_test = df_test_lov['lat']
    y_lng_test = df_test_lov['lng']

    # Longitude and latitude are modelled independently from the same features.
    clf_lng = ExtraTreesRegressor(n_estimators=10, random_state=RANDOM_STATE)
    clf_lng.fit(X_train, y_lng_train)
    pred_lng = clf_lng.predict(X_test)

    clf_lat = ExtraTreesRegressor(n_estimators=10, random_state=RANDOM_STATE)
    clf_lat.fit(X_train, y_lat_train)
    pred_lat = clf_lat.predict(X_test)

    # Score of the fold: 80th percentile of the per-message distance errors.
    err_vec = Eval_geoloc(y_lat_test, y_lng_test, pred_lat, pred_lng)
    err = np.percentile(err_vec, 80)
    error_independent.append(err)

    print(err)

Dependent models, predicting the longitude first and using it as an extra feature for the latitude:

In [None]:
error_lng = []

# Fixed seed: ExtraTrees is randomized, so without it the per-device errors
# (and the comparison between strategies) change on every run.
RANDOM_STATE = 42
# Identifier and target columns that must not be used as features.
NON_FEATURE_COLS = ['messid', 'lat', 'lng', 'did']

for device in df_train['did'].unique():

    # Leave-one-device-out split: train on every other device, evaluate on
    # the messages of the held-out device.
    df_train_lov = df_train[df_train.did != device]
    df_test_lov = df_train[df_train.did == device]

    X_train = df_train_lov.drop(columns=NON_FEATURE_COLS)
    y_lat_train = df_train_lov['lat']
    y_lng_train = df_train_lov['lng']

    X_test = df_test_lov.drop(columns=NON_FEATURE_COLS)
    y_lat_test = df_test_lov['lat']
    y_lng_test = df_test_lov['lng']

    # Stage 1: longitude from the raw features.
    clf_lng = ExtraTreesRegressor(n_estimators=10, random_state=RANDOM_STATE)
    clf_lng.fit(X_train, y_lng_train)
    pred_lng = clf_lng.predict(X_test)

    # Stage 2: latitude from the raw features plus the longitude. The model is
    # fitted on the true longitude and queried with the stage-1 prediction.
    # The extra column is named 'lng' in both frames so the feature names seen
    # at fit time and at predict time are identical.
    clf_lat = ExtraTreesRegressor(n_estimators=10, random_state=RANDOM_STATE)
    new_X_train = X_train.assign(lng=y_lng_train)

    clf_lat.fit(new_X_train, y_lat_train)
    new_X = X_test.assign(lng=pred_lng)
    pred_lat = clf_lat.predict(new_X)

    # Score of the fold: 80th percentile of the per-message distance errors.
    err_vec = Eval_geoloc(y_lat_test, y_lng_test, pred_lat, pred_lng)
    err = np.percentile(err_vec, 80)
    error_lng.append(err)

    print(err)

Dependent models, predicting the latitude first and using it as an extra feature for the longitude:

In [None]:
error_lat = []

# Fixed seed: ExtraTrees is randomized, so without it the per-device errors
# (and the comparison between strategies) change on every run.
RANDOM_STATE = 42
# Identifier and target columns that must not be used as features.
NON_FEATURE_COLS = ['messid', 'lat', 'lng', 'did']

for device in df_train['did'].unique():

    # Leave-one-device-out split: train on every other device, evaluate on
    # the messages of the held-out device.
    df_train_lov = df_train[df_train.did != device]
    df_test_lov = df_train[df_train.did == device]

    X_train = df_train_lov.drop(columns=NON_FEATURE_COLS)
    y_lat_train = df_train_lov['lat']
    y_lng_train = df_train_lov['lng']

    X_test = df_test_lov.drop(columns=NON_FEATURE_COLS)
    y_lat_test = df_test_lov['lat']
    y_lng_test = df_test_lov['lng']

    # Stage 1: latitude from the raw features.
    clf_lat = ExtraTreesRegressor(n_estimators=10, random_state=RANDOM_STATE)
    clf_lat.fit(X_train, y_lat_train)
    pred_lat = clf_lat.predict(X_test)

    # Stage 2: longitude from the raw features plus the latitude. The model is
    # fitted on the true latitude and queried with the stage-1 prediction.
    # The extra column is named 'lat' in both frames so the feature names seen
    # at fit time and at predict time are identical.
    clf_lng = ExtraTreesRegressor(n_estimators=10, random_state=RANDOM_STATE)
    new_X_train = X_train.assign(lat=y_lat_train)

    clf_lng.fit(new_X_train, y_lng_train)
    new_X = X_test.assign(lat=pred_lat)

    pred_lng = clf_lng.predict(new_X)

    # Score of the fold: 80th percentile of the per-message distance errors.
    err_vec = Eval_geoloc(y_lat_test, y_lng_test, pred_lat, pred_lng)
    err = np.percentile(err_vec, 80)
    error_lat.append(err)

    print(err)

In [None]:
# For each strategy, report the mean over held-out devices of the per-device
# 80th-percentile error. (Taking the median of that single mean value would
# return the same number, so only the mean is computed.)
for label, fold_errors in (
    ("Independant : ", error_independent),
    ("Longitude First : ", error_lng),
    ("Latitude First : ", error_lat),
):
    mean_error = np.array(fold_errors).mean()
    print(label + str(mean_error))

The best model appears to be the dependent model that estimates the longitude first and then the latitude.