# Aircraft Localization Competition
This notebook compiles all the algorithms used for the competition.
Some functions use multiprocessing, which has only been tested on Linux.

In [None]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from tqdm import tqdm
import numpy as np
import plotly.express as px
import itertools
import multiprocessing as mp
import plotly.graph_objects as go
import geopy
from plotly.subplots import make_subplots
from pandas.io.json import json_normalize
import json
from itertools import combinations
import geopy.distance
import pickle
from scipy.optimize import fsolve, root
from scipy.stats import iqr
import warnings
from scipy.interpolate import UnivariateSpline

warnings.filterwarnings("ignore")

# Data Loading

In [None]:
# Folder that contains the competition CSV files.
path = 'OSN/competition/'
training = pd.read_csv(path+'round1_competition.csv')
sensors = pd.read_csv(path+'sensors.csv')
validation = pd.read_csv(path+'round1_sample_empty.csv')
# The rows to predict are those whose id is listed in the sample file.
# They are selected before the dropna() below is applied to the training set.
testing = training.loc[training.id.isin(validation.id)]
# Every row with at least one missing value is removed from the training set.
training = training.dropna()

# Data Transformation

## Json Expansion
Here the goal is to split the measurements into multiple rows.

In [None]:
def expand_measurements(df):
    """Expand the JSON column ``measurements`` into one row per measurement.

    Each cell of ``measurements`` is a JSON string holding a list of
    3-element records. Every record becomes one output row whose first three
    fields are renamed ``sensor``, ``timestamp`` and ``power``; all the other
    columns of the source row are repeated on each of its measurement rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``measurements`` column and string-named other columns.

    Returns
    -------
    pandas.DataFrame
        Expanded frame. The ``index`` column (created by ``reset_index``)
        holds the position of the measurement inside its message. An empty
        input gives an empty frame with the same columns.
    """
    json_col = 'measurements'
    frames = []
    for _, row in df.iterrows():
        # json.loads + DataFrame replaces pd.read_json(<literal string>),
        # which recent pandas versions deprecate.
        measurements = pd.DataFrame(json.loads(row[json_col]))
        frames.append(measurements.assign(**row.drop(json_col)))

    if not frames:
        # pd.concat([]) raises, so build the empty result explicitly.
        other_cols = [col for col in df.columns if col != json_col]
        return pd.DataFrame(columns=['index', 'sensor', 'timestamp', 'power'] + other_cols)

    df = pd.concat(frames).reset_index().rename(columns={0: 'sensor', 1: 'timestamp', 2: 'power'})

    return df

def expand_measurements_para(df):
    """Run ``expand_measurements`` on *df* with a pool of worker processes.

    The frame is cut into consecutive row slices, each slice is expanded in a
    worker, and the per-slice results are concatenated in order.
    """
    n_workers = mp.cpu_count()
    # The +1 keeps the number of slices at or below the number of workers.
    rows_per_slice = int(len(df) // n_workers + 1)

    slices = []
    for start in range(0, len(df), rows_per_slice):
        slices.append(df.iloc[start:start + rows_per_slice])

    with mp.Pool(n_workers) as workers:
        expanded = workers.map(expand_measurements, slices)
        workers.close()
        workers.join()
    return pd.concat(expanded)



In [None]:
try:
    df_testing = pd.read_pickle('df_testing_clean.pkl')
except Exception:
    # Cache missing or unreadable: rebuild it and store it for the next run.
    # `Exception` (not a bare except) so that Ctrl-C still interrupts the cell.
    df_testing = expand_measurements_para(testing)
    df_testing.to_pickle('df_testing_clean.pkl')
df_testing.head()

In [None]:
try:
    df_training = pd.read_pickle('df_training_clean.pkl')
except Exception:
    # Cache missing or unreadable: rebuild it and store it for the next run.
    # `Exception` (not a bare except) so that Ctrl-C still interrupts the cell.
    df_training = expand_measurements_para(training)
    df_training.to_pickle('df_training_clean.pkl')

df_training.head()

#### We only keep the sensors seen in the test dataset

In [None]:
# Drop the training measurements coming from sensors that never appear in df_testing.
df_training = df_training.loc[df_training.sensor.isin(df_testing.sensor.values)]

## Get one row per pair of measurement to obtain the delta_time used in multilateration
Here we need to obtain all the combinations of pairs of sensors available for one measurement and compute the difference in time of the received messages

In [None]:
def get_pairs_dt_n_power(df):
    """Build one row per pair of sensors that received the same message.

    For every ``id`` group, all 2-combinations of the ``sensor`` column are
    generated and each pair is ordered so that ``s0 <= s1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Expanded measurements with the columns ``id``, ``sensor``,
        ``timestamp``, ``power``, ``timeAtServer``, ``latitude``,
        ``longitude``, ``baroAltitude`` and ``geoAltitude``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``s0``, ``s1``, ``dt_obs`` (timestamp of ``s0`` minus
        timestamp of ``s1``), ``p0``, ``p1``, ``timeAtServer`` (taken from the
        ``s1`` row) and the position columns taken from the first row of the
        group.
    """
    rows_list = []
    for ids, group in df.groupby('id'):
        # Pull the columns out once per group instead of filtering the group
        # with a boolean mask four times for every pair.
        sensor_ids = group.sensor.values
        timestamps = group.timestamp.values
        powers = group.power.values
        server_times = group.timeAtServer.values

        # Position of the first row of each sensor; if a sensor is listed
        # several times only its first row is used.
        first_row = {}
        for position, sensor in enumerate(sensor_ids):
            first_row.setdefault(sensor, position)

        # These values do not depend on the pair.
        lat = group.latitude.values[0]
        lon = group.longitude.values[0]
        baro = group.baroAltitude.values[0]
        geo = group.geoAltitude.values[0]

        for pair in combinations(group.sensor, 2):
            s0, s1 = sorted(pair)
            i0, i1 = first_row[s0], first_row[s1]
            rows_list.append({'id': ids, 's0': s0, 's1': s1,
                              'dt_obs': timestamps[i0] - timestamps[i1],
                              'p0': powers[i0], 'p1': powers[i1],
                              'timeAtServer': server_times[i1],
                              'latitude': lat, 'longitude': lon,
                              'baroAltitude': baro, 'geoAltitude': geo})
    return pd.DataFrame(rows_list)

def get_pairs_dt_n_power_para(df):
    """Run ``get_pairs_dt_n_power`` on *df* with a pool of worker processes.

    The rows are split by ``id`` so that all the measurements of one message
    stay in the same chunk; the per-chunk results are concatenated in order.
    """
    n_workers = mp.cpu_count()
    per_id_frames = [frame for _, frame in df.groupby('id')]
    # The +1 keeps the number of chunks at or below the number of workers.
    ids_per_chunk = int(len(per_id_frames) // n_workers + 1)

    chunks = []
    for start in range(0, len(per_id_frames), ids_per_chunk):
        chunks.append(pd.concat(per_id_frames[start:start + ids_per_chunk]))

    with mp.Pool(n_workers) as workers:
        paired = workers.map(get_pairs_dt_n_power, chunks)
        workers.close()
        workers.join()
    return pd.concat(paired)

In [None]:
try:
    X_test = pd.read_pickle('X_test_clean.pkl')
except Exception:
    # Cache missing or unreadable: rebuild it and store it for the next run.
    # `Exception` (not a bare except) so that Ctrl-C still interrupts the cell.
    X_test = get_pairs_dt_n_power_para(df_testing)
    X_test.to_pickle('X_test_clean.pkl')
X_test.head()

In [None]:
try:
    X_train = pd.read_pickle('X_train_clean.pkl')
except Exception:
    # Cache missing or unreadable: rebuild it and store it for the next run.
    # `Exception` (not a bare except) so that Ctrl-C still interrupts the cell.
    X_train = get_pairs_dt_n_power_para(df_training)
    X_train.to_pickle('X_train_clean.pkl')
X_train.head()

# Compute the theoretical dt for the training dataset

In [None]:
def get_distance_diff(s0, s1, ac_pos):
    """Return (distance sensor 0 -> aircraft) minus (distance sensor 1 -> aircraft).

    ``s0`` and ``s1`` are read through ``.values[0]`` and ``ac_pos`` through
    ``.values``; each must yield a 3-element record whose first two entries
    are passed to geopy as a point and whose third entry is the height.
    The horizontal part is the geopy distance in metres; the height
    difference is combined with it as the other side of a right triangle
    (heights are presumably in metres as well -- confirm against the data).
    """
    first_sensor = tuple(s0.values[0])
    second_sensor = tuple(s1.values[0])
    aircraft = tuple(ac_pos.values)

    def slant_range(sensor):
        ground_m = geopy.distance.distance(sensor[:2], aircraft[:2]).m
        height_gap = sensor[2] - aircraft[2]
        return np.sqrt(ground_m**2 + height_gap**2)

    return slant_range(first_sensor) - slant_range(second_sensor)

def get_dt_calc(df):
    """Add the theoretical time difference ``dt_calc`` to every sensor pair.

    For each row, the two sensors ``s0`` and ``s1`` are looked up by serial
    in the module-level ``sensors`` frame, the difference of their distances
    to the aircraft is computed with ``get_distance_diff`` and divided by the
    transmission speed.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair rows with ``s0``, ``s1``, ``latitude``, ``longitude`` and
        ``geoAltitude``.

    Returns
    -------
    pandas.DataFrame
        The same object, with the ``dt_calc`` column added in place.
    """
    c = 0.2995 # Speed of transmission in m/ns
    position_cols = ['latitude', 'longitude', 'height']
    dt_calcs = []
    for _, row in df.iterrows():
        # The original tuple-unpacking of these frames only bound the column
        # names to unused, mislabelled locals; the frames are all that is needed.
        s0 = sensors.loc[sensors.serial==row.s0][position_cols]
        s1 = sensors.loc[sensors.serial==row.s1][position_cols]
        ac_pos = row[['latitude' , 'longitude', 'geoAltitude']]
        doffset_m = get_distance_diff(s0, s1, ac_pos)
        dt_calcs.append(doffset_m/c)
    df['dt_calc'] = dt_calcs
    return df

def get_dt_calc_para(df, sens):
    """Run ``get_dt_calc`` on *df* with a pool of worker processes.

    *sens* is stored in the module-level ``sensors`` name, which is where
    ``get_dt_calc`` reads the sensor positions from. The frame is cut into
    consecutive row slices and the per-slice results are concatenated.
    """
    global sensors
    sensors = sens

    n_workers = mp.cpu_count()
    # The +1 keeps the number of slices at or below the number of workers.
    rows_per_slice = int(len(df) // n_workers + 1)

    slices = []
    for start in range(0, len(df), rows_per_slice):
        slices.append(df.iloc[start:start + rows_per_slice])

    with mp.Pool(n_workers) as workers:
        computed = workers.map(get_dt_calc, slices)
        workers.close()
        workers.join()
    return pd.concat(computed)

In [None]:
try:
    X_train_calc = pd.read_pickle('X_train_calc_clean.pkl')
except Exception:
    # Cache missing or unreadable: rebuild it and store it for the next run.
    # `Exception` (not a bare except) so that Ctrl-C still interrupts the cell.
    X_train_calc = get_dt_calc_para(X_train, sensors)
    X_train_calc.to_pickle('X_train_calc_clean.pkl')
# From here on X_train carries the dt_calc column.
X_train = X_train_calc
X_train.head()


In [None]:
# Give X_train a fresh 0..n-1 index (the previous one is kept as a column):
# the offset-correction loop below addresses rows through group.index.
X_train = X_train.reset_index()

# Comparison between theory and observations for each pair of sensors and offset correction
Since the sensor clocks are supposed to be synchronized, it is surprising to see what looks like an offset for some pairs of sensors. We correct it by subtracting, for each pair, the median difference between the observed and the calculated dt.

In [None]:
X_train['dt_corrected'] = np.nan
X_test['dt_corrected'] = np.nan
for s, group in X_train.groupby(['s0','s1']):
    # Per-pair clock offset: median of (observed - theoretical) on the training rows.
    diffs = group.dt_obs-group.dt_calc
    offset = np.median(diffs)
    print(s, np.quantile(diffs, 0.1), offset, np.quantile(diffs, 0.9))
    X_train.loc[group.index, 'dt_corrected'] = (group.dt_obs - offset).values
    # Same quantiles after the correction, to check that the median is now ~0.
    diffs2 = X_train.loc[group.index].dt_corrected-group.dt_calc
    print(s, np.quantile(diffs2, 0.1), np.median(diffs2), np.quantile(diffs2, 0.9))
    # The offset learned on the training rows is applied to the same pair in the test set.
    # `.values` makes the assignment positional, as for X_train above, so it does not
    # depend on index alignment (X_test's index repeats across concatenated chunks).
    test_rows = (X_test.s0 == s[0]) & (X_test.s1 == s[1])
    X_test.loc[test_rows, 'dt_corrected'] = (X_test.loc[test_rows, 'dt_obs'] - offset).values

Let's have a look at the difference between the corrected and the calculated dt:

In [None]:
# Residual between the theoretical and the corrected dt, one value per training pair row.
diffs = X_train.dt_calc - X_train.dt_corrected
# Only a 1% random sample is plotted.
px.histogram(diffs.sample(frac =.01))

As we can see there are some HUGE outliers. We remove them by keeping only the rows where the absolute difference is below 1000 ns (1000 ns corresponds to a distance difference of about 300 m).

In [None]:
# Keep only the training rows whose residual (from the previous cell) is below 1000 in absolute value.
X_train_filtered = X_train.loc[np.abs(diffs)<1000]
# Residuals of the kept rows; only a 1% random sample is plotted.
diffs = X_train_filtered.dt_calc - X_train_filtered.dt_corrected
px.histogram(diffs.sample(frac =.01))

### Now that it is filtered, we can apply the same offset removal again
In case the first median was still influenced by outliers

In [None]:
X_test['dt_corrected2'] = np.nan
X_train['dt_corrected2'] = np.nan
# X_train_filtered was extracted before the column above was added to X_train,
# so it gets its own: the column then exists even if no pair is matched below.
X_train_filtered['dt_corrected2'] = np.nan
grouped = X_test.groupby(['s0','s1'])
pairs = [g[0] for g in list(grouped)]
# Then we compute the difference for each pair present in the test set
for s0, s1 in pairs:
    pair = (s0, s1)
    train_rows = (X_train_filtered.s0==s0) & (X_train_filtered.s1==s1)
    X_train_pairs = X_train_filtered.loc[train_rows]
    if len(X_train_pairs)<1:
        # No filtered training row for this pair: dt_corrected2 stays NaN for it.
        print('missed pair:', pair)
        continue
    # Remaining per-pair offset, estimated on the outlier-free training rows.
    diffs = X_train_pairs['dt_calc']-X_train_pairs['dt_corrected']
    offset = np.median(diffs)
    print('Pair:', (s0, s1), 'quantiles:', np.quantile(diffs, [0.1, 0.5, 0.9]), np.mean(diffs))
    X_train_filtered.loc[train_rows, 'dt_corrected2'] = (X_train_pairs['dt_corrected'] + offset).values
    # `.values` makes the assignment positional (X_test's index repeats across chunks).
    test_rows = (X_test.s0==s0) & (X_test.s1==s1)
    X_test.loc[test_rows, 'dt_corrected2'] = (X_test.loc[test_rows, 'dt_corrected'] + offset).values


In [None]:
# Residual after the second correction; only a 5% random sample is plotted.
diffs = X_train_filtered.dt_calc - X_train_filtered.dt_corrected2
px.histogram(diffs.sample(frac =.05))

In [None]:
# Save the filtered, twice-corrected training pairs to disk.
X_train_filtered.to_pickle('X_train_filtered.pkl')

Bingo, this looks nice now! The problem is that some pairs of sensors have been completely ruled out due to the filtering; for those pairs we fall back to the first correction.

In [None]:
# Test rows that received no second-pass value fall back to the first-pass correction.
X_test['dt_corrected2'] = X_test['dt_corrected2'].fillna(X_test['dt_corrected'])

In [None]:
X_train_filtered.head()

In [None]:
features = ['s0', 's1', 'latitude', 'longitude', 'timeAtServer', 'p0', 'p1', 'baroAltitude', 'dt_obs', 'dt_corrected','dt_corrected2']
X_training = X_train_filtered[features].dropna().copy()
X_training["s0"] = X_training["s0"].astype('category')
X_training["s1"] = X_training["s1"].astype('category')
# Take the target for exactly the rows kept in X_training. Running a separate
# dropna() over all the columns could drop a different set of rows and leave
# Y and X_training with different lengths.
Y = X_train_filtered.loc[X_training.index, 'dt_calc']

In [None]:
# RMSE between the theoretical dt (Y) and the first-pass corrected dt.
# The previous label said "baro/geo", which belongs to the altitude check further down.
print('RMSE dt_calc/dt_corrected:', np.sqrt(mean_squared_error(Y, X_training.dt_corrected)))

# GeoAltitude Estimation
We will use a light gradient boosting regression to predict the geoAltitude. We use the half/half method: a first model is trained on 50% of the data and validated on the other 50%, then a second model is trained with the two halves swapped. At the end we combine the two predictions by taking their mean.

In [None]:
# Inputs of the altitude model; the target is the geometric altitude.
features = ['s0', 's1', 'timeAtServer', 'p0', 'p1', 'baroAltitude', 'dt_corrected2']

X_training = X_train_filtered[features].copy()
# The sensor ids are cast to pandas categories.
X_training["s0"] = X_training["s0"].astype('category')
X_training["s1"] = X_training["s1"].astype('category')
Y = X_train_filtered['geoAltitude']

# Same columns and same casts for the test pairs.
X_testing = X_test[features].copy()
X_testing["s0"] = X_testing["s0"].astype('category')
X_testing["s1"] = X_testing["s1"].astype('category')



What is the rmse between the geoAltitude and the barometric one:

In [None]:
# Baseline: error made when the barometric altitude is used directly as the geometric one.
print('RMSE baro/geo:', np.sqrt(mean_squared_error(Y, X_training.baroAltitude)))

In [None]:
# LightGBM settings for the altitude model.
# NOTE(review): learning_rate=1 is unusually large -- confirm it is intended.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'max_depth': 10,
    'num_leaves': 31,
    'learning_rate': 1,
    'verbose': 0, 
    'early_stopping_round': 50,
    }
# Upper bound on the number of boosting rounds passed to lgb.train.
n_estimators = 100000

# We train 2 models, each on one half of the data; their predictions are averaged at the end
x_train, x_valid, y_train, y_valid = train_test_split(X_training, Y, test_size=0.5, random_state=1)
d_train = lgb.Dataset(x_train, label=y_train)
d_valid = lgb.Dataset(x_valid, label=y_valid)
watchlist = [d_valid]
# NOTE(review): the `verbose_eval` argument is not accepted by recent LightGBM
# releases -- confirm against the installed version.
model1 = lgb.train(params, d_train, n_estimators, watchlist, verbose_eval=1)

# We swap the training and validation halves
x_train, x_valid = x_valid, x_train
y_train, y_valid = y_valid, y_train

# And we train the second model
d_train = lgb.Dataset(x_train, label=y_train)
d_valid = lgb.Dataset(x_valid, label=y_valid)
watchlist = [d_valid]
model2 = lgb.train(params, d_train, n_estimators, watchlist, verbose_eval=1)

In [None]:
# preds = (model1.predict(X_training)+model2.predict(X_training))/2
# X_train_filtered['geoAlt_pred'] = preds
# preds = X_train_filtered.join(X_train_filtered[['id', 'geoAlt_pred']].groupby('id').median(), on='id', rsuffix='_med')
# print('RMSE predict/geo:', np.sqrt(mean_squared_error(preds.geoAlt_pred_med, X_train_filtered.geoAlt_pred)))

In [None]:
# Per-pair prediction: mean of the two half-trained models.
X_test['geoAlt_pred'] = (model1.predict(X_testing) + model2.predict(X_testing)) / 2
# One altitude per message: median of the per-pair predictions sharing the same id,
# joined back on every pair row (column geoAlt_pred_med).
preds = X_test.join(X_test[['id', 'geoAlt_pred']].groupby('id').median(), on='id', rsuffix='_med')
X_test['geoAltitude'] = preds.geoAlt_pred_med.values
X_test.head()

# Estimating latitude / longitude using gradient boosting

In [None]:
# Inputs of the latitude model. For the test pairs, geoAltitude is the value
# predicted in the previous section.
features = ['s0', 's1', 'timeAtServer', 'p0', 'p1', 'geoAltitude', 'dt_corrected2']

X_training = X_train_filtered[features].copy()
# The sensor ids are cast to pandas categories.
X_training["s0"] = X_training["s0"].astype('category')
X_training["s1"] = X_training["s1"].astype('category')
Y = X_train_filtered['latitude']

# Same columns and same casts for the test pairs.
X_testing = X_test[features].copy()
X_testing["s0"] = X_testing["s0"].astype('category')
X_testing["s1"] = X_testing["s1"].astype('category')

In [None]:
# LightGBM settings for the latitude model (shallower trees and fewer rounds
# than the altitude model).
# NOTE(review): learning_rate=1 is unusually large -- confirm it is intended.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'max_depth': 5,
    'num_leaves': 31,
    'learning_rate': 1,
    'verbose': 0, 
    'early_stopping_round': 50,
    }
# Upper bound on the number of boosting rounds passed to lgb.train.
n_estimators = 2000

# We train 2 models, each on one half of the data; their predictions are averaged at the end
x_train, x_valid, y_train, y_valid = train_test_split(X_training, Y, test_size=0.5, random_state=1)
d_train = lgb.Dataset(x_train, label=y_train)
d_valid = lgb.Dataset(x_valid, label=y_valid)
watchlist = [d_valid]
# NOTE(review): the `verbose_eval` argument is not accepted by recent LightGBM
# releases -- confirm against the installed version.
model_lat1 = lgb.train(params, d_train, n_estimators, watchlist, verbose_eval=1)

# We swap the training and validation halves
x_train, x_valid = x_valid, x_train
y_train, y_valid = y_valid, y_train

# And we train the second model
d_train = lgb.Dataset(x_train, label=y_train)
d_valid = lgb.Dataset(x_valid, label=y_valid)
watchlist = [d_valid]
model_lat2 = lgb.train(params, d_train, n_estimators, watchlist, verbose_eval=1)

### We check the results on the training

In [None]:
# Mean of the two latitude models on all the training pairs
# (each row was part of the training half of one of the two models).
preds = (model_lat1.predict(X_training)+model_lat2.predict(X_training))/2
X_train_filtered['lat_pred'] = preds

In [None]:
# One value per message: median over all the pair rows sharing the same id.
prediction_lat = X_train_filtered[['id', 'latitude', 'lat_pred']].groupby('id').median()
# RMSE, in degrees of latitude, between the true and the predicted per-message latitude.
print('RMSE predict/geo:', np.sqrt(mean_squared_error(prediction_lat.latitude, prediction_lat.lat_pred)))

In [None]:
px.histogram(prediction_lat['lat_pred']-prediction_lat['latitude'])

It's far from being good, but it can be used as a first guess when solving the multilateration.

### We do the same for the longitude

In [None]:
# Same inputs as for the latitude model; only the target changes.
features = ['s0', 's1', 'timeAtServer', 'p0', 'p1', 'geoAltitude', 'dt_corrected2']

X_training = X_train_filtered[features].copy()
# The sensor ids are cast to pandas categories.
X_training["s0"] = X_training["s0"].astype('category')
X_training["s1"] = X_training["s1"].astype('category')
Y = X_train_filtered['longitude']

# Same columns and same casts for the test pairs.
X_testing = X_test[features].copy()
X_testing["s0"] = X_testing["s0"].astype('category')
X_testing["s1"] = X_testing["s1"].astype('category')

In [None]:
# LightGBM settings for the longitude model (same values as for the latitude model).
# NOTE(review): learning_rate=1 is unusually large -- confirm it is intended.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'max_depth': 5,
    'num_leaves': 31,
    'learning_rate': 1,
    'verbose': 0, 
    'early_stopping_round': 50,
    }
# Upper bound on the number of boosting rounds passed to lgb.train.
n_estimators = 2000

# We train 2 models, each on one half of the data; their predictions are averaged at the end
x_train, x_valid, y_train, y_valid = train_test_split(X_training, Y, test_size=0.5, random_state=1)
d_train = lgb.Dataset(x_train, label=y_train)
d_valid = lgb.Dataset(x_valid, label=y_valid)
watchlist = [d_valid]
# NOTE(review): the `verbose_eval` argument is not accepted by recent LightGBM
# releases -- confirm against the installed version.
model_lon1 = lgb.train(params, d_train, n_estimators, watchlist, verbose_eval=1)

# We swap the training and validation halves
x_train, x_valid = x_valid, x_train
y_train, y_valid = y_valid, y_train

# And we train the second model
d_train = lgb.Dataset(x_train, label=y_train)
d_valid = lgb.Dataset(x_valid, label=y_valid)
watchlist = [d_valid]
model_lon2 = lgb.train(params, d_train, n_estimators, watchlist, verbose_eval=1)

### We check the results on the training

In [None]:
# Mean of the two longitude models on all the training pairs
# (each row was part of the training half of one of the two models).
preds = (model_lon1.predict(X_training)+model_lon2.predict(X_training))/2
X_train_filtered['lon_pred'] = preds

In [None]:
# One value per message: median over all the pair rows sharing the same id.
prediction_lon = X_train_filtered[['id', 'longitude', 'lon_pred']].groupby('id').median()
# RMSE, in degrees of longitude, between the true and the predicted per-message longitude.
print('RMSE predict/geo:', np.sqrt(mean_squared_error(prediction_lon.longitude, prediction_lon.lon_pred)))

In [None]:
px.histogram(prediction_lon['lon_pred']-prediction_lon['longitude'])

## We predict the latitude and longitude on the testing set

In [None]:
# Per-pair predictions on the test set: mean of the two models of each coordinate.
# X_testing is the frame built in the longitude cell; both coordinates use the same feature list.
X_test['latitude'] = (model_lat1.predict(X_testing)+model_lat2.predict(X_testing))/2
X_test['longitude'] = (model_lon1.predict(X_testing)+model_lon2.predict(X_testing))/2
X_test.head()

In [None]:
# One position per message: median of the per-pair predictions sharing the same id.
prediction_lat_lon = X_test[['id', 'latitude', 'longitude']].groupby('id').median()
prediction_lat_lon.head()
# Look every message up by its id instead of pasting the values positionally:
# the groupby result is sorted by id and only contains the ids that produced at
# least one sensor pair, so it is not guaranteed to line up with `testing` row by row.
# Ids without a prediction get NaN.
testing['latitude'] = testing['id'].map(prediction_lat_lon['latitude']).values
testing['longitude'] = testing['id'].map(prediction_lat_lon['longitude']).values
testing.head()

# Multilateration

In this part we will apply multilateration to try to find aircraft positions.
The strategy is to compute a solution for each triplet of sensors that belong to the same measurement.
By doing so we can then see if all solutions are close or if some are outliers (in the modified z-score sense). Maybe we can identify faulty sensors this way.
Then we retrieve all the sensors which are not outliers in the sense of the modified z-score and we solve the multilateration equations with all of them at once.

#### To solve the equations, we need a first guess. We retrieve the LightGBM results and smooth them with a polynomial of order 9.

In [None]:
# First guess for the solver: the LightGBM positions, smoothed per aircraft.
df_guess = testing[['latitude', 'longitude', 'id']]
# NOTE(review): these two columns are created but never filled in this cell;
# the smoothed values are written into 'latitude' / 'longitude' below.
df_guess['lat_interp'] = np.nan
df_guess['lon_interp'] = np.nan
for ac, group in testing.groupby('aircraft'): 
    t = group.timeAtServer.values
    # Degree-9 polynomial of each coordinate as a function of timeAtServer, per aircraft.
    plon = np.poly1d(np.polyfit(t, group.longitude, 9))
    plat = np.poly1d(np.polyfit(t, group.latitude, 9))
    # The polynomials are evaluated at the same timestamps they were fitted on.
    new_lons = plon(t)
    new_lats = plat(t)
    df_guess.loc[group.index, 'latitude'] = new_lats
    df_guess.loc[group.index, 'longitude'] = new_lons
df_guess.head()
# apply_multilateration3 reads its initial guess from the global `best_old`.
best_old = df_guess

### Functions definitions for the multilateration

In [None]:
def mod_z(val):
    """Return the absolute modified z-score of every element of *val*.

    The score is ``0.7413 * |x - median| / MAD`` where MAD is the median of
    the absolute deviations from the median.

    Parameters
    ----------
    val : array-like of numbers (a plain list is accepted)

    Returns
    -------
    numpy.ndarray
        One non-negative score per input element; empty for an empty input.
        When the MAD is zero (more than half of the values are identical),
        values equal to the median score 0 and all the others score ``inf``,
        instead of the NaN / inf mix produced by a division by zero.
    """
    values = np.asarray(val, dtype=float)
    if values.size == 0:
        return values

    abs_dev = np.abs(values - np.median(values))
    med_abs_dev = np.median(abs_dev)
    if med_abs_dev == 0:
        # Any threshold test `score > t` gives the same answer as before:
        # 0 (previously NaN) is never above t, inf always is.
        return np.where(abs_dev == 0, 0.0, np.inf)
    # NOTE(review): the usual modified z-score constant is 0.6745; 0.7413 is
    # kept because the 3.5 threshold used by the caller was tuned with it.
    return 0.7413 * (abs_dev / med_abs_dev)


def equation0102(p, *data):
    """Residuals of the (s0, s1) and (s0, s2) time-difference equations.

    Parameters
    ----------
    p : pair (x, y)
        Candidate aircraft position, passed to geopy as a (latitude, longitude) point.
    *data : (group, triplet)
        ``group`` holds the pair rows of one measurement (``s0``, ``s1``,
        ``dt_corrected2``, ``geoAltitude``); ``triplet`` holds the three
        sensor serials ``(id0, id1, id2)``.

    Returns
    -------
    tuple of two floats
        ``(range0 - range1)/c - dt01`` and ``(range0 - range2)/c - dt02``,
        where ``range_i`` is the distance between sensor ``i`` and the
        candidate position. Both are zero at a solution.
    """
    group, triplet = data
    id0, id1, id2 = triplet
    c=0.2995 # Transmission speed
    x, y = p
    # Only the two pairs used below are looked up (the 1-2 pair is not needed here).
    dt01 = group.loc[(group.s0==id0) & (group.s1==id1)].dt_corrected2.values[0]
    dt02 = group.loc[(group.s0==id0) & (group.s1==id2)].dt_corrected2.values[0]

    ap_alt = group.geoAltitude.values[0]

    def slant_range(serial):
        # Distance between one sensor and the candidate position: geopy ground
        # distance combined with the altitude difference.
        sensor = sensors.loc[sensors.serial==serial]
        vert = ap_alt-sensor.height.values[0]
        ground = geopy.distance.distance(sensor[['latitude', 'longitude']].values, (x, y)).m
        return np.sqrt(ground**2 + vert**2)

    # Each range is computed once; the range to sensor 0 appears in both equations.
    range0 = slant_range(id0)
    range1 = slant_range(id1)
    range2 = slant_range(id2)
    return (
            (range0-range1)/c-dt01,
            (range0-range2)/c-dt02
    )

def equation0212(p, *data):
    """Residuals of the (s0, s2) and (s1, s2) time-difference equations.

    Parameters
    ----------
    p : pair (x, y)
        Candidate aircraft position, passed to geopy as a (latitude, longitude) point.
    *data : (group, triplet)
        ``group`` holds the pair rows of one measurement (``s0``, ``s1``,
        ``dt_corrected2``, ``geoAltitude``); ``triplet`` holds the three
        sensor serials ``(id0, id1, id2)``.

    Returns
    -------
    tuple of two floats
        ``(range0 - range2)/c - dt02`` and ``(range1 - range2)/c - dt12``,
        where ``range_i`` is the distance between sensor ``i`` and the
        candidate position. Both are zero at a solution.
    """
    group, triplet = data
    id0, id1, id2 = triplet
    c=0.2995 # Transmission speed
    x, y = p
    # Only the two pairs used below are looked up (the 0-1 pair is not needed here).
    dt02 = group.loc[(group.s0==id0) & (group.s1==id2)].dt_corrected2.values[0]
    dt12 = group.loc[(group.s0==id1) & (group.s1==id2)].dt_corrected2.values[0]

    ap_alt = group.geoAltitude.values[0]

    def slant_range(serial):
        # Distance between one sensor and the candidate position: geopy ground
        # distance combined with the altitude difference.
        sensor = sensors.loc[sensors.serial==serial]
        vert = ap_alt-sensor.height.values[0]
        ground = geopy.distance.distance(sensor[['latitude', 'longitude']].values, (x, y)).m
        return np.sqrt(ground**2 + vert**2)

    # Each range is computed once; the range to sensor 2 appears in both equations.
    range0 = slant_range(id0)
    range1 = slant_range(id1)
    range2 = slant_range(id2)
    return (
            (range0-range2)/c-dt02,
            (range1-range2)/c-dt12
    )

def equation_all_in(p, group):
    """Residuals of the time-difference equations of every sensor pair of *group*.

    Parameters
    ----------
    p : pair (x, y)
        Candidate aircraft position, passed to geopy as a (latitude, longitude) point.
    group : pandas.DataFrame
        Pair rows of one measurement (``s0``, ``s1``, ``dt_corrected2``,
        ``geoAltitude``). A row must exist for every pair of the sensors it
        mentions, otherwise an IndexError is raised.

    Returns
    -------
    list of floats
        One residual ``(range_a - range_b)/c - dt_ab`` per pair ``(a, b)`` of
        sensors with ``a < b``, in ``itertools.combinations`` order.
    """
    c = 0.2995 # Transmission speed
    x, y = p
    ap_alt = group.geoAltitude.values[0]
    list_sensors = sorted(set(group[['s0', 's1']].values.ravel())) # We retrieve all the sensors
    # The list is sorted, so every generated pair is already ordered (a < b)
    sensor_pairs = list(combinations(list_sensors, 2))
    # We retrieve the dt of each pair of sensors
    dts = [group.loc[(group.s0==a) & (group.s1==b)].dt_corrected2.values[0] for a, b in sensor_pairs]

    # Distance between each sensor and the candidate position, computed once per
    # sensor rather than once per pair the sensor belongs to.
    slant_range = {}
    for serial in list_sensors:
        sensor = sensors.loc[sensors.serial==serial]
        vert = ap_alt-sensor.height.values[0]
        ground = geopy.distance.distance(sensor[['latitude', 'longitude']].values, (x, y)).m
        slant_range[serial] = np.sqrt(ground**2 + vert**2)

    # One equation per pair
    return [(slant_range[a]-slant_range[b])/c-dt for (a, b), dt in zip(sensor_pairs, dts)]


def apply_multilateration3(list_ids):
    """Estimate the aircraft position of every measurement id in *list_ids*.

    For each id, the pair rows are read from the global ``X_test2`` and the
    initial guess from the global ``best_old``. Every triplet of sensors is
    solved twice with ``fsolve`` (``equation0102`` and ``equation0212``);
    solutions further than 5 degrees from the guess are discarded. The median
    of the kept solutions is then used as the starting point of two "all in"
    solves with ``equation_all_in``: one on the pairs whose sensors belong to
    triplets that are not outliers (modified z-score <= 3.5), one on all pairs.

    Parameters
    ----------
    list_ids : iterable of measurement ids

    Returns
    -------
    dict
        ``id -> (lats, lons, triplets, lat_all_in_filt, lon_all_in_filt,
        lat_all_in, lon_all_in)``. The first three are parallel lists of the
        triplet solutions; the last four are NaN when the corresponding solve
        failed. Ids with at most one pair row have no entry.
    """
    # If the difference with the guess is greater than 5deg we consider it's an outlier
    error_max = 5
    dico_res = {}
    for ids in tqdm(list_ids):
        group = X_test2.loc[X_test2.id == ids]

        # Knowing the location of 3 sensors and the dt between each pair, we can deduce the aircraft position
        # Note that with 3 sensors, there are 2 possibilities of solution, hence why we have 2 equations
        if len(group) <= 1:
            continue

        # We generate all the triplets of sensors for this measurement id
        sensor_ids = set(group.s0.tolist() + group.s1.tolist())
        triplet_sensors = [sorted(x) for x in itertools.combinations(sensor_ids, 3)]
        lats, lons, trip = [], [], []
        if triplet_sensors:
            # The best guess depends only on the id, so it is read once instead of once per triplet
            guess = tuple(best_old.loc[best_old.id==ids][['latitude', 'longitude']].values[0])
        for triplet in triplet_sensors:
            try:
                # We solve the aircraft position
                roots0102 = fsolve(equation0102, guess, args=(group, triplet))
                roots0212 = fsolve(equation0212, guess, args=(group, triplet))
            except Exception:
                # Best effort: a triplet that cannot be solved is skipped
                continue

            for solution in (roots0102, roots0212):
                if np.abs(guess[0]-solution[0])<error_max and np.abs(guess[1]-solution[1])<error_max:
                    lats.append(solution[0])
                    lons.append(solution[1])
                    trip.append(triplet)

        lat_filt, lon_filt = np.nan, np.nan
        lat_all, lon_all = np.nan, np.nan

        # Without any triplet solution there is no starting point for the all-in solves
        if lats:
            # Our new guess becomes the median of all the results we have obtained
            guess = np.median(lats), np.median(lons)

            try:
                # One row per triplet solution, with the z-scores of its latitude and longitude
                df = pd.DataFrame(np.column_stack((lats, lons, trip)), columns=['lats','lons', 's0', 's1', 's2'])
                df['mod_z_lat'] = mod_z(lats)
                df['mod_z_lon'] = mod_z(lons)
                # We filter out all the solutions which have a z score greater than 3.5
                sensors_filtered = df.loc[~((df.mod_z_lat>3.5) | (df.mod_z_lon>3.5))][['s0', 's1', 's2']].values.ravel()
                # Then we retrieve the pairs whose two sensors are in the filtered sensors
                group_filtered = group.loc[(group.s0.isin(sensors_filtered)) & (group.s1.isin(sensors_filtered))]

                # We solve the multilateration problem on the filtered group
                all_in_filt_sol = root(equation_all_in, guess, args=group_filtered, method='lm').x
                lat_filt, lon_filt = all_in_filt_sol[0], all_in_filt_sol[1]
            except Exception:
                # Best effort: the filtered solution stays NaN
                pass

            try:
                # Then we solve the multilateration problem on the unfiltered group.
                # The result used to be lost: storing it referenced an undefined
                # name and the resulting NameError was silently swallowed.
                all_in_sol = root(equation_all_in, guess, args=group, method='lm').x
                lat_all, lon_all = all_in_sol[0], all_in_sol[1]
            except Exception:
                # Best effort: the unfiltered solution stays NaN
                pass

        dico_res[ids] = (lats, lons, trip, lat_filt, lon_filt, lat_all, lon_all)

    return dico_res


In [None]:
try:
    with open('dico_res_full_clean_v2_allin.pickle', 'rb') as handle:
        dico_res = pickle.load(handle)
except Exception:
    # No usable cache: run the multilateration on all the test ids.
    # `Exception` (not a bare except) so that Ctrl-C still interrupts the cell.

    # apply_multilateration3 reads the pair rows from the module-level name
    # X_test2, so it has to exist before the pool is created (this relies on
    # the workers inheriting the parent's globals -- Linux only, as stated at
    # the top of the notebook).
    X_test2 = X_test.copy()

    list_ids = testing.id.values.tolist()
    num_processes = mp.cpu_count()

    # calculate the chunk size as an integer
    chunk_size = int((len(list_ids)//num_processes)+1)

    # Create the chunks
    chunks = [list_ids[i:i + chunk_size] for i in range(0, len(list_ids), chunk_size)]

    with mp.Pool(num_processes) as pool:
        list_dicos = pool.map(apply_multilateration3, chunks)
        pool.close()
        pool.join()
    # Merge the per-chunk dictionaries into one
    dico_res = {k: v for d in list_dicos for k, v in d.items()}

#     with open('dico_res_full_clean_v2_allin.pickle', 'wb') as handle:
#         pickle.dump(dico_res, handle, protocol=pickle.HIGHEST_PROTOCOL)


# Multilateration Result analysis

In [None]:
# One row per measurement id; the columns follow the order of the tuples stored in dico_res.
res = pd.DataFrame.from_dict(dico_res, orient='index', columns=['lats', 'lons', 'triplet', 'lat_all_in_filt',
                                                               'lon_all_in_filt', 'lat_all_in', 'lon_all_in'])
# Summary statistics of the per-triplet solutions of each measurement.
res['lat_med'] = res.lats.apply(lambda x: np.median(x))
res['lat_mean'] = res.lats.apply(lambda x: np.mean(x))
res['lon_med'] = res.lons.apply(lambda x: np.median(x))
res['lon_mean'] = res.lons.apply(lambda x: np.mean(x))
# Number of triplet solutions that were kept.
res['n_res'] = res.lons.apply(lambda x: len(x))
# Spread of the triplet solutions (interquartile range).
res['lats_iqr'] = res.lats.apply(lambda x: iqr(x))
res['lons_iqr'] = res.lons.apply(lambda x: iqr(x))

In [None]:
res.head(2)

## Filtering
Here we only keep the results where at least 9 triplet solutions were found (n_res > 8) and where the IQR of the latitudes is below a certain threshold.

In [None]:
# Strict lower bound on the number of triplet solutions (so at least 9 are required).
n_res_min = 8
# Upper bound on the IQR of the latitude solutions, in degrees.
lats_iqr_max = 0.0008151

# Only the latitude IQR is tested; lons_iqr is not used by this filter.
res = res.loc[(res.n_res>n_res_min) & (res.lats_iqr<lats_iqr_max)]
# Attach the results to the test rows by id and drop the rows without a result.
res_merged = testing.set_index('id').join(res).dropna(subset=['lat_med'])
res_merged.head(2)

Here we only keep the rows where the median of the triplet results is very close to the "all in" one.

In [None]:
# Keep the rows where the median triplet latitude and the filtered all-in latitude
# differ by less than 100e-6 degrees (only the latitude is compared).
res_merged = res_merged.loc[np.abs(res_merged.lat_med-res_merged.lat_all_in_filt)<100E-6]

## Filtering and smoothing at trajectory level

First we define utility functions. The first one splits trajectories into segments when there is no measurement for more than dt seconds. The second one checks that the speed of an airplane between 2 measurements is plausible.

In [None]:
from scipy.interpolate import pchip_interpolate
# NOTE(review): CubicSpline is not used by the cells visible in this notebook.
from scipy.interpolate import CubicSpline

# The LightGBM positions stored in `testing` are discarded here; only the
# positions written by the trajectory loop below remain, every other row stays NaN.
testing['latitude'] = np.nan
testing['longitude'] = np.nan

def split_times(t, dt=15):
    """Cut the time vector *t* into segments wherever a gap is not below *dt*.

    Returns a dict numbered from 1 whose values are the lists of timestamps of
    each segment. A timestamp that follows a gap acts as the separator and is
    not part of any segment.
    """
    # Gap between each timestamp and the previous one (0 for the first one)
    gaps = [0] + list(np.diff(t))

    segments = {}
    current = []
    for stamp, gap in zip(t, gaps):
        if gap < dt:
            current.append(stamp)
            continue
        # Separator: close the running segment, if there is one
        if current:
            segments[len(segments) + 1] = current
            current = []
    if current:
        segments[len(segments) + 1] = current
    return segments


def filter_speed(res_ac2, min_speed=50, max_speed=300):
    """Remove position estimates that imply an implausible ground speed.

    The ground speed between consecutive rows is computed from
    ``lat_all_in_filt``, ``lon_all_in_filt`` and ``timeAtServer``; degrees are
    converted to metres with the length of one degree at the median position.

    Parameters
    ----------
    res_ac2 : pandas.DataFrame
        Estimates of one aircraft, ordered by time.
    min_speed, max_speed : float
        Exclusive bounds of the accepted speed, in m/s. They are now applied
        on every pass (the former recursive call fell back to the defaults).

    Returns
    -------
    pandas.DataFrame
        The filtered rows. A frame with fewer than two rows is returned as is.

    Notes
    -----
    Each pass removes the first row of the first faulty step AND the last row
    of the frame (the mask has one element less than the frame). This is the
    historical behaviour and is kept on purpose: the downstream thresholds
    were tuned with it.
    """
    # Iterative form of the original recursion (no recursion-depth limit).
    while len(res_ac2) >= 2:
        lat_med = np.median(res_ac2.lat_all_in_filt)
        lon_med = np.median(res_ac2.lon_all_in_filt)
        # We convert a degree of lat and lon into meters
        delta_lon_deg = geopy.distance.distance((lat_med, lon_med), (lat_med, lon_med+1)).m
        delta_lat_deg = geopy.distance.distance((lat_med, lon_med), (lat_med+1, lon_med)).m

        elapsed = np.diff(res_ac2.timeAtServer)
        speeds_lat = np.diff(res_ac2.lat_all_in_filt)*delta_lat_deg / elapsed
        speeds_lon = np.diff(res_ac2.lon_all_in_filt)*delta_lon_deg / elapsed
        ground_speed = np.sqrt(speeds_lat**2 + speeds_lon**2)

        # We check that the speeds are within the limits
        speed_cond = (min_speed < ground_speed) & (ground_speed < max_speed)
        index_faulty = np.flatnonzero(~speed_cond)
        if index_faulty.size == 0:
            break

        # Only the first faulty step is handled on each pass
        keep = np.ones(len(speed_cond), dtype=bool)
        keep[index_faulty[0]] = False
        res_ac2 = res_ac2.iloc[:-1].loc[keep]

        # With a single faulty step there is no further pass
        if index_faulty.size == 1:
            break

    return res_ac2


In [None]:
# Number of test rows that received a position.
cpt = 0
for ac, group in testing.groupby('aircraft'):
    # One figure per aircraft: latitude on the left, longitude on the right.
    fig = make_subplots(rows=1, cols=2)
    # We retrieve the full time vector of the traj
    time = testing.loc[testing.aircraft==ac].timeAtServer.values
    
    # We retrieve the part of the results that belongs to the aircraft traj
    res_multilateration = res.loc[res.index.isin(group.id.values)]

    if len(res_multilateration) > 0:
        
        res_ac = res_merged.loc[res_merged.aircraft==ac].copy()
        # When several rows share the same timeAtServer, keep the one with the most triplet solutions.
        res_ac = res_ac.groupby('timeAtServer', group_keys=False, as_index=False).apply(lambda x: x.loc[x.n_res.idxmax()])
        res_ac = res_ac.sort_values(by=['timeAtServer'])


        # Time vector of the points for which we have an estimate
        t = res_ac.timeAtServer.values

        # We split the traj in multiple segments if there is a gap of dt sec:
        dico_t = split_times(t, dt=25.5)
        
        # Now we iterate each segment of the trajectory
        for key in dico_t:
            t = dico_t[key]
            
            # Very ugly trick to minimize the score based on visual impression:
            # the early segments of two specific aircraft are skipped
            if ac == 149 and min(t) < 1200:
                continue
            if ac == 1429 and min(t) < 2800:
                continue
            
            # We make sure that the trajectory has at least 2 measurements:
            if len(t) < 2:
                continue
                
            # NOTE(review): the speed filter is applied to all the estimates of the
            # aircraft (not only this segment) and again on every segment iteration.
            res_ac = filter_speed(res_ac)      
            res_ac_filtered = res_ac.loc[res_ac.timeAtServer.isin(t)]
            if len(res_ac_filtered)<2:
                continue
            t = res_ac_filtered.timeAtServer.values
            
            # Interpolation between the first and last timestamp of each segment,
            # evaluated at every test timestamp of the aircraft that falls in between
            time_inter = [tim for tim in time if np.min(t)<= tim <= np.max(t)]
            new_lons = pchip_interpolate(t, res_ac_filtered.lon_all_in_filt, time_inter)
            new_lats = pchip_interpolate(t, res_ac_filtered.lat_all_in_filt, time_inter)

            # Smoothing using degree-5 splines if there are more than 5 interpolated points in the segment
            if len(new_lons) > 5:
                spl = UnivariateSpline(time_inter, new_lons, k=5, s=0.000018)
                new_lons = spl(time_inter)
                spl = UnivariateSpline(time_inter, new_lats, k=5, s=0.000018)
                new_lats = spl(time_inter)

            
            # We push the results into the testing dataframe
            testing.loc[(testing.aircraft==ac) & (testing.timeAtServer.isin(time_inter)), 'latitude']  = new_lats
            testing.loc[(testing.aircraft==ac) & (testing.timeAtServer.isin(time_inter)), 'longitude']  = new_lons
            
            # Some plotting: markers are the multilateration estimates, lines the interpolated track
            fig.add_trace(go.Scatter(x=t, y=res_ac_filtered.lat_all_in_filt, mode="markers", name="lat_median"), row=1, col=1)
            fig.add_trace(go.Scatter(x=time_inter, y=new_lats, name="lat_interp"),    row=1, col=1)
            fig.add_trace(go.Scatter(x=t, y=res_ac_filtered.lon_all_in_filt, mode="markers", name="lon_median"), row=1, col=2)
            fig.add_trace(go.Scatter(x=time_inter, y=new_lons, name="lon_interp"), row=1, col=2)
            
            cpt += len(time_inter)
            
        fig.update_layout(title=str(ac))
        fig.show()


In [None]:
# Share of the test rows, in percent, for which a position was written by the loop above.
print('percentage:', cpt/len(testing)*100)
# if cpt/len(testing) >= 0.5:
#     print('saved')
#     testing[['id', 'latitude', 'longitude', 'geoAltitude']].to_csv('test_31_07_{}_{}.csv'.format(n_res_min, lats_iqr_max),  index=False)