In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk /kaggle/input and print the full path of every file found
# (directories themselves are not printed).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/feature-eng/__results__.html
/kaggle/input/feature-eng/trainset_pandas.csv
/kaggle/input/feature-eng/total_trainset_pandas.csv
/kaggle/input/feature-eng/__notebook__.ipynb
/kaggle/input/feature-eng/valset_pandas.csv
/kaggle/input/feature-eng/__output__.json
/kaggle/input/feature-eng/testset_pandas.csv
/kaggle/input/feature-eng/custom.css
/kaggle/input/feature-eng/test/2021-04-29-US-MTV-2/SamsungS20Ultra/Status.csv
/kaggle/input/feature-eng/test/2021-04-29-US-MTV-2/SamsungS20Ultra/UncalGyro.csv
/kaggle/input/feature-eng/test/2021-04-29-US-MTV-2/SamsungS20Ultra/UncalMag.csv
/kaggle/input/feature-eng/test/2021-04-29-US-MTV-2/SamsungS20Ultra/UncalAccel.csv
/kaggle/input/feature-eng/test/2021-04-29-US-MTV-2/SamsungS20Ultra/Raw.csv
/kaggle/input/feature-eng/test/2021-04-29-US-MTV-2/SamsungS20Ultra/Fix.csv
/kaggle/input/feature-eng/test/2021-04-29-US-MTV-2/Pixel5/Status.csv
/kaggle/input/feature-eng/test/2021-04-29-US-MTV-2/Pixel5/UncalGyro.csv
/kaggle/input/feature-eng/test/202

In [2]:
! pip3 install simdkalman

import lightgbm as lgb 
import random 
from tqdm import tqdm
import joblib
import tensorflow as tf
from scipy.interpolate import interp1d 
import simdkalman 

Collecting simdkalman
  Downloading simdkalman-1.0.1-py2.py3-none-any.whl (11 kB)
Installing collected packages: simdkalman
Successfully installed simdkalman-1.0.1


In [3]:
def check_random(SEED=2021):
    """Seed the random number generators used by this notebook.

    Sets the ``TF_DETERMINISTIC_OPS`` and ``PYTHONHASHSEED`` environment
    variables, then seeds Python's ``random`` module, NumPy's global generator
    and TensorFlow with the same value.

    Args:
        SEED: Seed value shared by all generators (default 2021).
    """
    os.environ.update({
        'TF_DETERMINISTIC_OPS': '1',
        'PYTHONHASHSEED': str(SEED),
    })
    # Same seeding order as before: random, then NumPy, then TensorFlow.
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(SEED)

class KF():
    """Kalman smoother for latitude/longitude tracks sampled on a one-second grid.

    The state vector has six entries: the two observed coordinates, followed by
    their first and second differences per step (a constant-acceleration model).
    Only the first two state entries are observed.
    """

    def __init__(self):
        """Build the ``simdkalman.KalmanFilter`` and store it on ``self.kf``."""
        dt = 1.0  # one filter step per second (see the // 1000 in apply_kf_smoothing)

        # Start from the identity and add the couplings between the state entries.
        transition = np.eye(6)
        for axis in range(2):
            transition[axis, axis + 2] = dt                 # position <- velocity
            transition[axis + 2, axis + 4] = dt             # velocity <- acceleration
            transition[axis, axis + 4] = 0.5 * dt ** 2      # position <- acceleration

        # Diagonal noise plus a small constant added to every entry.
        q_noise = np.full((6, 6), 1e-9) + np.diag([1e-5, 1e-5, 5e-6, 5e-6, 1e-6, 1e-6])
        r_noise = np.full((2, 2), 1e-9) + np.diag([5e-5, 5e-5])

        # Only the first two state entries (the coordinates) are measured.
        observe_position = np.array([[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0]])

        self.kf = simdkalman.KalmanFilter(
            state_transition=transition,
            process_noise=q_noise,
            observation_model=observe_position,
            observation_noise=r_noise)

    def apply_kf_smoothing(self, df, suffix):
        """Smooth ``latDeg``/``lngDeg`` of every phone in ``df`` in place.

        Each phone's rows are put on a gap-free one-second grid (seconds with no
        row become NaN observations), the smoother is run over that grid, the
        filled-in seconds are discarded again, and the smoothed coordinates are
        written to ``'latDeg' + suffix`` and ``'lngDeg' + suffix``.

        Args:
            df: DataFrame with ``phone``, ``millisSinceGpsEpoch``, ``latDeg`` and
                ``lngDeg`` columns. Modified in place.
            suffix: Text appended to the output column names; an empty string
                overwrites the input columns.

        Returns:
            None.
        """
        for phone in tqdm(df["phone"].drop_duplicates().tolist()):
            phone_rows = df['phone'] == phone
            track = df[phone_rows].copy()

            # Column 0 holds the whole-second timestamp used as the grid key.
            track[0] = track["millisSinceGpsEpoch"] // 1000
            second_grid = pd.DataFrame(range(track[0].min(), track[0].max() + 1))
            track = track.merge(second_grid, on=[0], how="right")

            # Grid rows without a source row have a null millisSinceGpsEpoch.
            filler_rows = track[track["millisSinceGpsEpoch"].isnull()].index.to_list()

            observations = track[['latDeg', 'lngDeg']].to_numpy()
            observations = observations.reshape(1, len(observations), 2)

            state_means = self.kf.smooth(observations).states.mean
            # Drop the filler seconds again so the lengths match df[phone_rows].
            state_means = np.delete(state_means, list(filler_rows), 1)

            df.loc[phone_rows, 'latDeg' + suffix] = state_means[0, :, 0]
            df.loc[phone_rows, 'lngDeg' + suffix] = state_means[0, :, 1]

def mean_with_other_phones(df):
    """
    Average every phone's track with the other phones of the same collection.

    Based on https://www.kaggle.com/bpetrb/adaptive-gauss-phone-mean

    For each ``collectionName`` the tracks of all its phones are gathered. For
    every phone, the other phones' positions are linearly interpolated onto
    that phone's timestamps and averaged in, sample by sample, over the index
    window computed below. Samples outside every window keep their own value.

    Args:
        df: DataFrame with ``collectionName``, ``phoneName``,
            ``millisSinceGpsEpoch``, ``latDeg`` and ``lngDeg`` columns. Rows of
            one phone are assumed to be sorted by time (``interp1d`` is called
            with ``assume_sorted=True``). Modified in place.

    Returns:
        The same ``df`` object, with ``latDeg``/``lngDeg`` replaced by the means.
    """
    for collection in df['collectionName'].drop_duplicates().to_numpy():
        in_collection = df['collectionName'] == collection

        # Row mask and (time, lat, lng) array for every phone of this collection.
        phone_masks = {}
        phone_data = {}
        for phone in df.loc[in_collection, 'phoneName'].drop_duplicates().to_numpy():
            mask = (in_collection & (df['phoneName'] == phone)).to_numpy()
            phone_masks[phone] = mask
            phone_data[phone] = df.loc[mask, ['millisSinceGpsEpoch', 'latDeg', 'lngDeg']].to_numpy()

        corrections = {}
        for current, track in phone_data.items():
            times = track[:, 0]

            # Column 0 counts the tracks contributing to each sample (starting
            # with this phone itself); columns 1-2 accumulate lat/lng sums.
            # np.float was removed in NumPy 1.24, hence the explicit float64.
            correction = np.ones(track.shape, dtype=np.float64)
            correction[:, 1:] = track[:, 1:]

            # Phones are not sampled at identical times, so interpolate.
            for other, other_track in phone_data.items():
                if other == current:
                    continue

                loc = interp1d(other_track[:, 0],
                               other_track[:, 1:],
                               axis=0,
                               kind='linear',
                               copy=False,
                               bounds_error=None,
                               fill_value='extrapolate',
                               assume_sorted=True)

                # Index window, exactly as in the referenced kernel:
                # start = last sample strictly before the other track begins,
                # stop  = last sample strictly before the other track ends
                # (0 when there is no such sample). The slice is half-open, so
                # the sample at `stop` itself is not averaged.
                before_first = np.flatnonzero(times < other_track[0, 0])
                before_last = np.flatnonzero(times < other_track[-1, 0])
                start_idx = before_first[-1] if before_first.size else 0
                stop_idx = before_last[-1] if before_last.size else 0

                if stop_idx - start_idx > 0:
                    window = slice(start_idx, stop_idx)
                    correction[window, 0] += 1
                    correction[window, 1:] += loc(times[window])

            # Turn the accumulated sums into means.
            correction[:, 1] /= correction[:, 0]
            correction[:, 2] /= correction[:, 0]

            corrections[current] = correction

        # Write back only after all phones are processed, so every mean is
        # computed from the original (uncorrected) tracks.
        for phone, mask in phone_masks.items():
            df.loc[mask, ['latDeg', 'lngDeg']] = corrections[phone][:, 1:]

    return df

In [4]:
# Seed random, NumPy and TensorFlow with the default seed (2021) before any modelling.
check_random()

In [5]:
# Labelled train / validation frames and the test frame produced by the
# "keras-ann-dis-speed" notebook; `test` carries the pred_* columns used below
# to decide which test rows are corrected or discarded.
train_pd = pd.read_csv("../input/keras-ann-dis-speed/train_pd.csv")
val_pd = pd.read_csv("../input/keras-ann-dis-speed/val_pd.csv")
test = pd.read_csv("../input/keras-ann-dis-speed/test.csv")
# Test frame from the "feature-eng" notebook; it is the frame that gets corrected.
# NOTE(review): later cells index test_data with boolean masks computed on `test`,
# so both frames are assumed to share the same row index/order — confirm.
test_data = pd.read_csv("../input/feature-eng/testset_pandas.csv")
# Names of the model input columns, stored as a NumPy array and converted to a list.
feature_cols = list(np.load("../input/keras-ann-dis-speed/feature_cols.npy"))

In [6]:
# Regression targets: baseline coordinate minus its *_truth counterpart,
# added as dis_lat / dis_lng to both labelled frames.
for labelled in (train_pd, val_pd):
    labelled['dis_lat'] = labelled['latDeg'] - labelled['latDeg_truth']
    labelled['dis_lng'] = labelled['lngDeg'] - labelled['lngDeg_truth']

In [7]:
# Upper limits that define a "normal" row: rows at or below BOTH limits are used to
# train the LightGBM offset models and are the test rows those models correct.
# Each limit is the midpoint of two values (7.125 and 14.25 respectively).
# NOTE(review): units are not stated in this notebook — confirm against the notebook
# that produced the dist_/speed_ columns.
th_speed_normal = (6.75 + 7.5) / 2 
th_dist_normal = (13.5 + 15) / 2

# Alternative setting kept for reference (same values as the "abnormal" thresholds below):
# th_speed_normal = 16
# th_dist_normal = 13

In [8]:
# Keep only the labelled rows whose baseline-vs-truth distance AND speed are within the
# "normal" thresholds; .copy() detaches the selections from the source frames.
train_is_normal = (
    (train_pd["dist_between_baseline_and_truth"] <= th_dist_normal)
    & (train_pd["speed_between_baseline_and_truth"] <= th_speed_normal)
)
train_regression = train_pd.loc[train_is_normal].copy()

val_is_normal = (
    (val_pd["dist_between_baseline_and_truth"] <= th_dist_normal)
    & (val_pd["speed_between_baseline_and_truth"] <= th_speed_normal)
)
val_regression = val_pd.loc[val_is_normal].copy()

In [9]:
# (rows, columns) of the filtered training and validation frames.
train_regression.shape, val_regression.shape

((68981, 142), (54302, 142))

In [10]:
# Model inputs (feature_cols) and the two offset targets for train and validation.
target_cols = ['dis_lat', 'dis_lng']
xtr, ytr = train_regression[feature_cols], train_regression[target_cols]
xval, yval = val_regression[feature_cols], val_regression[target_cols]

In [11]:
# LightGBM model for the latitude offset (dis_lat), trained with an L1 objective.
params = {
    'objective': 'regression_l1',
    'max_bin': 600,
    'learning_rate': 0.01,
    'num_leaves': 80
}

lgb_train = lgb.Dataset(xtr, ytr['dis_lat'])
# reference=lgb_train makes the validation set reuse the training set's feature binning.
lgb_eval = lgb.Dataset(xval, yval['dis_lat'], reference=lgb_train)
# The `early_stopping_rounds` and `verbose_eval` keyword arguments were removed from
# lgb.train in LightGBM 4.0; the callbacks below request the same behaviour
# (stop after 10 rounds without improvement, log every 25 rounds).
# Requires LightGBM >= 3.3 (lgb.log_evaluation).
model_lat = lgb.train(params, lgb_train,
                      valid_sets=[lgb_train, lgb_eval],
                      num_boost_round=10000,
                      callbacks=[
                          lgb.early_stopping(stopping_rounds=10),
                          lgb.log_evaluation(period=25),
                      ]
)

# Persist the trained booster in the working directory.
joblib.dump(model_lat, 'model_lat.pkl')

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 43472
[LightGBM] [Info] Number of data points in the train set: 68981, number of used features: 83
[LightGBM] [Info] Start training from score 0.000004
Training until validation scores don't improve for 10 rounds
[25]	training's l1: 1.36482e-05	valid_1's l1: 1.34258e-05
[50]	training's l1: 1.26385e-05	valid_1's l1: 1.30122e-05
[75]	training's l1: 1.18532e-05	valid_1's l1: 1.27558e-05
[100]	training's l1: 1.12479e-05	valid_1's l1: 1.25848e-05
[125]	training's l1: 1.07644e-05	valid_1's l1: 1.2453e-05
[150]	training's l1: 1.03712e-05	valid_1's l1: 1.23594e-05
[175]	training's l1: 1.00545e-05	valid_1's l1: 1.22813e-05
[200]	training's l1: 9.79113e-06	valid_1's l1: 1.22392e-05
[225]	training's l1: 9.56255e-06	valid_1's l1: 1.22118e-05
[250]	training's l1: 9.35036e-06	valid_1's l1: 1.2186e-05
[275]	training's l1: 9.15783e-06	valid_1's l1: 1.21741e-05
[300]	training's l1: 8.98909e-06	valid_1's l1: 1.21597e-

['model_lat.pkl']

In [12]:
# LightGBM model for the longitude offset (dis_lng), trained with an L1 objective.
params = {
    'objective': 'regression_l1',
    'max_bin': 600,
    'learning_rate': 0.01,
    'num_leaves': 80
}

lgb_train = lgb.Dataset(xtr, ytr['dis_lng'])
# reference=lgb_train makes the validation set reuse the training set's feature binning.
lgb_eval = lgb.Dataset(xval, yval['dis_lng'], reference=lgb_train)
# The `early_stopping_rounds` and `verbose_eval` keyword arguments were removed from
# lgb.train in LightGBM 4.0; the callbacks below request the same behaviour
# (stop after 10 rounds without improvement, log every 25 rounds).
# Requires LightGBM >= 3.3 (lgb.log_evaluation).
model_lng = lgb.train(
    params, lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    num_boost_round=10000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        lgb.log_evaluation(period=25),
    ]
)

# Persist the trained booster in the working directory.
joblib.dump(model_lng, 'model_lng.pkl')

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 43472
[LightGBM] [Info] Number of data points in the train set: 68981, number of used features: 83
[LightGBM] [Info] Start training from score -0.000001
Training until validation scores don't improve for 10 rounds
[25]	training's l1: 1.63732e-05	valid_1's l1: 1.78855e-05
[50]	training's l1: 1.51353e-05	valid_1's l1: 1.7213e-05
[75]	training's l1: 1.41875e-05	valid_1's l1: 1.67605e-05
[100]	training's l1: 1.34259e-05	valid_1's l1: 1.6436e-05
[125]	training's l1: 1.27988e-05	valid_1's l1: 1.61772e-05
[150]	training's l1: 1.22976e-05	valid_1's l1: 1.59849e-05
[175]	training's l1: 1.18973e-05	valid_1's l1: 1.58413e-05
[200]	training's l1: 1.15667e-05	valid_1's l1: 1.57473e-05
[225]	training's l1: 1.12782e-05	valid_1's l1: 1.56746e-05
[250]	training's l1: 1.10176e-05	valid_1's l1: 1.56212e-05
[275]	training's l1: 1.07897e-05	valid_1's l1: 1.55833e-05
[300]	training's l1: 1.05871e-05	valid_1's l1: 1.55633e

['model_lng.pkl']

In [13]:
# Test rows whose PREDICTED distance and speed errors are both within the "normal"
# thresholds. The mask is computed on `test` and applied to `test_data`.
normal_rows = (
    (test['pred_dist_between_baseline_and_truth'] <= th_dist_normal)
    & (test['pred_speed_between_baseline_and_truth'] <= th_speed_normal)
)
regression = test_data.loc[normal_rows].copy()

In [14]:
# Predict the latitude / longitude offsets for the selected test rows.
regression_features = regression[feature_cols]
dis_lat = model_lat.predict(regression_features)
dis_lng = model_lng.predict(regression_features)

In [15]:
# Subtract the predicted offsets from the baseline coordinates of the "normal" test rows.
# The update is in place, so running this cell a second time subtracts the offsets again.
rows_to_correct = (
    (test['pred_dist_between_baseline_and_truth'] <= th_dist_normal)
    & (test['pred_speed_between_baseline_and_truth'] <= th_speed_normal)
)
test_data.loc[rows_to_correct, 'latDeg'] -= dis_lat
test_data.loc[rows_to_correct, 'lngDeg'] -= dis_lng

In [16]:
kf = KF()

# Work on a copy so test_data keeps the regression-corrected coordinates.
tmp = test_data.copy()

# A row is "abnormal" when EITHER predicted error exceeds its limit.
th_speed_abnormal = 16
th_dist_abnormal = 13

# th_speed_abnormal = th_speed_normal
# th_dist_abnormal = th_dist_normal

abnormal_rows = (
    (test['pred_dist_between_baseline_and_truth'] > th_dist_abnormal)
    | (test['pred_speed_between_baseline_and_truth'] > th_speed_abnormal)
)

# Report how many rows are discarded, then blank their coordinates so the Kalman
# smoother receives NaN observations for them.
print(len(tmp.loc[abnormal_rows, :]))
tmp.loc[abnormal_rows, ['latDeg', 'lngDeg']] = np.nan

2605


In [17]:
# Kalman-smooth every phone track in place; the empty suffix means latDeg / lngDeg
# themselves are overwritten (including the rows blanked to NaN above).
kf.apply_kf_smoothing(tmp, "")
# mean_with_other_phones edits and returns the frame it is given, so smoothed_tmp
# and tmp refer to the same object.
smoothed_tmp = mean_with_other_phones(tmp)

100%|██████████| 48/48 [00:44<00:00,  1.08it/s]


In [18]:
# Load the competition's sample submission; only its (phone, millisSinceGpsEpoch) keys are used.
data_sub = pd.read_csv('../input/google-smartphone-decimeter-challenge' + '/' + 'sample_submission.csv')
# Attach the smoothed coordinates to those keys. The join is an inner join, so a key
# that has no match in smoothed_tmp is dropped from df_sub.
df_sub = data_sub[['phone', 'millisSinceGpsEpoch']].merge(smoothed_tmp[['phone', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']], on=['phone', 'millisSinceGpsEpoch'], how='inner')

In [19]:
# The four thresholds are echoed and also embedded in the output file name so each
# submission file records the setting that produced it.
threshold_values = (th_dist_normal, th_speed_normal, th_dist_abnormal, th_speed_abnormal)
print('tdn{}_tsn{}_tdan{}_tsan{}'.format(*threshold_values))
df_sub.to_csv("./sub_lgb_after_ann_normal_tdn{}_tsn{}_tdan{}_tsan{}.csv".format(*threshold_values), index=False)

tdn14.25_tsn7.125_tdan13_tsan16
