In this notebook, we impute missing / outlier data and separate normal and adversarial trips

In [139]:
import pandas as pd
import numpy as np
from pykalman import KalmanFilter

In [111]:
# Read df_cleaned_location.csv (two directory levels above the working directory) into `df`.
df = pd.read_csv("../../df_cleaned_location.csv")

In [112]:
# Discard columns that are not used in the rest of this notebook.
df.drop(
    columns=[
        "CARGO",
        "CARGO_PAX",
        "PAX",
        "TRACK_MADE_GOOD",
        "ENGINE_1_FLOWRATEB",
        "ENGINE_1_FLOWTEMPB",
        "ENGINE_2_FLOWRATEB",
        "ENGINE_2_FLOWTEMPB",
    ],
    inplace=True,
)

In [113]:
# Subtract one full turn from wind angles above 360; values <= 360 (and NaN) are left as-is.
# Vectorized replacement for the previous row-by-row apply/lambda.
df["WIND_ANGLE"] = df["WIND_ANGLE"].mask(df["WIND_ANGLE"] > 360, df["WIND_ANGLE"] - 360)
# Keep only rows whose trip_id is non-zero (presumably 0 marks rows not assigned to a trip -- TODO confirm).
# .copy() makes `df` an independent frame so the later `df.loc[...] = np.nan` writes
# do not trigger SettingWithCopyWarning.
df = df[df.trip_id != 0].copy()

In [114]:
# Columns that are left out of the IQR outlier scan below.
remove_list = [
    "Dati", "Time", "HEADING", "LONGITUDE", "LATITUDE", "WIND_ANGLE",
    "WIND_ANGLE_TRUE", "WIND_SPEED", "trip_id", "DEPTH", "PITCH_1", "PITCH_2",
]
cols = df.columns.tolist()
for col in remove_list:
    # list.remove raises ValueError if an expected column is absent from df
    cols.remove(col)

In [115]:
# Report every column in `cols` that has values outside the 1.5 * IQR fences.
for col in cols:
    q1, q3 = df[col].quantile(.25), df[col].quantile(.75)
    IQR = q3 - q1
    margin = abs(1.5 * IQR)
    lower, upper = q1 - margin, q3 + margin
    beyond_fences = df[col].lt(lower) | df[col].gt(upper)
    # count() only counts rows whose Dati value is non-null
    outlier_count = df.loc[beyond_fences, "Dati"].count()
    if outlier_count > 0:
        # column name, count, then two blank lines (same layout as three separate prints)
        print(col, outlier_count, "\n", sep="\n")

ENGINE_1_FLOWRATE
15099


ENGINE_1_FLOWRATEA
13745


ENGINE_1_FLOWTEMPA
2


ENGINE_1_FUEL_CONSUMPTION
15173


ENGINE_2_FLOWRATE
14339


ENGINE_2_FLOWRATEA
8528


ENGINE_2_FLOWTEMPA
2


ENGINE_2_FUEL_CONSUMPTION
14298


RATE_OF_TURN
83064


SOG
43729


SOG_SPEEDLOG_TRANS
21373


SPEED_1
1


STW
43057


WIND_SPEED_TRUE
10




### flow temp
Low flow temp happens only when SOG is low, and at a time where outside temperature could be low
Seems reasonable

In [116]:
# Rows whose ENGINE_1_FLOWTEMPA lies outside the 1.5 * IQR fences, shown with related fields.
col = "ENGINE_1_FLOWTEMPA"
q1, q3 = df[col].quantile(.25), df[col].quantile(.75)
IQR = q3 - q1
margin = abs(1.5 * IQR)
lower, upper = q1 - margin, q3 + margin
beyond_fences = df[col].lt(lower) | df[col].gt(upper)
tmp = df.loc[beyond_fences, ["ENGINE_1_FLOWTEMPA", "ENGINE_2_FLOWTEMPA", "Dati", "SOG"]]
# Display only the flagged rows where SOG is above 1.
tmp.loc[tmp["SOG"] > 1]

Unnamed: 0,ENGINE_1_FLOWTEMPA,ENGINE_2_FLOWTEMPA,Dati,SOG
489390,3.6667,3.6667,210214_162700,3.4283
532162,10.89,10.845,210325_000100,8.91


### POWER_2
Observing from the visualization, the range for power 1 and 2 is similar, 
but power 2 has a lot of outliers. \
This is because power 2 has more 0's,
(i.e. engine 2 is used less than engine 1).\
No need to remove these outliers

### RATE_OF_TURN
Outliers in rate of turn are also caused by 0 values.

### SOG
outliers are lower SOGs, these low speed values are reasonable to have.

In [118]:
# Compare the 1.5 * IQR fences for SOG with the observed value range.
col = "SOG"
q1, q3 = df[col].quantile(.25), df[col].quantile(.75)
IQR = q3 - q1
margin = abs(1.5 * IQR)
lower, upper = q1 - margin, q3 + margin
print(lower, upper)
print(df[col].min(), df[col].max())

16.216749999999998 21.62995
0.0833 21.875


### SOG_SPEEDLOG_TRANS
The extreme values in SOG_SPEEDLOG_TRANS seem reasonable when compared with the corresponding SOG and SOG_SPEEDLOG_LONG.

In [126]:
# Compare the 1.5 * IQR fences for SOG_SPEEDLOG_TRANS with the observed range, then check
# the flagged rows against SOG using the squared speed components.
col = "SOG_SPEEDLOG_TRANS"
q1 = df[col].quantile(.25)
q3 = df[col].quantile(.75)
IQR = q3 - q1
lower = q1 - abs(1.5 * IQR)
upper = q3 + abs(1.5 * IQR)
print(lower, upper)
print(df.SOG_SPEEDLOG_TRANS.min(), df.SOG_SPEEDLOG_TRANS.max())
# Define `tmp` here instead of relying on a variable left over from another cell, so the
# cell gives the same result after Restart & Run All.
# NOTE(review): the recorded output lists SOG, SOG_SPEEDLOG_TRANS and SOG_SPEEDLOG_LONG, so
# `tmp` is assumed to be the out-of-fence rows restricted to those columns -- confirm.
# .copy() lets the new columns be added without SettingWithCopyWarning.
tmp = df.loc[(df[col] < lower) | (df[col] > upper),
             ["SOG", "SOG_SPEEDLOG_TRANS", "SOG_SPEEDLOG_LONG"]].copy()
# Both derived columns hold squared speeds (transverse^2 + longitudinal^2 vs. SOG^2).
tmp["computed_SOG"] = tmp.SOG_SPEEDLOG_TRANS**2 + tmp.SOG_SPEEDLOG_LONG**2
tmp["SOG_squred"] = tmp.SOG ** 2
# Show rows where the squared component sum exceeds SOG^2 by more than 3.
tmp[(tmp.computed_SOG - tmp.SOG_squred)>3]

-1.11925 1.24755
-4.3033 4.7467


Unnamed: 0,SOG,SOG_SPEEDLOG_TRANS,SOG_SPEEDLOG_LONG,computed_SOG,SOG_squred


### SPEED_1
The extreme values in SPEED_1 seem unreasonable when compared with related fields;
remove them and impute later.

In [129]:
# Rows whose SPEED_1 lies outside the 1.5 * IQR fences, shown with the related speed/power fields.
col = "SPEED_1"
q1, q3 = df[col].quantile(.25), df[col].quantile(.75)
IQR = q3 - q1
margin = abs(1.5 * IQR)
lower, upper = q1 - margin, q3 + margin
beyond_fences = df[col].lt(lower) | df[col].gt(upper)
tmp = df.loc[beyond_fences, ["SPEED_1", "SPEED_2", "POWER_1", "POWER_2"]]
tmp

Unnamed: 0,SPEED_1,SPEED_2,POWER_1,POWER_2
168303,-1750.5303,915.1393,18.9395,1154.2505


In [132]:
# Blank the SPEED_1 outliers so they can be imputed later.
# Uses the index of `tmp` (the SPEED_1 outlier rows selected in the previous cell) instead of a
# row label copied from the output: a hard-coded label that is absent from `df` would make
# `.loc` silently append a new row rather than fail.
df.loc[tmp.index, "SPEED_1"] = np.nan

### STW
Similar to SOG, outliers are lower STWs, these low speed values are reasonable to have.

In [135]:
# Compare the 1.5 * IQR fences for STW with the observed value range.
col = "STW"
q1, q3 = df[col].quantile(.25), df[col].quantile(.75)
IQR = q3 - q1
margin = abs(1.5 * IQR)
lower, upper = q1 - margin, q3 + margin
print(lower, upper)
print(df[col].min(), df[col].max())

15.958200000000003 22.0518
0.0367 27.72


### WIND_SPEED_TRUE

In [136]:
# Rows whose WIND_SPEED_TRUE lies outside the 1.5 * IQR fences, shown with the other wind fields.
col = "WIND_SPEED_TRUE"
q1, q3 = df[col].quantile(.25), df[col].quantile(.75)
IQR = q3 - q1
margin = abs(1.5 * IQR)
lower, upper = q1 - margin, q3 + margin
beyond_fences = df[col].lt(lower) | df[col].gt(upper)
tmp = df.loc[beyond_fences, ["WIND_SPEED", "WIND_SPEED_TRUE", "WIND_ANGLE", "WIND_ANGLE_TRUE"]]
tmp

Unnamed: 0,WIND_SPEED,WIND_SPEED_TRUE,WIND_ANGLE,WIND_ANGLE_TRUE
45146,57.2433,76.171,8.0,85.8333
53774,57.3625,76.5931,7.0,76.0359
144429,62.4653,80.613,8.0,81.0367
333417,59.764,78.0139,359.0,32.0439
339833,61.3587,77.7006,36.0,-73.9282
404834,65.9267,83.8146,18.0,87.3742
410082,60.5967,78.6006,355.0,64.0122
493413,64.6753,81.5418,21.0,-86.275
605420,61.254,77.0383,31.0,-77.2211
727370,59.3053,77.0326,21.0,88.4293


In [138]:
# Blank both wind-speed fields on the rows currently held in `tmp`, in a single assignment.
outlier_indexes = list(tmp.index)
df.loc[outlier_indexes, ["WIND_SPEED", "WIND_SPEED_TRUE"]] = np.nan

In [140]:
# Persist the frame with outliers blanked out. to_csv returns None when given a path,
# so the result must not be assigned back to `df` (that would discard the frame).
df.to_csv("../../df_outlier_removed.csv", index=False)

In [141]:
# kalman filter to impute missing values
def impute_missing_values(data, transition_matrices, observation_matrices, transition_covariance,
                          observation_covariance, initial_state_mean, initial_state_covariance):
    """Fill NaN entries of ``data`` with Kalman-filtered state estimates.

    Parameters
    ----------
    data : array-like of float, shape (n_timesteps,) or (n_timesteps, n_dim_obs)
        Observations; NaN marks a missing value. The input is not modified.
    transition_matrices, observation_matrices, transition_covariance,
    observation_covariance, initial_state_mean, initial_state_covariance
        Forwarded unchanged to ``KalmanFilter``.

    Returns
    -------
    numpy.ndarray
        Float copy of ``data`` in which every NaN entry is replaced by the
        filtered state mean at the same position; observed entries are kept.

    Raises
    ------
    ValueError
        If the filtered state means cannot be aligned element-wise with
        ``data`` (state and observation sizes differ).

    Notes
    -----
    Values are taken directly from the state means, so this assumes the state
    and observation spaces coincide (e.g. an identity observation matrix).
    """
    # Work on a float copy so the caller's data is left untouched.
    values = np.array(data, dtype=float)
    mask = np.isnan(values)
    if not mask.any():
        # Nothing to impute; skip the filter pass entirely.
        return values

    kf = KalmanFilter(transition_matrices=transition_matrices,
                      observation_matrices=observation_matrices,
                      transition_covariance=transition_covariance,
                      observation_covariance=observation_covariance,
                      initial_state_mean=initial_state_mean,
                      initial_state_covariance=initial_state_covariance)

    # pykalman only treats *masked* entries as missing observations; raw NaNs
    # would propagate into every subsequent state estimate.
    filtered_state_means, _ = kf.filter(np.ma.masked_invalid(values))
    filtered_state_means = np.asarray(filtered_state_means)

    # The filter returns a 2-D (n_timesteps, n_dim_state) array; bring it to the
    # shape of the input so that 1-D series can be indexed with the same mask.
    if filtered_state_means.shape != values.shape:
        if filtered_state_means.size != values.size:
            raise ValueError(
                "filtered state means of shape %s cannot be aligned with data of shape %s"
                % (filtered_state_means.shape, values.shape)
            )
        filtered_state_means = filtered_state_means.reshape(values.shape)

    # Replace only the missing entries with their filtered estimates.
    values[mask] = filtered_state_means[mask]
    return values