In [212]:
import pandas as pd
import numpy as np
import datetime

In [223]:
# Load the CSV export into `df`. The path is relative to the notebook's
# working directory.
# skiprows=[1] skips the file's second line, i.e. the first line after the
# header row (presumably a units/description row — TODO confirm).
file_path = "../../queenCsvOut.csv"
df = pd.read_csv(file_path, skiprows=[1])

In [224]:
# Keep only rows that have at least 20 non-null values (`thresh` counts
# non-null entries, it is not a limit on the number of nulls), then drop the
# CARGO and CARGO_PAX columns.
# NOTE: both calls mutate `df` in place, so re-running this cell without
# reloading the CSV raises a KeyError because the columns are already gone.
df.dropna(axis=0, thresh=20, inplace=True)
df.drop(columns = ["CARGO", "CARGO_PAX"], inplace=True)

In [215]:
# Show the column labels of `df` together with how many columns there are.
df.columns, len(df.columns)

(Index(['Dati', 'Time', 'DEPTH', 'ENGINE_1_FLOWRATE', 'ENGINE_1_FLOWRATEA',
        'ENGINE_1_FLOWRATEB', 'ENGINE_1_FLOWTEMPA', 'ENGINE_1_FLOWTEMPB',
        'ENGINE_1_FUEL_CONSUMPTION', 'ENGINE_2_FLOWRATE', 'ENGINE_2_FLOWRATEA',
        'ENGINE_2_FLOWRATEB', 'ENGINE_2_FLOWTEMPA', 'ENGINE_2_FLOWTEMPB',
        'ENGINE_2_FUEL_CONSUMPTION', 'HEADING', 'LATITUDE', 'LONGITUDE', 'PAX',
        'PITCH_1', 'PITCH_2', 'POWER_1', 'POWER_2', 'RATE_OF_TURN', 'SOG',
        'SOG_SPEEDLOG_LONG', 'SOG_SPEEDLOG_TRANS', 'SPEED_1', 'SPEED_2', 'STW',
        'THRUST_1', 'THRUST_2', 'TORQUE_1', 'TORQUE_2', 'TRACK_MADE_GOOD',
        'WIND_ANGLE', 'WIND_SPEED', 'WIND_ANGLE_TRUE', 'WIND_SPEED_TRUE'],
       dtype='object'),
 39)

In [40]:
# trip id to identify which trip the records belong to
# df["trip_id"] = df["Dati"].apply(lambda x: x.split("_")[0]).astype(str)
# df = df[[df.columns[-1]] + list(df.columns[:-1])]
# df.trip_id.nunique()

In [42]:
# naive mode definition
# TODO: add more constraints to get a more accurate mode
# Cast both SPEED columns to float so the `< 1` comparisons and NaN checks
# used for the operation mode work on numeric values.
df["SPEED_1"] = df["SPEED_1"].astype(float)
df["SPEED_2"] = df["SPEED_2"].astype(float)

# MODE for operation mode: 1 for mode1, 2 for mode2, 0 for unknown(speed with nan)
def naive_operation_mode(row):
    """Classify one record into a coarse operation mode from its two speeds.

    Args:
        row: mapping-like record exposing "SPEED_1" and "SPEED_2".

    Returns:
        0 when either speed is missing (NaN),
        1 when exactly one of the two speeds is below 1,
        2 otherwise (both below 1, or both at/above 1).
    """
    speed_1 = row['SPEED_1']
    if pd.isna(speed_1):
        return 0
    speed_2 = row["SPEED_2"]
    if pd.isna(speed_2):
        return 0

    first_is_low = speed_1 < 1
    second_is_low = speed_2 < 1
    # Exactly one side being low is what the original XOR expressed.
    if first_is_low != second_is_low:
        return 1
    return 2

# Label every row with its operation mode. axis=1 calls the function once per
# row, so this is slow on large frames.
df["MODE"] = df.apply(naive_operation_mode, axis=1)

In [43]:
# Number of non-null values in every column, broken down by operation mode.
df.groupby("MODE").count()

Unnamed: 0_level_0,Dati,Time,CARGO,CARGO_PAX,DEPTH,ENGINE_1_FLOWRATE,ENGINE_1_FLOWRATEA,ENGINE_1_FLOWRATEB,ENGINE_1_FLOWTEMPA,ENGINE_1_FLOWTEMPB,...,STW,THRUST_1,THRUST_2,TORQUE_1,TORQUE_2,TRACK_MADE_GOOD,WIND_ANGLE,WIND_SPEED,WIND_ANGLE_TRUE,WIND_SPEED_TRUE
MODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,262909,262713,5,5,0,0,0,0,0,0,...,0,0,0,0,0,5,0,0,0,0
1,381318,381318,373160,373160,381318,381318,381318,381318,381318,381318,...,381318,381318,381318,381318,381318,373160,381318,381318,380702,380702
2,454627,454627,450222,450222,454627,454627,454627,454627,454627,454627,...,454627,454627,454627,454627,454627,450222,454627,454627,406502,406527


In [225]:
# from the Dati variable, get the corresponding season code, hour, and day of the week
def get_dt_info(minute, starting_dt):
    """Turn a minute offset into a timestamp plus calendar features.

    Args:
        minute: offset in minutes to add to ``starting_dt``.
        starting_dt: reference timestamp the offset is counted from.

    Returns:
        Tuple ``(timestamp, season_code, hour, weekday)`` where ``weekday``
        comes from ``.weekday()`` (Monday is 0, Sunday is 6) and
        ``season_code`` comes from ``get_season``.
    """
    moment = starting_dt + datetime.timedelta(minutes=minute)
    return moment, get_season(moment.month), moment.hour, moment.weekday()

# get season code from corresponding month:
# spring: 0, summer: 1, fall: 2, winter: 3
def get_season(month):
    """Map a month number to a season code using calendar quarters.

    Months up to 3 -> 3 (winter), up to 6 -> 0 (spring), up to 9 -> 1
    (summer), anything else -> 2 (fall).
    """
    # (last month of the quarter, season code), checked in ascending order.
    quarter_codes = ((3, 3), (6, 0), (9, 1))
    for last_month, code in quarter_codes:
        if month <= last_month:
            return code
    return 2

# convert Dati to python datetime format
# df["datetime"] = pd.to_datetime(df.Dati, format='%y%m%d_%H%M%S')
# Instead of parsing every Dati string, only the first row's Dati is parsed
# and each row's Time value is added to it as a minute offset.
# NOTE(review): this assumes Time counts minutes from the timestamp of the
# first row currently in df — confirm, since rows may have been dropped.
starting_dt = pd.to_datetime(df.iloc[0].Dati, format='%y%m%d_%H%M%S')
# get_dt_info returns one 4-tuple per row; zip(*...) transposes those tuples
# into four sequences, one per new column.
df["datetime"],df["season"], df["hour"], df["weekday"] = zip(*df["Time"].apply(get_dt_info, starting_dt = starting_dt))
df[["datetime", "season", "hour", "weekday"]]

Unnamed: 0,datetime,season,hour,weekday
0,2019-08-30 00:00:00,1,0,4
1,2019-08-30 00:01:00,1,0,4
2,2019-08-30 00:02:00,1,0,4
3,2019-08-30 00:03:00,1,0,4
4,2019-08-30 00:04:00,1,0,4
...,...,...,...,...
1098653,2021-09-30 22:53:00,1,22,3
1098654,2021-09-30 22:54:00,1,22,3
1098655,2021-09-30 22:55:00,1,22,3
1098656,2021-09-30 22:56:00,1,22,3


In [163]:
# current: row-wise difference STW - SOG (presumably speed through water
# minus speed over ground — confirm). The result is NaN wherever either
# input is NaN.
df["current"] = df["STW"] - df["SOG"]
df.current

0          0.0000
1          0.0166
2          0.0317
3          0.0384
4         -0.0950
            ...  
1098653    0.1667
1098654    0.0834
1098655    0.1217
1098656    0.2166
1098657    0.3850
Name: current, Length: 835945, dtype: float64

In [164]:
# mean longitude and latitude over all rows (missing values are skipped)
mean_long = df["LONGITUDE"].mean()
mean_lat = df["LATITUDE"].mean()
mean_long, mean_lat

(-123.46494295180521, 49.315970106145976)

In [178]:
# How often each distinct longitude value occurs, most frequent first.
df['LONGITUDE'].value_counts()

-123.2715    260538
-123.2714     94259
-123.9554     62250
-123.9547     10172
-123.2707      7788
              ...  
-70.4409          1
-68.2396          1
-66.0383          1
-63.8370          1
-18.9718          1
Name: LONGITUDE, Length: 7072, dtype: int64

In [180]:
# How often each distinct latitude value occurs, most frequent first.
df['LATITUDE'].value_counts()

49.3771    349193
49.1936     55011
49.3773     13532
49.1938     10318
49.4316      6778
            ...  
49.0760         1
49.0780         1
49.0820         1
49.0840         1
49.1934         1
Name: LATITUDE, Length: 2601, dtype: int64

In [183]:
# Horseshoe Bay terminal position, written as (latitude, longitude)
# 49.3771, -123.2715
# NOTE: the next expression computes the modes but its result is discarded,
# because it is neither assigned nor the last expression of the cell.
df["LONGITUDE"].mode(), df["LATITUDE"].mode()
H_lat = 49.3771
H_long = -123.2715

# Nanaimo terminal position, written as (latitude, longitude)
# 49.1936, -123.9554
N_lat = 49.1936
N_long = -123.9554

In [194]:
# 0: H, 1: N, 2: H-N, 3: N-H
def get_direction(data=None, h_pos=None, n_pos=None, sog_max=0.1):
    """Label every row with a terminal / sailing-direction code.

    Codes:
        0 - docked at H
        1 - docked at N
        2 - not docked, last dock was H (H-N leg)
        3 - not docked, last dock was N (N-H leg)

    A row counts as docked when its LONGITUDE and LATITUDE are exactly equal
    to a terminal position and its SOG is below ``sog_max``. The first row is
    always coded 0, whatever its position. Rows with NaN position or SOG are
    never treated as docked and inherit the leg of the previous dock.

    Args:
        data: frame with LONGITUDE, LATITUDE and SOG columns. Defaults to the
            notebook-level ``df``.
        h_pos: ``(latitude, longitude)`` of terminal H. Defaults to the
            notebook-level ``(H_lat, H_long)``.
        n_pos: ``(latitude, longitude)`` of terminal N. Defaults to the
            notebook-level ``(N_lat, N_long)``.
        sog_max: SOG strictly below this value counts as stationary.

    Returns:
        1-D float numpy array with one code per row (empty for an empty frame).
    """
    if data is None:
        data = df
    h_lat, h_long = (H_lat, H_long) if h_pos is None else h_pos
    n_lat, n_long = (N_lat, N_long) if n_pos is None else n_pos

    if len(data) == 0:
        # The row-by-row version crashed here on its first-row assignment.
        return np.zeros(0)

    # Pull each column out once instead of materialising a row per iteration.
    lon = data["LONGITUDE"].to_numpy(dtype=float)
    lat = data["LATITUDE"].to_numpy(dtype=float)
    stationary = data["SOG"].to_numpy(dtype=float) < sog_max  # NaN -> False

    at_h = (lon == h_long) & (lat == h_lat) & stationary
    at_n = (lon == n_long) & (lat == n_lat) & stationary

    # Dock code where docked (H wins if both match), NaN while under way.
    dock = np.where(at_h, 0.0, np.where(at_n, 1.0, np.nan))
    dock[0] = 0.0  # the sequence is seeded as "at H"

    # While under way the code is "last dock + 2": 0 -> 2 (H-N), 1 -> 3 (N-H).
    last_dock = pd.Series(dock).ffill().to_numpy()
    return np.where(np.isnan(dock), last_dock + 2.0, dock)
# One direction code per row of df, returned as a float numpy array.
direcs = get_direction()

In [195]:
# Store the codes on the frame and tally how many rows fall under each code.
df["direction"] = direcs
df["direction"].value_counts()

2.0    311066
0.0    257966
3.0    211903
1.0     55010
Name: direction, dtype: int64

In [165]:
# weather related features
# temperature, humidity, pressure, precipitation, rain, snowfall
# weathercode: uses WMO weather codes

# Hourly weather table; it is joined to df on calendar day + hour of day.
weather = pd.read_csv("weather.csv")
weather.columns = ["time", "temperature", "humidity", "pressure", "precipitation", "rain", "snowfall", "weathercode"]
weather["time"] = pd.to_datetime(weather["time"], format='%Y-%m-%dT%H:%M')
weather["day"] = weather["time"].dt.date
weather["hour"] = weather["time"].dt.hour
# Strip the trailing backslash from the raw codes and make them integers.
# This is done on the weather table, before the merge, so that rows of df
# without a weather match (NaN after the left join) cannot crash the cleanup.
weather["weathercode"] = weather["weathercode"].apply(lambda x: int(str(x).strip("\\")))

df["day"] = df["datetime"].dt.date
df["hour"] = df["datetime"].dt.hour
# Only the helper columns "day" and "time" are removed afterwards. "hour" is
# also a feature of df (built together with season/weekday), so it is kept.
df = pd.merge(df, weather, on=["day", "hour"], how="left").drop(["day", "time"], axis=1)

In [166]:
# Distinct weather codes present in df.
df.weathercode.unique()

array([ 2,  3, 51,  1, 53, 63,  0, 55, 61, 71, 73, 75])

In [167]:
def get_wind_direction(angle):
    """Bucket a wind angle into one of three sectors by its absolute value.

    Returns 0 for |angle| <= 60, 1 for |angle| <= 120, 2 for |angle| <= 180,
    and NaN for anything else (including a NaN input).
    """
    magnitude = np.abs(angle)
    # (upper bound of the sector, sector code), checked in ascending order.
    for upper_bound, sector in ((60, 0), (120, 1), (180, 2)):
        if magnitude <= upper_bound:
            return sector
    return np.nan
    
# wind_force: square of WIND_SPEED_TRUE.
# wind_direc: sector code of WIND_ANGLE_TRUE as returned by get_wind_direction
# (NaN when the absolute angle exceeds 180 or is missing).
df["wind_force"] = df["WIND_SPEED_TRUE"] ** 2
df["wind_direc"] = df["WIND_ANGLE_TRUE"].apply(get_wind_direction)

In [196]:
# Public holidays between September 2019 and September 2021, as (year, month, day).
# NOTE(review): presumably the statutory holidays of the region the data
# comes from — confirm the list against an official calendar.
holidays = [
    datetime.datetime(year, month, day)
    for year, month, day in (
        # 2019
        (2019, 9, 2), (2019, 10, 14), (2019, 11, 11), (2019, 12, 25),
        # 2020
        (2020, 1, 1), (2020, 2, 17), (2020, 4, 10), (2020, 5, 18),
        (2020, 7, 1), (2020, 8, 3), (2020, 9, 7), (2020, 10, 12),
        (2020, 11, 11), (2020, 12, 25),
        # 2021
        (2021, 1, 1), (2021, 2, 15), (2021, 4, 2), (2021, 5, 24),
        (2021, 7, 1), (2021, 8, 2), (2021, 9, 6),
    )
]
# Every listed day gets the flag 1. Days that are not listed simply have no row.
is_holiday = [1] * len(holidays)
holidays = pd.DataFrame({"date": holidays, "holiday": is_holiday})
# Reduce the timestamps to plain dates so the table can be joined on a date key.
holidays["date"] = holidays["date"].dt.date

In [227]:
# Join the holiday flag onto every row by calendar date. Dates that are not
# in the holidays table get NaN from the left join, which is turned into 0.
df["date"] = df["datetime"].dt.date
df = pd.merge(df, holidays, on="date", how="left")
df["holiday"] = df["holiday"].fillna(0)
# "weekday" comes from datetime.weekday(), where Monday is 0 and Sunday is 6,
# so Monday-Friday is 0..4. The previous test (> 0 and <= 5) selected
# Tuesday-Saturday. A working day is Monday-Friday and not a holiday.
df["is_weekday"] = (df["weekday"] >= 0) & (df["weekday"] <= 4) & (df["holiday"] != 1)