# Feature Engineering
This notebook experiments with adding new, potentially informative features to the dataset:
- Time-related features
- Distance-based features

In [None]:
import pandas as pd
import sys

sys.path.append('../')
from cool_train.feature_engineering import add_time_features

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to the local cool_train package take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Load the already-cleaned dataset; the first CSV column becomes the index.
clean_csv_path = '../dataset/train_181/df_filter2.csv'
df_clean = pd.read_csv(clean_csv_path, index_col=[0])
df_clean

# Time-related features

In [None]:
# Derive the time-feature frame from the cleaned data using the project helper.
# NOTE(review): confirm whether add_time_features returns a new frame or
# mutates df_clean in place; that is not visible from this notebook.
df_time = add_time_features(df_clean)
df_time

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Columns of df_time whose distributions are drawn, in panel order.
time_features = ['month','day','hour','minute','second','dayOfWeek',
                 'isWeekend','quarter','timeOfDay']

# Number of histogram bins per feature (only consulted for the histogram panels).
bin_sizes = {'month': 12, 'day': 31, 'hour': 24, 'minute': 60, 'second': 60,
             'dayOfWeek': 7, 'quarter': 4,'timeOfDay': 4,'isWeekend':2}

# Features drawn as per-value count bars rather than as binned histograms.
count_plot_features = ('isWeekend', 'quarter', 'timeOfDay', 'dayOfWeek', 'month')

# One panel per feature, laid out on a 3x4 grid (unused slots are left empty).
fig = plt.figure(figsize=(15, 10))
for i, feature in enumerate(time_features, 1):
    ax = fig.add_subplot(3, 4, i)
    if feature not in count_plot_features:
        # Histogram with a KDE overlay for the remaining features.
        sns.histplot(df_time[feature], bins=bin_sizes[feature], kde=True, ax=ax)
    else:
        # Bar of counts per distinct value.
        sns.countplot(x=feature, data=df_time, ax=ax)

    ax.set_title(f'Distribution of {feature}')

fig.tight_layout()
plt.show()

# GPS-related features
Let's now compute all the potential features we can think of using the GPS coordinates.

In [None]:
from cool_train.feature_engineering import compute_spatial_features

In [None]:
# Extend the time-feature frame with spatial features via the project helper.
# The plotting cell further down reads the 'Speed', 'Heading' and
# 'timestamps_UTC' columns from this result.
# NOTE(review): confirm whether compute_spatial_features returns a new frame or
# mutates df_time in place; that is not visible from this notebook.
df_space_time = compute_spatial_features(df_time)
df_space_time

In [None]:
import pandas as pd
import plotly.express as px


# Plot window (both bounds inclusive), expressed in Brussels local time.
start_date = pd.Timestamp('2023-01-01', tz='Europe/Brussels')
end_date = pd.Timestamp('2023-03-01', tz='Europe/Brussels')

# Rows whose 'Speed' exceeds this value are treated as aberrant and dropped.
# NOTE(review): the unit of 'Speed' is not visible here; confirm that it matches
# both this threshold and the 'Speed (km/h)' axis label below.
speed_threshold = 0.1

# Filter out aberrant speed values
filtered_df = df_space_time[df_space_time['Speed'] <= speed_threshold]

# Apply a time-based rolling window for smoothing.
# '10min' is the non-deprecated spelling of the former '10T' alias.
window_size = '10min'
# An offset-based rolling window needs a monotonic datetime-like index, so sort
# after indexing by timestamp (the timestamp column itself is kept).
filtered_df = filtered_df.set_index('timestamps_UTC', drop=False).sort_index()
# Average only numeric columns: recent pandas versions raise on non-numeric
# columns (such as the retained timestamp column) instead of silently dropping
# them from the rolling result.
# NOTE(review): 'Heading' is averaged arithmetically; if it is an angle in
# degrees, values on either side of the 0/360 wrap will average incorrectly.
smoothed_df = filtered_df.select_dtypes(include='number').rolling(window_size).mean()


# Keep only rows between the specified dates
in_window = (smoothed_df.index >= start_date) & (smoothed_df.index <= end_date)
df_to_plot = smoothed_df[in_window]


# Plotly line plot for speed over time
fig_speed = px.line(df_to_plot, x=df_to_plot.index, y='Speed', title='Speed Over Time')
fig_speed.update_xaxes(title_text='Time')
fig_speed.update_yaxes(title_text='Speed (km/h)')

# Plotly line plot for heading over time
fig_heading = px.line(df_to_plot, x=df_to_plot.index, y='Heading', title='Heading Over Time')
fig_heading.update_xaxes(title_text='Time')
fig_heading.update_yaxes(title_text='Heading (degrees)')

# Show the plots
fig_speed.show()
fig_heading.show()