# Splitting the dataset 

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import warnings

# Silence every warning for the rest of the session. Note that this also hides
# pandas deprecation and chained-assignment warnings raised by later cells.
warnings.filterwarnings("ignore")
# Display all columns of a DataFrame instead of truncating wide tables.
pd.set_option('display.max_columns', None)

### Function to get the DataFrame for a specific driver and path order

In [2]:
def driver_dataframe(df, path_order, driver):
    """
    Build the per-timestamp feature table for one driver on one path.

    Rows matching both `path_order` and `driver` are selected, ordered by
    'Time(s)', stripped of the 'Class' and 'PathOrder' columns, and finally
    averaged per distinct 'Time(s)' value so that each timestamp occurs once.

    Parameters:
    df (DataFrame): Source data with 'PathOrder', 'Class' and 'Time(s)' columns.
    path_order (int): Value of 'PathOrder' to keep.
    driver (str): Value of 'Class' to keep.

    Returns:
    DataFrame: One row per 'Time(s)' value (which becomes the index) holding
               the mean of every remaining column. `df` itself is not modified.
    """
    # Both conditions are applied at once; the selected rows keep their order.
    is_selected = (df['PathOrder'] == path_order) & (df['Class'] == driver)
    ordered = df[is_selected].sort_values(by='Time(s)')

    # The identifier columns are constant within the selection, so they are
    # removed before averaging.
    features = ordered.drop(columns=['Class', 'PathOrder'])

    # Collapse duplicate timestamps into a single averaged row.
    return features.groupby('Time(s)').mean()

### Function to split the dataset

In [3]:
def split_dataset(df, train_ratio=0.8, random_seed=123):
    """
    Split the data into train and test sets, segment by segment.

    For every combination of 'PathOrder' and 'Class' value found in `df`, the
    time-ordered table produced by `driver_dataframe` is cut at
    `int(train_ratio * number_of_rows)`: the leading rows go to the training
    set and the remaining rows to the testing set.

    Parameters:
    df (DataFrame): The original DataFrame.
    train_ratio (float): Fraction of each segment assigned to training. Default is 0.8.
    random_seed (int): Seed applied to NumPy's global random generator. Default is 123.
                       The split itself draws no random numbers.

    Returns:
    DataFrame: Training set.
    DataFrame: Testing set.
    Both frames carry a fresh 0..n-1 index; the 'Time(s)' index of the
    individual segments is discarded when they are concatenated.
    """
    np.random.seed(random_seed)

    # Work on a private copy of the caller's frame.
    working = df.copy()

    train_parts = []
    test_parts = []

    path_values = working['PathOrder'].unique()
    driver_values = working['Class'].unique()

    for path in path_values:
        for driver in driver_values:
            # Re-attach the identifiers that driver_dataframe removes.
            segment = driver_dataframe(working, path, driver).assign(
                PathOrder=path,
                Class=driver,
            )

            # Leading rows -> train, trailing rows -> test.
            cut = int(train_ratio * len(segment))
            train_parts.append(segment.iloc[:cut])
            test_parts.append(segment.iloc[cut:])

    train_set = pd.concat(train_parts, ignore_index=True)
    test_set = pd.concat(test_parts, ignore_index=True)
    return train_set, test_set

In [4]:
# Columns kept from the raw data: the feature columns plus 'PathOrder',
# 'Class' and 'Time(s)', which the splitting helpers use to filter and order rows.
selected_features = [
    'Engine_soacking_time',
    'Long_Term_Fuel_Trim_Bank1',
    'Engine_coolant_temperature.1',
    'Torque_of_friction',
    'Activation_of_Air_compressor',
    'Intake_air_pressure',
    'PathOrder',
    'Accelerator_Pedal_value',
    'Master_cylinder_pressure',
    'Vehicle_speed',
    'Engine_coolant_temperature',
    'Maximum_indicated_engine_torque',
    'Throttle_position_signal',
    'Current_Gear',
    'Calculated_LOAD_value',
    'Engine_torque_after_correction',
    'Engine_torque',
    'Class',
    'Time(s)',
]

In [5]:
# Load the raw driving dataset from its CSV file into a DataFrame.
data = pd.read_csv("../data/raw/Driving Data(KIA SOUL)_(150728-160714)_(10 Drivers_A-J).csv")

In [6]:
# Keep only the columns listed in `selected_features`, in that order.
selected_df = data[selected_features]

In [7]:
# Preview the first rows of the selected columns.
selected_df.head()

Unnamed: 0,Engine_soacking_time,Long_Term_Fuel_Trim_Bank1,Engine_coolant_temperature.1,Torque_of_friction,Activation_of_Air_compressor,Intake_air_pressure,PathOrder,Accelerator_Pedal_value,Master_cylinder_pressure,Vehicle_speed,Engine_coolant_temperature,Maximum_indicated_engine_torque,Throttle_position_signal,Current_Gear,Calculated_LOAD_value,Engine_torque_after_correction,Engine_torque,Class,Time(s)
0,3,-0.8,87,9.0,0,33,1,0.0,325.5,0,95,58.2,5.2,0,23.9,5.5,5.5,A,1
1,3,-0.8,87,7.8,0,40,1,0.0,0.9,0,95,53.9,6.1,0,30.6,7.0,7.0,A,2
2,3,-0.8,87,7.4,0,41,1,0.0,0.9,0,95,53.1,5.2,0,31.8,7.0,7.0,A,3
3,3,-0.8,87,6.6,0,38,1,0.0,0.9,0,95,53.1,4.7,0,29.0,7.0,7.0,A,4
4,3,-0.8,87,11.3,1,40,1,0.0,0.9,0,95,53.5,5.7,0,30.2,8.2,8.2,A,5


In [8]:
# Split every (PathOrder, Class) segment: the leading 80% of its rows go to
# the training set and the remaining rows to the testing set.
df_train, df_test = split_dataset(selected_df, 0.8)

In [9]:
# (rows, columns) of the training split.
df_train.shape

(37059, 18)

In [10]:
# (rows, columns) of the testing split.
df_test.shape

(9277, 18)

In [11]:

def driver_dataframe_using_window_size(df, path_order, driver, window_size=60):
    """
    Compute rolling-window statistics for one driver on one path.

    Rows matching both `path_order` and `driver` are selected in their
    existing order, the 'Class' and 'PathOrder' columns are removed, and for
    every remaining column the rolling mean, median and standard deviation
    over `window_size` consecutive rows are computed.

    Parameters:
    df (DataFrame): Source data with 'PathOrder' and 'Class' columns; all other
                    columns are treated as features.
    path_order (int): Value of 'PathOrder' to keep.
    driver (str): Value of 'Class' to keep.
    window_size (int): Number of consecutive rows in each rolling window.
                       Must be at least 1.

    Returns:
    DataFrame: Rolling statistics only (the raw feature columns are NOT part
               of the result), named '<feature>_mean', '<feature>_median' and
               '<feature>_std', in that column order. The first
               `window_size - 1` rows, whose windows are incomplete, are
               dropped, so a selection with fewer than `window_size` rows
               yields an empty frame. `df` itself is not modified.

    Raises:
    ValueError: If `window_size` is smaller than 1.
    """
    if window_size < 1:
        # A zero-length window would otherwise slip through and return a
        # meaningless trailing row (iloc[-1:]).
        raise ValueError("window_size must be at least 1")

    # Select the segment for this driver and path.
    selected = df[(df['PathOrder'] == path_order) & (df['Class'] == driver)]

    # Non-inplace drop: avoids mutating a filtered slice of the caller's frame.
    features = selected.drop(columns=['Class', 'PathOrder'])

    # Build the rolling view once and reuse it for every statistic.
    windows = features.rolling(window=window_size)
    stats = pd.concat(
        [
            windows.mean().add_suffix('_mean'),
            windows.median().add_suffix('_median'),
            windows.std().add_suffix('_std'),
        ],
        axis=1,
    )

    # The first window_size - 1 rows do not have a full window yet.
    return stats.iloc[window_size - 1:]


In [12]:
def prepare_all_drivers(df, random_seed=123, window_size=60):
    """
    Compute rolling-window statistics for every driver on every path.

    For each combination of 'PathOrder' and 'Class' value found in `df`, the
    rolling statistics returned by `driver_dataframe_using_window_size` are
    computed, labelled with their 'PathOrder' and 'Class', and stacked into a
    single frame.

    Parameters:
    df (DataFrame): Data with 'PathOrder' and 'Class' columns plus feature columns.
    random_seed (int): Seed applied to NumPy's global random generator
                       (default is 123). This function itself draws no
                       random numbers.
    window_size (int): Rolling window length forwarded to
                       `driver_dataframe_using_window_size` (default is 60).

    Returns:
    DataFrame: Rolling statistics of all segments with 'PathOrder' and 'Class'
               appended as the last two columns and a fresh 0..n-1 index.

    Raises:
    ValueError: If `df` contains no rows, because pandas cannot concatenate
                an empty list of frames.
    """
    # Kept so that code running after this call sees the same RNG state as before.
    np.random.seed(random_seed)

    # No defensive copy is needed: the helper filters `df` into new frames
    # and never writes back to it.
    path_values = df['PathOrder'].unique()
    driver_values = df['Class'].unique()

    # One frame of rolling statistics per (path, driver) segment, with the
    # identifiers re-attached because the helper drops them.
    segments = [
        driver_dataframe_using_window_size(df, path, driver, window_size).assign(
            PathOrder=path,
            Class=driver,
        )
        for path in path_values
        for driver in driver_values
    ]

    return pd.concat(segments, ignore_index=True)

In [13]:
# Compute the rolling-window statistics separately for the training and the
# testing split, so no window spans rows from both.
df_train_preprocessed = prepare_all_drivers(df_train)
df_test_preprocessed = prepare_all_drivers(df_test)

In [14]:
# Preview the first rows of the preprocessed training data.
df_train_preprocessed.head()

Unnamed: 0,Engine_soacking_time_mean,Long_Term_Fuel_Trim_Bank1_mean,Engine_coolant_temperature.1_mean,Torque_of_friction_mean,Activation_of_Air_compressor_mean,Intake_air_pressure_mean,Accelerator_Pedal_value_mean,Master_cylinder_pressure_mean,Vehicle_speed_mean,Engine_coolant_temperature_mean,Maximum_indicated_engine_torque_mean,Throttle_position_signal_mean,Current_Gear_mean,Calculated_LOAD_value_mean,Engine_torque_after_correction_mean,Engine_torque_mean,Engine_soacking_time_median,Long_Term_Fuel_Trim_Bank1_median,Engine_coolant_temperature.1_median,Torque_of_friction_median,Activation_of_Air_compressor_median,Intake_air_pressure_median,Accelerator_Pedal_value_median,Master_cylinder_pressure_median,Vehicle_speed_median,Engine_coolant_temperature_median,Maximum_indicated_engine_torque_median,Throttle_position_signal_median,Current_Gear_median,Calculated_LOAD_value_median,Engine_torque_after_correction_median,Engine_torque_median,Engine_soacking_time_std,Long_Term_Fuel_Trim_Bank1_std,Engine_coolant_temperature.1_std,Torque_of_friction_std,Activation_of_Air_compressor_std,Intake_air_pressure_std,Accelerator_Pedal_value_std,Master_cylinder_pressure_std,Vehicle_speed_std,Engine_coolant_temperature_std,Maximum_indicated_engine_torque_std,Throttle_position_signal_std,Current_Gear_std,Calculated_LOAD_value_std,Engine_torque_after_correction_std,Engine_torque_std,PathOrder,Class
0,129.0,-0.073333,59.733333,19.954167,0.933333,59.125,1.5625,11.056667,3.925,61.666667,63.3025,8.945,5.566667,48.15,25.255833,25.253333,129.0,0.0,59.5,20.5,1.0,59.5,0.0,1.95,2.0,61.5,61.15,8.95,7.0,48.425,26.65,26.55,0.0,0.289867,0.384737,2.605979,0.251549,10.812197,2.847756,30.029674,5.114639,0.447845,4.325598,1.904382,2.80808,10.501307,7.898869,7.977537,1,A
1,129.0,-0.066667,59.758333,20.075,0.941667,58.983333,1.569167,8.300833,4.191667,61.658333,63.348333,8.9525,5.608333,48.016667,25.395833,25.435833,129.0,0.0,59.5,20.5,1.0,59.5,0.0,1.8,2.0,61.5,61.15,8.95,7.0,48.425,26.65,26.55,0.0,0.28681,0.416723,2.425771,0.22721,10.922286,2.844502,22.011446,5.319529,0.436586,4.343873,1.899675,2.741847,10.60133,7.749184,7.752906,1,A
2,129.0,-0.06,59.783333,20.166667,0.95,59.091667,1.601667,5.559167,4.441667,61.65,63.354167,8.975833,5.65,48.1175,25.636667,25.676667,129.0,0.0,59.5,20.5,1.0,59.5,0.0,1.7,2.0,61.5,61.15,8.95,7.0,48.425,26.65,26.55,0.0,0.283561,0.444997,2.197064,0.199576,10.751623,2.837401,7.676116,5.469555,0.424863,4.346977,1.878773,2.673314,10.445499,7.352123,7.354714,1,A
3,129.0,-0.06,59.808333,20.251667,0.958333,59.466667,1.741667,5.561667,4.683333,61.65,63.403333,9.073333,5.691667,48.483333,26.005,26.041667,129.0,0.0,59.5,20.5,1.0,59.5,0.0,1.7,2.0,61.5,61.15,8.95,7.0,48.425,26.85,26.75,0.0,0.283561,0.47023,1.930728,0.167019,10.223298,2.96155,7.67458,5.589008,0.424863,4.376167,1.86821,2.6023,9.965192,6.89351,6.893114,1,A
4,129.0,-0.06,59.833333,20.34,0.966667,60.05,1.920833,5.565833,4.925,61.65,63.5075,9.523333,5.733333,49.094167,26.4475,26.484167,129.0,0.0,59.5,20.5,1.0,61.0,0.0,1.7,2.25,61.5,61.15,8.95,7.0,49.675,26.95,27.15,0.0,0.283561,0.492887,1.58611,0.125774,9.609626,3.172088,7.672074,5.695542,0.424863,4.437932,3.536522,2.528594,9.472154,6.469108,6.466134,1,A


In [15]:
# (rows, columns) of the preprocessed training data.
df_train_preprocessed.shape

(35879, 50)

In [16]:
# (rows, columns) of the preprocessed testing data.
df_test_preprocessed.shape

(8097, 50)

In [17]:
# Persist both preprocessed splits as CSV files. With to_csv's default
# index=True the row index is written as an unnamed first column as well.
df_train_preprocessed.to_csv('../data/preprocessed/train_preprocessed.csv')
df_test_preprocessed.to_csv('../data/preprocessed/test_preprocessed.csv')

In [18]:
# Display the first rows of the preprocessed training data once more after saving.
df_train_preprocessed.head()

Unnamed: 0,Engine_soacking_time_mean,Long_Term_Fuel_Trim_Bank1_mean,Engine_coolant_temperature.1_mean,Torque_of_friction_mean,Activation_of_Air_compressor_mean,Intake_air_pressure_mean,Accelerator_Pedal_value_mean,Master_cylinder_pressure_mean,Vehicle_speed_mean,Engine_coolant_temperature_mean,Maximum_indicated_engine_torque_mean,Throttle_position_signal_mean,Current_Gear_mean,Calculated_LOAD_value_mean,Engine_torque_after_correction_mean,Engine_torque_mean,Engine_soacking_time_median,Long_Term_Fuel_Trim_Bank1_median,Engine_coolant_temperature.1_median,Torque_of_friction_median,Activation_of_Air_compressor_median,Intake_air_pressure_median,Accelerator_Pedal_value_median,Master_cylinder_pressure_median,Vehicle_speed_median,Engine_coolant_temperature_median,Maximum_indicated_engine_torque_median,Throttle_position_signal_median,Current_Gear_median,Calculated_LOAD_value_median,Engine_torque_after_correction_median,Engine_torque_median,Engine_soacking_time_std,Long_Term_Fuel_Trim_Bank1_std,Engine_coolant_temperature.1_std,Torque_of_friction_std,Activation_of_Air_compressor_std,Intake_air_pressure_std,Accelerator_Pedal_value_std,Master_cylinder_pressure_std,Vehicle_speed_std,Engine_coolant_temperature_std,Maximum_indicated_engine_torque_std,Throttle_position_signal_std,Current_Gear_std,Calculated_LOAD_value_std,Engine_torque_after_correction_std,Engine_torque_std,PathOrder,Class
0,129.0,-0.073333,59.733333,19.954167,0.933333,59.125,1.5625,11.056667,3.925,61.666667,63.3025,8.945,5.566667,48.15,25.255833,25.253333,129.0,0.0,59.5,20.5,1.0,59.5,0.0,1.95,2.0,61.5,61.15,8.95,7.0,48.425,26.65,26.55,0.0,0.289867,0.384737,2.605979,0.251549,10.812197,2.847756,30.029674,5.114639,0.447845,4.325598,1.904382,2.80808,10.501307,7.898869,7.977537,1,A
1,129.0,-0.066667,59.758333,20.075,0.941667,58.983333,1.569167,8.300833,4.191667,61.658333,63.348333,8.9525,5.608333,48.016667,25.395833,25.435833,129.0,0.0,59.5,20.5,1.0,59.5,0.0,1.8,2.0,61.5,61.15,8.95,7.0,48.425,26.65,26.55,0.0,0.28681,0.416723,2.425771,0.22721,10.922286,2.844502,22.011446,5.319529,0.436586,4.343873,1.899675,2.741847,10.60133,7.749184,7.752906,1,A
2,129.0,-0.06,59.783333,20.166667,0.95,59.091667,1.601667,5.559167,4.441667,61.65,63.354167,8.975833,5.65,48.1175,25.636667,25.676667,129.0,0.0,59.5,20.5,1.0,59.5,0.0,1.7,2.0,61.5,61.15,8.95,7.0,48.425,26.65,26.55,0.0,0.283561,0.444997,2.197064,0.199576,10.751623,2.837401,7.676116,5.469555,0.424863,4.346977,1.878773,2.673314,10.445499,7.352123,7.354714,1,A
3,129.0,-0.06,59.808333,20.251667,0.958333,59.466667,1.741667,5.561667,4.683333,61.65,63.403333,9.073333,5.691667,48.483333,26.005,26.041667,129.0,0.0,59.5,20.5,1.0,59.5,0.0,1.7,2.0,61.5,61.15,8.95,7.0,48.425,26.85,26.75,0.0,0.283561,0.47023,1.930728,0.167019,10.223298,2.96155,7.67458,5.589008,0.424863,4.376167,1.86821,2.6023,9.965192,6.89351,6.893114,1,A
4,129.0,-0.06,59.833333,20.34,0.966667,60.05,1.920833,5.565833,4.925,61.65,63.5075,9.523333,5.733333,49.094167,26.4475,26.484167,129.0,0.0,59.5,20.5,1.0,61.0,0.0,1.7,2.25,61.5,61.15,8.95,7.0,49.675,26.95,27.15,0.0,0.283561,0.492887,1.58611,0.125774,9.609626,3.172088,7.672074,5.695542,0.424863,4.437932,3.536522,2.528594,9.472154,6.469108,6.466134,1,A
