# Splitting the dataset 

### Import Libraries

In [1]:
import numpy as np
import pandas as pd

### Function to get the DataFrame for a specific driver and path order

In [3]:
def driver_dataframe(df, path_order, driver):
    """
    Build the time-indexed DataFrame for one driver on one path order.

    Rows matching both ``path_order`` and ``driver`` are selected, ordered by
    'Time(s)', stripped of the 'Class' and 'PathOrder' columns, and finally
    averaged per distinct 'Time(s)' value.

    Parameters:
    df (DataFrame): The original DataFrame; it is not modified.
    path_order (int or str): Value to match in the 'PathOrder' column.
    driver (int or str): Value to match in the 'Class' column.

    Returns:
    DataFrame: One row per distinct 'Time(s)' value (which becomes the index),
    holding the mean of every remaining column.
    """
    on_path = df['PathOrder'] == path_order
    by_driver = df['Class'] == driver
    selected = df.loc[on_path & by_driver]

    # Order chronologically and discard the two identifier columns
    features = selected.sort_values(by='Time(s)').drop(columns=['Class', 'PathOrder'])

    # Collapse rows sharing a timestamp into a single averaged row
    return features.groupby('Time(s)').mean()

### Function to split the dataset into training and testing sets per path order and driver

In [4]:

def split_dataset(df, drop_columns, train_ratio=0.8, random_seed=123):
    """
    Splits the dataset into training and testing sets based on driver and path order.

    For every (PathOrder, Class) combination the rows are reduced by
    ``driver_dataframe`` to one averaged row per 'Time(s)' value, in ascending
    time order. The first ``int(train_ratio * n)`` of those rows go to the
    training set and the remainder to the testing set, so the split is
    positional (earlier samples train, later samples test), not random.

    Because the per-group frames are indexed by 'Time(s)' and the results are
    concatenated with ``ignore_index=True``, 'Time(s)' is not present in the
    returned DataFrames; rows are renumbered from 0.

    Parameters:
    df (DataFrame): The original DataFrame. It is not modified.
    drop_columns (list of str): List of columns to be dropped from the DataFrame.
    train_ratio (float): Ratio of each group to be used for training. Default is 0.8.
    random_seed (int): Seed applied to NumPy's global random number generator.
        It does not influence the split itself. Default is 123.

    Returns:
    DataFrame: Training set.
    DataFrame: Testing set.
    """
    # Kept for backward compatibility: callers may rely on the global NumPy
    # RNG being seeded after this call, even though nothing below is random.
    np.random.seed(random_seed)

    # Drop unimportant columns (returns a new frame, the input stays intact)
    df_copy = df.drop(columns=drop_columns, inplace=False)

    train_parts = []
    test_parts = []

    # Process every path / driver combination in order of first appearance
    for path in df_copy['PathOrder'].unique():
        for driver in df_copy['Class'].unique():
            driver_df = driver_dataframe(df_copy, path, driver)
            # The helper removes the identifier columns; put them back
            driver_df['PathOrder'] = path
            driver_df['Class'] = driver

            # Number of leading (earliest) rows assigned to training
            split_index = int(train_ratio * len(driver_df))

            train_parts.append(driver_df.iloc[:split_index])
            test_parts.append(driver_df.iloc[split_index:])

    return _concat_non_empty(train_parts), _concat_non_empty(test_parts)


def _concat_non_empty(frames):
    """Concatenate the non-empty frames into one DataFrame numbered from 0."""
    # Empty pieces arise for path/driver pairs with no rows and for splits that
    # round down to zero rows. Passing them to pd.concat triggers a
    # FutureWarning in recent pandas, so they are filtered out explicitly.
    non_empty = [frame for frame in frames if not frame.empty]
    if non_empty:
        return pd.concat(non_empty, ignore_index=True)
    if frames:
        # Everything was empty: keep the column layout, return zero rows
        return frames[0].iloc[0:0].reset_index(drop=True)
    # No path/driver combinations at all; pd.concat([]) would raise ValueError
    return pd.DataFrame()

In [6]:
# Columns removed from the raw data before splitting
drop_columns = [
    'Wheel_velocity_rear_right-hand',
    'Wheel_velocity_rear_left-hand',
    'Wheel_velocity_front_right-hand',
    'Wheel_velocity_front_left-hand',
    'Absolute_throttle_position',
    'Flywheel_torque',
    'Flywheel_torque_(after_torque_interventions)',
    'Torque_converter_turbine_speed_-_Unfiltered',
    'Filtered_Accelerator_Pedal_value',
    'Inhibition_of_engine_fuel_cut_off',
    'Fuel_Pressure',
    'Torque_scaling_factor(standardization)',
    'Standard_Torque_Ratio',
    'Requested_spark_retard_angle_from_TCU',
    'Target_engine_speed_used_in_lock-up_module',
    'Glow_plug_control_request',
]

In [7]:
# Load the raw driving-data CSV; the path is relative to the current working directory
data = pd.read_csv("../data/raw/Driving Data(KIA SOUL)_(150728-160714)_(10 Drivers_A-J).csv")

In [8]:
# Split using the function defaults (train_ratio=0.8, random_seed=123)
df_train, df_test = split_dataset(data, drop_columns)

In [9]:
# (rows, columns) of the training set
df_train.shape

(37059, 37)

In [10]:
# (rows, columns) of the testing set
df_test.shape

(9277, 37)

In [11]:
# Preview the first rows of the training set
df_train.head()

Unnamed: 0,Fuel_consumption,Accelerator_Pedal_value,Throttle_position_signal,Short_Term_Fuel_Trim_Bank1,Intake_air_pressure,Engine_soacking_time,Engine_in_fuel_cut_off,Long_Term_Fuel_Trim_Bank1,Engine_speed,Engine_torque_after_correction,...,Vehicle_speed,Acceleration_speed_-_Longitudinal,Indication_of_brake_switch_ON/OFF,Master_cylinder_pressure,Calculated_road_gradient,Acceleration_speed_-_Lateral,Steering_wheel_speed,Steering_wheel_angle,PathOrder,Class
0,595.2,0.0,7.55,0.0,55.0,129.0,0.0,-0.4,974.0,12.9,...,0.0,-4.25,1.0,166.8,0.0,-8.1,224.0,-0.9,1,A
1,537.6,0.0,6.6,0.0,39.0,129.0,0.0,-0.4,1161.5,6.05,...,0.0,-3.95,1.0,165.5,0.0,-4.2,0.0,-1.05,1,A
2,326.4,0.0,5.9,0.0,33.5,129.0,0.0,-0.4,1098.0,6.05,...,0.0,0.05,1.0,0.85,0.0,-0.2,0.0,-1.0,1,A
3,268.8,0.0,5.65,0.0,32.0,129.0,0.0,-0.4,995.0,6.65,...,0.0,0.05,1.0,0.9,0.0,-0.2,0.0,-1.0,1,A
4,275.2,0.0,6.15,0.0,34.5,129.0,0.0,-0.8,947.0,8.0,...,0.0,0.05,1.0,0.9,0.0,-0.2,0.0,-0.95,1,A
