In [1]:
import numpy as np
import os
import pandas as pd
import pickle

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight

from tqdm.auto import tqdm

In [2]:
# Notebook configuration. These module-level names are read directly by
# series_to_data() and preprocess_data() below.
# NOTE(review): base_dir is an absolute, machine-specific path; it has to be
# edited before the notebook can run anywhere else.
base_dir = '/home/mma6789/Stuff/Studies/sem3/ms_project' #@param {type: 'string'}

# Name of the sub-folder under data/raw and data/processed to work on.
dataset = 'US-101' #@param ['I-80', 'US-101']
# Both horizons are integer-divided by 100 in series_to_data() to obtain row
# counts (presumably milliseconds with one sample every 100 ms — TODO confirm).
t_o = 5000 #@param [3000, 4000, 5000] -> observation horizon
t_p = 3500 #@param [2000, 2500, 3000, 3500] -> prediction horizon

In [3]:
def series_to_data(series, observation_ms=None, prediction_ms=None):
    """Turn one vehicle's sample sequence into labelled sliding windows.

    Parameters
    ----------
    series : sequence of rows (anything ``pd.DataFrame`` accepts)
        Rows of five values each; the value at position 4 is the lane
        feature used to derive the labels.
    observation_ms : int, optional
        Observation horizon. Defaults to the notebook-level ``t_o``.
    prediction_ms : int, optional
        Prediction horizon. Defaults to the notebook-level ``t_p``.

    Both horizons are converted to row counts by integer division by 100
    (one row is presumably a 100 ms sample — TODO confirm).

    Returns
    -------
    pandas.DataFrame
        One row per complete window. Columns are ``x{j}(t-{i})`` for every
        lag ``i`` from the oldest observed row down to 1 and every feature
        ``j`` in 0..4, followed by the one-hot label columns ``LCL``, ``LK``
        and ``LCR``. Rows that lack a full history or a target are dropped;
        the surviving rows keep their original positional index.
    """
    # Fall back to the notebook configuration when no explicit horizon is given.
    if observation_ms is None:
        observation_ms = t_o
    if prediction_ms is None:
        prediction_ms = t_p

    observed_instances = observation_ms // 100
    prediction_skip = prediction_ms // 100

    df = pd.DataFrame(series)

    # Future value of the lane feature, aligned onto the current row.
    # NOTE(review): the target is taken observed_instances + prediction_skip
    # rows after the current row, while the newest feature is one row before
    # it — confirm this gap is the intended prediction horizon.
    out = (
        df.filter([4], axis=1)
        .shift(-(observed_instances + prediction_skip))
        .rename(columns={4: 'y'})
    )

    cols, names = [], []

    # Lagged copies of all five features, oldest lag first.
    for i in range(observed_instances, 0, -1):
        cols.append(df.shift(i))
        names += [('x%d(t-%d)' % (j, i)) for j in range(5)]

    cols.append(out)
    names.append('y')

    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # Rows near either end of the series lack a full window or a target.
    agg = agg.dropna()

    # One-hot label from comparing the future lane value with the most recent
    # observed one. Vectorised; yields the same values as a per-row
    # if/elif/else because no NaN is left after dropna().
    future_lane = agg['y']
    last_lane = agg['x4(t-1)']
    agg = agg.assign(
        LCL=(future_lane < last_lane).astype(float),
        LK=(future_lane == last_lane).astype(float),
        LCR=(future_lane > last_lane).astype(float),
    )

    return agg.drop(columns=['y'])

def preprocess_data():
    """Build the windowed train/valid/test arrays and class weights from raw CSVs.

    Reads every ``.csv`` in ``{base_dir}/data/raw/{dataset}``, min-max scales
    the five feature columns per file, converts each vehicle's rows into
    labelled windows with ``series_to_data`` and concatenates the results.
    It then writes four ``.npy`` files, all prefixed ``{t_o}_{t_p}_``, into
    ``{base_dir}/data/processed/{dataset}``:

    * ``class_weights`` - 'balanced' weights for the LCL / LK / LCR labels,
    * ``train`` / ``valid`` / ``test`` - a seeded shuffle split 70 / 15 / 15.

    Relies on the notebook-level ``base_dir``, ``dataset``, ``t_o`` and ``t_p``.

    Raises
    ------
    ValueError
        If the raw-data folder contains no ``.csv`` file.
    """
    # structure of dataset folders, may be different for other implementations
    raw_dir = f'{base_dir}/data/raw/{dataset}'
    processed_dir = f'{base_dir}/data/processed/{dataset}'
    feature_cols = ['Local_X', 'Local_Y', 'v_Vel', 'v_Acc', 'Lane_ID']
    label_cols = ['LCL', 'LK', 'LCR']

    # os.listdir order is arbitrary; sort so the seeded shuffle below yields
    # the same split on every machine.
    csv_files = sorted(f for f in os.listdir(raw_dir) if f.endswith('.csv'))
    if not csv_files:
        raise ValueError(f'No .csv files found in {raw_dir}')

    # Create the output folder up front so a missing directory cannot fail
    # the run after all the processing is done.
    os.makedirs(processed_dir, exist_ok=True)

    df_list = []

    for csv in tqdm(csv_files, desc='Files'):
        df = pd.read_csv(os.path.join(raw_dir, csv))
        # .copy() so the scaled columns are written to an independent frame.
        df = df[['Vehicle_ID'] + feature_cols].copy()

        # Scaling is fitted separately for each file.
        scaler = MinMaxScaler(feature_range=(0, 1))
        df[feature_cols] = scaler.fit_transform(df[feature_cols])

        # sort=False keeps vehicles in order of first appearance and rows in
        # file order, without re-scanning the whole frame for every vehicle.
        # NOTE(review): rows are assumed to already be in time order within
        # each vehicle; no frame/time column is used to sort them — confirm.
        vehicle_groups = df.groupby('Vehicle_ID', sort=False)
        for _, vehicle_rows in tqdm(vehicle_groups, total=vehicle_groups.ngroups, desc='Vehicles'):
            df_list.append(series_to_data(vehicle_rows[feature_cols].to_numpy()))

    processed_data = pd.concat(df_list, ignore_index=True)

    # Recover the class name from the one-hot label columns.
    class_values = processed_data[label_cols].idxmax(axis=1)
    # Newer scikit-learn releases validate `classes` as an ndarray.
    class_weights = compute_class_weight('balanced', classes=np.array(label_cols), y=class_values)

    np.save(f'{processed_dir}/{t_o}_{t_p}_class_weights.npy', class_weights)

    # Seeded shuffle, then a 70 / 15 / 15 split by position.
    shuffled = processed_data.sample(frac=1, random_state=42)
    train_end = int(0.7 * len(shuffled))
    valid_end = int(0.85 * len(shuffled))

    train = shuffled.iloc[:train_end].to_numpy()
    valid = shuffled.iloc[train_end:valid_end].to_numpy()
    test = shuffled.iloc[valid_end:].to_numpy()

    np.save(f'{processed_dir}/{t_o}_{t_p}_train.npy', train)
    np.save(f'{processed_dir}/{t_o}_{t_p}_valid.npy', valid)
    np.save(f'{processed_dir}/{t_o}_{t_p}_test.npy', test)

In [4]:
# Run the full preprocessing pipeline when this cell/script is executed
# directly. It reads the raw CSVs and writes the .npy outputs for the
# configured dataset and horizons.
if __name__ == '__main__':
    preprocess_data()

Files:   0%|          | 0/3 [00:00<?, ?it/s]

Vehicles:   0%|          | 0/1993 [00:00<?, ?it/s]

Vehicles:   0%|          | 0/1298 [00:00<?, ?it/s]

Vehicles:   0%|          | 0/1533 [00:00<?, ?it/s]