# Data Cleaining

While we collected and engineered the features, it is still real world data, so there is some missing values. Primarily, it occured if a driver did not complete a session or sessions, but still participated in the final race. Since the races only have 20 drivers each, and eventually our targets will be limited and specific to the races, we did not want to throw out the data. We are going to replace the NaNs with the means of the columns.

In [2]:
#| label: import
import pandas as pd
import json
import numpy as np
import os

In [10]:

def list_files_in_folder(folder_path):
    files = []
    for entry in os.listdir(folder_path):
        full_path = os.path.join(folder_path, entry)
        if os.path.isfile(full_path):
            files.append(full_path)
    return files

raw_data_list = list_files_in_folder("data/raw_data")
#print(raw_data_list)

I realized that the row labels were off (weather before car data, not vice versa), so Im going to fix those at the same time as filling in the NaNs.

In [7]:
colnames = ['meeting_key', 'driver_num',
    # P1
    'min_lap_p1', 'max_lap_p1', 'avg_lap_p1', 'num_laps_p1', 'num_stints_p1', 'num_pits_p1', 'avg_pit_time_p1',
    'did_rain_p1', 'max_wind_p1', 'avg_air_temp_p1', 'avg_track_temp_p1',
    'max_brake_p1', 'min_rpm_p1', 'max_rpm_p1', 'avg_rpm_p1', 'max_throttle_p1', 'avg_throttle_p1',
    'min_speed_p1', 'max_speed_p1', 'avg_speed_p1',

    # P2
    'min_lap_p2', 'max_lap_p2', 'avg_lap_p2', 'num_laps_p2', 'num_stints_p2', 'num_pits_p2', 'avg_pit_time_p2',
    'did_rain_p2', 'max_wind_p2', 'avg_air_temp_p2', 'avg_track_temp_p2',
    'max_brake_p2', 'min_rpm_p2', 'max_rpm_p2', 'avg_rpm_p2', 'max_throttle_p2', 'avg_throttle_p2',
    'min_speed_p2', 'max_speed_p2', 'avg_speed_p2',

    # P3
    'min_lap_p3', 'max_lap_p3', 'avg_lap_p3', 'num_laps_p3', 'num_stints_p3', 'num_pits_p3', 'avg_pit_time_p3',
    'did_rain_p3', 'max_wind_p3', 'avg_air_temp_p3', 'avg_track_temp_p3',
    'max_brake_p3', 'min_rpm_p3', 'max_rpm_p3', 'avg_rpm_p3', 'max_throttle_p3', 'avg_throttle_p3',
    'min_speed_p3', 'max_speed_p3', 'avg_speed_p3',

    # Q
    'min_lap_q', 'max_lap_q', 'avg_lap_q', 'num_laps_q', 'num_stints_q', 'num_pits_q', 'avg_pit_time_q',
    'did_rain_q', 'max_wind_q', 'avg_air_temp_q', 'avg_track_temp_q',
    'max_brake_q', 'min_rpm_q', 'max_rpm_q', 'avg_rpm_q', 'max_throttle_q', 'avg_throttle_q',
    'min_speed_q', 'max_speed_q', 'avg_speed_q'
]


In [11]:
# Output folder
output_dir = 'data/fixed_data'
os.makedirs(output_dir, exist_ok=True)

for f in raw_data_list:
    df = pd.read_csv(f) 

    if len(df.columns) != len(colnames):
        print(f"Column mismatch in {f} (expected {len(colnames)} columns, got {len(df.columns)})")
        continue

    # fix the column names
    df.columns = colnames

    # fill in the missing NaNs with the means for the numeric columns
    df_filled = df.fillna(df.mean(numeric_only=True))

    filename = os.path.basename(f)  
    output_path = os.path.join(output_dir, filename)
    print(output_path)

    df_filled.to_csv(output_path, index=False)

data/fixed_data/racing_profiles_1229.csv
data/fixed_data/racing_profiles_1214.csv
data/fixed_data/racing_profiles_1217.csv
data/fixed_data/racing_profiles_1212.csv
data/fixed_data/racing_profiles_1210.csv
data/fixed_data/racing_profiles_1238.csv
data/fixed_data/racing_profiles_1211.csv
data/fixed_data/racing_profiles_1248.csv
data/fixed_data/racing_profiles_1256.csv
data/fixed_data/racing_profiles_1241.csv
data/fixed_data/racing_profiles_1245.csv
data/fixed_data/racing_profiles_1250.csv
data/fixed_data/racing_profiles_1244.csv
data/fixed_data/racing_profiles_1252.csv
data/fixed_data/racing_profiles_1246.csv
data/fixed_data/racing_profiles_1142.csv
data/fixed_data/racing_profiles_1208.csv
data/fixed_data/racing_profiles_1220.csv
data/fixed_data/racing_profiles_1235.csv
data/fixed_data/racing_profiles_1143.csv
data/fixed_data/racing_profiles_1141.csv
data/fixed_data/racing_profiles_1223.csv
data/fixed_data/racing_profiles_1237.csv
data/fixed_data/racing_profiles_1236.csv
data/fixed_data/

Now Im going to combine them all into one dataset for later use.

In [15]:
df_list = []

fixed_data_list = list_files_in_folder("data/fixed_data")

for f in fixed_data_list:
    try:
        df = pd.read_csv(f)
        df_list.append(df)
    except Exception as e:
        print(f"Failed to load {f}: {e}")

# Combine all dataframes into one
combined_df = pd.concat(df_list, ignore_index=True)

# save to csv file
combined_df.to_csv("data/processed_data/all_races.csv", index=False)