In [19]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

# datasets are the individual .csv's
# function to load CSV files from ./archive and return a dictionary.
# The key is the file name, and the value is the DataFrame
def load_datasets(folder_path="./archive"):
    """Load every CSV file found directly inside a folder.

    Parameters
    ----------
    folder_path : str or path-like, default "./archive"
        Directory to scan. The argument is honoured; earlier versions
        overwrote it with "./archive", which is kept as the default so
        existing calls behave the same.

    Returns
    -------
    dict
        Maps each CSV file name (including the ".csv" extension) to the
        DataFrame read from it. Empty if the folder holds no CSV files.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when the folder is missing or is
        not a directory (e.g. ``FileNotFoundError``).
    """
    data_dict = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # dictionary (and any listing printed from it) reproducible across runs.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".csv"):
            continue
        full_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(full_path):
            # A sub-directory whose name ends in ".csv" cannot be parsed.
            continue
        data_dict[file_name] = pd.read_csv(full_path)
    return data_dict



# function explores basic info about a selected dataset, such as its first 5 rows, column names, data types, etc.
def explore_dataset(df):
    """Print a quick textual overview of a DataFrame.

    The overview consists of five headed sections, in this order: the
    first five rows, the column index, the per-column dtypes, the count
    of missing values per column, and ``describe()`` summary statistics.
    A final blank-line print closes the report.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset to summarise. It is only read, never modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # Each report is wrapped in a lambda so it is computed only after its
    # heading has been printed, exactly one section at a time.
    sections = [
        ("\n---First 5 rows---", lambda: df.head()),
        ("\n---Column names---", lambda: df.columns),
        ("\n---Data types---", lambda: df.dtypes),
        ("\n---Missing values---", lambda: df.isnull().sum()),
        ("\n---Summary stats---", lambda: df.describe()),
    ]
    for heading, build_report in sections:
        print(heading)
        print(build_report())
    print('\n')

# function lists all available datasets (i.e., print the keys from the `data_dict`).
def list_available_datasets(data_dict):
    """Print a heading followed by every key of ``data_dict``, one per line.

    Parameters
    ----------
    data_dict : dict
        Mapping whose keys are the dataset (file) names to list; the
        values are not inspected.

    Returns
    -------
    None
        The listing is written to standard output.
    """
    # Heading first, then one line per key in the dictionary's own order.
    lines = ["\n---Available datasets---\n"]
    lines.extend(str(name) for name in data_dict.keys())
    print("\n".join(lines))

# select a dataset from the available list and explore it
def select_and_explore_dataset(data_dict):
    """Interactively pick one loaded dataset and print its overview.

    Lists the keys of ``data_dict``, prompts the user for a name, and
    passes the matching DataFrame to ``explore_dataset``. The typed name
    must match a key exactly; otherwise a "does not exist" message is
    printed and nothing is explored.

    Parameters
    ----------
    data_dict : dict
        Maps dataset names to DataFrames.

    Returns
    -------
    None
    """
    list_available_datasets(data_dict)
    ds_choice = input("\nSelect a dataset to explore: ")

    # Guard clause: bail out early when the typed name is not a known key.
    if ds_choice not in data_dict.keys():
        print("---The dataset you selected does not exist.---")
        return

    print(f"\n---Exploring {ds_choice}---")
    selected_df = data_dict[ds_choice]
    explore_dataset(selected_df)


# Left-join `results.csv` with `races.csv` on their shared `raceId` column, keeping every row of the results.
def merge_results_races(data_dict):
    """Attach race details to every race result.

    Parameters
    ----------
    data_dict : dict
        Must contain the keys "results.csv" and "races.csv", each mapping
        to a DataFrame that has a ``raceId`` column.

    Returns
    -------
    pandas.DataFrame
        Left join of the results onto the races by ``raceId``: every
        results row is kept, and race columns are missing where no race
        matches. Column names present in both inputs (other than
        ``raceId``) receive pandas' default ``_x`` / ``_y`` suffixes.

    Raises
    ------
    KeyError
        If either expected key is absent from ``data_dict``.
    """
    results_df, races_df = (
        data_dict[file_name] for file_name in ("results.csv", "races.csv")
    )
    return pd.merge(left=results_df, right=races_df, how="left", on='raceId')

In [20]:
def clean_data(dataframe):
    """Clean a DataFrame in place and return it.

    Three steps are applied, in order:

    1. Every cell equal to the literal two-character string ``\\N`` is
       replaced with ``pd.NA``.
    2. A fixed set of columns is dropped; names that are not present are
       silently skipped.
    3. Rows with a missing value in any of ``raceId``, ``driverId``,
       ``grid``, ``laps`` or ``year`` are removed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The frame to clean. NOTE: it is modified in place, so the caller's
        own variable changes too.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after cleaning.

    Raises
    ------
    KeyError
        If any of the five essential columns is absent (raised by
        ``dropna``).
    """
    # Columns discarded from the frame, grouped only for readability.
    # 'time_x' / 'time_y' are presumably the suffixed `time` columns produced
    # by an upstream merge — confirm against the cell that builds the input.
    result_columns = [
        'resultId', 'position', 'positionOrder', 'time_x', 'fastestLapTime',
        'fastestLapSpeed', 'number', 'rank',
    ]
    race_columns = [
        'url',
        'fp1_date', 'fp1_time', 'fp2_date', 'fp2_time',
        'fp3_date', 'fp3_time', 'quali_date', 'quali_time',
        'sprint_date', 'sprint_time', 'date', 'time_y',
    ]
    # A row is only kept when every one of these fields is present.
    required_columns = ['raceId', 'driverId', 'grid', 'laps', 'year']

    dataframe.replace({'\\N': pd.NA}, inplace=True)
    dataframe.drop(
        columns=result_columns + race_columns,
        errors='ignore',  # tolerate frames that lack some of the columns
        inplace=True,
    )
    dataframe.dropna(subset=required_columns, inplace=True)
    return dataframe

In [22]:
# NOTE(review): `merged_data` is not defined in any cell visible here (the
# execution counts jump from 20 to 22) — confirm an earlier cell creates it,
# otherwise this cell fails under Restart Kernel -> Run All.
# clean_data modifies its argument in place and returns it, so after this line
# `cleaned_data` and `merged_data` refer to the same DataFrame.
cleaned_data = clean_data(merged_data)

# Heading and a five-row preview of the cleaned frame, one per line.
print(
    "\nAfter further cleaning (updated for predictive purposes):",
    cleaned_data.head(),
    sep="\n",
)


After further cleaning (updated for predictive purposes):
   raceId  driverId  constructorId  grid  points  laps milliseconds  \
0      18         1              1     1    10.0    58      5690616   
1      18         2              2     5     8.0    58      5696094   
2      18         3              3     7     6.0    58      5698779   
3      18         4              4    11     5.0    58      5707797   
4      18         5              1     3     4.0    58      5708630   

  fastestLap  statusId  year  round  circuitId                   name  
0         39         1  2008      1          1  Australian Grand Prix  
1         41         1  2008      1          1  Australian Grand Prix  
2         41         1  2008      1          1  Australian Grand Prix  
3         58         1  2008      1          1  Australian Grand Prix  
4         43         1  2008      1          1  Australian Grand Prix  
