In [2]:
import pandas as pd
import numpy as np
import panel as pn
import holoviews as hv
pn.extension('tabulator', 'plotly')


import plotly.graph_objects as go
import pandas as pd
from plotly.subplots import make_subplots

import pandas as pd
import numpy as np
from typing import List, Dict
import logging

### Data Processing for Career Timeline




In [3]:
# Load the qualifying lap data from CSV and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably an index
# that was written out with the file -- confirm, and consider index_col=0.
quali_data = pd.read_csv('data/quali_data_2018_to_2021.csv')
quali_data.head()

Unnamed: 0,Driver,DriverNumber,LapTime,LapNumber,Stint,Sector1Time,Sector2Time,Sector3Time,Sector1SessionTime,Sector2SessionTime,...,Compound,TyreLife,FreshTyre,Team,LapStartTime,TrackStatus,QualifyingPosition,Year,EventName,WetSession
0,B HARTLEY,28,0 days 00:01:38.713000,2.0,1.0,0 days 00:00:17.567000,0 days 00:00:41.797000,0 days 00:00:39.349000,0 days 00:17:55.973000,0 days 00:18:37.770000,...,HYPERSOFT,2.0,True,Toro Rosso,0 days 00:17:38.406000,1.0,16.0,2018,Abu Dhabi Grand Prix,False
1,B HARTLEY,28,0 days 00:01:38.127000,5.0,2.0,0 days 00:00:17.513000,0 days 00:00:41.709000,0 days 00:00:38.905000,0 days 00:25:30.706000,0 days 00:26:12.415000,...,HYPERSOFT,2.0,True,Toro Rosso,0 days 00:25:13.193000,1.0,16.0,2018,Abu Dhabi Grand Prix,False
2,B HARTLEY,28,0 days 00:01:37.994000,8.0,3.0,0 days 00:00:17.450000,0 days 00:00:41.589000,0 days 00:00:38.955000,0 days 00:33:01.972000,0 days 00:33:43.561000,...,HYPERSOFT,2.0,True,Toro Rosso,0 days 00:32:44.522000,1.0,16.0,2018,Abu Dhabi Grand Prix,False
3,C LECLERC,16,0 days 00:01:38.968000,2.0,1.0,0 days 00:00:17.271000,0 days 00:00:41.969000,0 days 00:00:39.728000,0 days 00:17:34.899000,0 days 00:18:16.868000,...,HYPERSOFT,2.0,True,Sauber,0 days 00:17:17.628000,1.0,8.0,2018,Abu Dhabi Grand Prix,False
4,C LECLERC,16,0 days 00:02:22.825000,3.0,1.0,0 days 00:00:29.037000,0 days 00:01:01.227000,0 days 00:00:52.561000,0 days 00:19:25.633000,0 days 00:20:26.860000,...,HYPERSOFT,3.0,True,Sauber,0 days 00:18:56.596000,1.0,8.0,2018,Abu Dhabi Grand Prix,False


In [4]:
# Check how the sector time columns were parsed: they load as `object`
# (strings), so they must be converted before any arithmetic is done on them.
print(quali_data[['Sector1Time', 'Sector2Time', 'Sector3Time']].dtypes)

Sector1Time    object
Sector2Time    object
Sector3Time    object
dtype: object


In [5]:
# Duration columns arrive from the CSV as strings such as
# "0 days 00:01:38.713000". Parse them into Timedelta first, then add a float
# "<column>Seconds" companion for each one so lap and sector times can be
# compared and subtracted numerically.
TIME_COLUMNS = [
    'LapTime',
    'Sector1Time',
    'Sector2Time',
    'Sector3Time',
    'Sector1SessionTime',
    'Sector2SessionTime',
    'Sector3SessionTime',
    'LapStartTime',
]

# Convert time columns from string to Timedelta.
# pd.to_timedelta accepts already-converted columns, so re-running this cell is safe.
for time_col in TIME_COLUMNS:
    quali_data[time_col] = pd.to_timedelta(quali_data[time_col])

print(quali_data[TIME_COLUMNS].dtypes)

# Now convert the Timedelta columns to total seconds.
# The vectorised `.dt.total_seconds()` accessor replaces a per-row Python
# lambda; missing durations (NaT) become NaN.
for time_col in TIME_COLUMNS:
    quali_data[f'{time_col}Seconds'] = quali_data[time_col].dt.total_seconds()


LapTime               timedelta64[ns]
Sector1Time           timedelta64[ns]
Sector2Time           timedelta64[ns]
Sector3Time           timedelta64[ns]
Sector1SessionTime    timedelta64[ns]
Sector2SessionTime    timedelta64[ns]
Sector3SessionTime    timedelta64[ns]
LapStartTime          timedelta64[ns]
dtype: object


### Calculate Metrics for Career Timeline

In [2]:
import pandas as pd
import numpy as np
from typing import List, Dict
import logging

# Set up logging: configure the root logger at INFO level and create the
# module-level `logger` that the validation/processing functions below use to
# report data inconsistencies.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def validate_pole_position(event_data: pd.DataFrame, event_name: str, year: int) -> tuple:
    """
    Identify the pole sitter of one event and log any inconsistencies.

    Args:
        event_data: Laps of a single event. Must contain the columns
            'QualifyingPosition', 'Driver' and 'LapTimeSeconds'.
        event_name: Event name; used only in log messages.
        year: Season year; used only in log messages.

    Returns:
        A ``(pole_driver, pole_time)`` tuple:

        * ``(None, None)`` when no row has QualifyingPosition == 1.
        * Otherwise the driver and lap time (seconds) of the fastest valid
          lap among the rows marked as P1, so both values always describe
          the same lap even if several drivers are marked P1.
        * If none of the P1 rows has a valid lap time, the first P1 driver
          is returned together with NaN.

    Problems (no pole entry, several P1 drivers, no valid pole lap, a lap
    faster than the pole lap) are reported through ``logger.error``; the
    function does not raise for them.
    """
    # Get all laps of drivers marked as pole position
    pole_entries = event_data[event_data['QualifyingPosition'] == 1]

    if pole_entries.empty:
        logger.error(f"No pole position entry found for {event_name} {year}")
        return None, None

    if len(pole_entries['Driver'].unique()) > 1:
        logger.error(f"Multiple pole position entries found for {event_name} {year}:")
        for _, entry in pole_entries.iterrows():
            logger.error(f"Driver: {entry['Driver']}, Time: {entry['LapTimeSeconds']}")

    # Work positionally on a float array so NaN lap times and non-unique
    # index labels cannot break the lookup.
    pole_lap_times = pole_entries['LapTimeSeconds'].to_numpy(dtype=float)
    if np.isnan(pole_lap_times).all():
        logger.error(f"No valid pole lap time found for {event_name} {year}")
        pole_driver = pole_entries['Driver'].iloc[0]
        pole_time = np.nan
    else:
        # Take driver and time from the same row (the fastest P1 lap).
        best_pole_row = int(np.nanargmin(pole_lap_times))
        pole_driver = pole_entries['Driver'].iloc[best_pole_row]
        pole_time = pole_lap_times[best_pole_row]

    # Check if the pole lap really is the fastest lap of the event. With a
    # valid pole time the event has at least one non-NaN lap, so nanargmin
    # is safe here.
    if not np.isnan(pole_time):
        all_lap_times = event_data['LapTimeSeconds'].to_numpy(dtype=float)
        fastest_row = int(np.nanargmin(all_lap_times))
        fastest_time = all_lap_times[fastest_row]

        if fastest_time < pole_time:
            fastest_driver = event_data['Driver'].iloc[fastest_row]
            logger.error(
                f"Inconsistency at {event_name} {year}:\n"
                f"Pole Position: {pole_driver} ({pole_time}s)\n"
                f"Fastest Lap: {fastest_driver} ({fastest_time}s)\n"
                f"Difference: {pole_time - fastest_time}s"
            )

    return pole_driver, pole_time

def process_qualifying_data(quali_data: pd.DataFrame) -> List[Dict]:
    """
    Summarise qualifying performance per driver and year.

    For every event the pole sitter is determined via
    ``validate_pole_position``; events without a pole entry are skipped.
    For each driver the best lap, gap to pole and gap to the team mate are
    computed and accumulated into one entry per (year, driver).

    Args:
        quali_data: DataFrame with at least the columns Year, EventName,
            Team, Driver, QualifyingPosition, LapTimeSeconds, LapNumber,
            Stint and TrackStatus.

    Returns:
        List of dictionaries, one per driver per year, in order of first
        appearance. Each has the keys 'year', 'driver', 'team' (team of the
        first processed event), 'events' (list of per-event summaries with
        'round', 'driver', 'position', 'gapToPole', 'teammateGap',
        'hasTeammateData'), 'avgQualifyingPosition', 'avgGapToPole',
        'avgTeammateGap' and 'dataCompleteness' (share of events with a
        team-mate gap).

    Raises:
        ValueError: If any required column is missing.
    """
    # Verify required columns
    required_columns = ['Year', 'EventName', 'Team', 'Driver', 'QualifyingPosition',
                        'LapTimeSeconds', 'LapNumber', 'Stint', 'TrackStatus']
    missing_columns = [col for col in required_columns if col not in quali_data.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")

    timeline_data = []
    # (year, driver) -> entry. Holds the same dict objects as timeline_data so
    # an existing entry is found in O(1) instead of scanning the whole list.
    entries_by_key = {}

    for year in quali_data['Year'].unique():
        year_data = quali_data[quali_data['Year'] == year]

        for event_name, event_data in year_data.groupby('EventName'):
            # Validate pole position data; skip events without a pole entry
            pole_driver, pole_time = validate_pole_position(event_data, event_name, year)
            if pole_driver is None:
                continue

            for team, team_data in event_data.groupby('Team'):
                drivers = team_data['Driver'].unique()
                gaps = _teammate_gaps(team_data, drivers)

                # Process each driver
                for driver in drivers:
                    driver_data = team_data[team_data['Driver'] == driver]
                    qualifying_position = driver_data['QualifyingPosition'].iloc[0]
                    gap_to_pole = _gap_to_pole(
                        driver, driver_data, qualifying_position,
                        pole_driver, pole_time, event_name, year,
                    )
                    teammate_gap = gaps[driver]
                    has_teammate_data = not np.isnan(teammate_gap)

                    # Create event summary
                    event_summary = {
                        'round': event_name,
                        'driver': driver,
                        'position': qualifying_position,
                        'gapToPole': gap_to_pole,
                        'teammateGap': teammate_gap,
                        'hasTeammateData': has_teammate_data
                    }

                    # Find or create driver entry for this year
                    entry_key = (year, driver)
                    driver_entry = entries_by_key.get(entry_key)
                    if driver_entry is None:
                        driver_entry = {
                            'year': year,
                            'driver': driver,
                            'team': team,
                            'events': [],
                            'positions': [],
                            'gapToPole_values': [],
                            'teammateGap_values': [],
                            'completeDataCount': 0,
                            'totalEvents': 0
                        }
                        entries_by_key[entry_key] = driver_entry
                        timeline_data.append(driver_entry)

                    driver_entry['events'].append(event_summary)
                    driver_entry['positions'].append(qualifying_position)
                    driver_entry['totalEvents'] += 1

                    # Only valid (non-NaN) gaps take part in the averages
                    if not np.isnan(gap_to_pole):
                        driver_entry['gapToPole_values'].append(gap_to_pole)
                    if has_teammate_data:
                        driver_entry['teammateGap_values'].append(teammate_gap)
                        driver_entry['completeDataCount'] += 1

    # Calculate final averages and data completeness
    for entry in timeline_data:
        entry['avgQualifyingPosition'] = np.mean(entry['positions'])
        entry['avgGapToPole'] = np.mean(entry['gapToPole_values']) if entry['gapToPole_values'] else np.nan
        entry['avgTeammateGap'] = np.mean(entry['teammateGap_values']) if entry['teammateGap_values'] else np.nan
        entry['dataCompleteness'] = entry['completeDataCount'] / entry['totalEvents'] if entry['totalEvents'] > 0 else 0

        # Clean up intermediate calculation fields
        for scratch_key in ('positions', 'gapToPole_values', 'teammateGap_values',
                            'completeDataCount', 'totalEvents'):
            del entry[scratch_key]

    return timeline_data


def _teammate_gaps(team_data: pd.DataFrame, drivers) -> Dict:
    """Return {driver: best-lap gap to team mate}; NaN unless exactly two drivers both have a valid best lap."""
    gaps = {driver: np.nan for driver in drivers}

    # A team-mate comparison is only defined for exactly two drivers
    if len(drivers) == 2:
        driver1, driver2 = drivers
        time1 = team_data[team_data['Driver'] == driver1]['LapTimeSeconds'].min()
        time2 = team_data[team_data['Driver'] == driver2]['LapTimeSeconds'].min()

        if not (np.isnan(time1) or np.isnan(time2)):
            gaps[driver1] = time1 - time2
            gaps[driver2] = time2 - time1

    return gaps


def _gap_to_pole(driver, driver_data: pd.DataFrame, qualifying_position,
                 pole_driver, pole_time, event_name, year):
    """Return the driver's best-lap gap to pole: 0.0 for P1, NaN (after logging) when the gap is negative."""
    if qualifying_position == 1:
        if driver != pole_driver:
            logger.error(
                f"Driver position mismatch at {event_name} {year}:\n"
                f"Driver {driver} marked as P1 but pole was set by {pole_driver}"
            )
        return 0.0

    best_lap_time = driver_data['LapTimeSeconds'].min()
    gap_to_pole = best_lap_time - pole_time

    if gap_to_pole < 0:
        # A non-pole driver faster than pole indicates inconsistent source
        # data; log every lap of the driver to help diagnose it.
        logger.error(
            f"Negative gap detected at {event_name} {year}:\n"
            f"Driver: {driver} (P{qualifying_position})\n"
            f"Best lap: {best_lap_time:.3f}s\n"
            f"Pole time: {pole_time:.3f}s\n"
            f"Gap: {gap_to_pole:.3f}s\n"
            f"All laps for {driver}:"
        )
        for _, lap in driver_data.iterrows():
            logger.error(
                f"Lap {lap['LapNumber']}: {lap['LapTimeSeconds']:.3f}s "
                f"(Stint: {lap['Stint']}, Track Status: {lap['TrackStatus']})"
            )
        return np.nan  # Set invalid gaps to NaN

    return gap_to_pole

In [7]:
# Build the per-driver, per-year qualifying summaries. Data inconsistencies
# found along the way are reported through the logger (see the ERROR lines in
# the output). The result is only displayed here, not stored in a variable.
process_qualifying_data(quali_data)


ERROR:__main__:Inconsistency at Belgian Grand Prix 2018:
Pole Position: L HAMILTON (101.553s)
Fastest Lap: S VETTEL (101.501s)
Difference: 0.0519999999999925s
ERROR:__main__:Negative gap detected at Belgian Grand Prix 2018:
Driver: K RAIKKONEN (P6.0)
Best lap: 101.533s
Pole time: 101.553s
Gap: -0.020s
All laps for K RAIKKONEN:
ERROR:__main__:Lap 2.0: 102.585s (Stint: 1.0, Track Status: 1.0)
ERROR:__main__:Lap 5.0: 101.627s (Stint: 2.0, Track Status: 1.0)
ERROR:__main__:Lap 8.0: 101.533s (Stint: 3.0, Track Status: 12.0)
ERROR:__main__:Lap 12.0: 122.671s (Stint: 5.0, Track Status: 1.0)
ERROR:__main__:Negative gap detected at Belgian Grand Prix 2018:
Driver: S VETTEL (P2.0)
Best lap: 101.501s
Pole time: 101.553s
Gap: -0.052s
All laps for S VETTEL:
ERROR:__main__:Lap 2.0: 103.035s (Stint: 1.0, Track Status: 1.0)
ERROR:__main__:Lap 3.0: 147.458s (Stint: 1.0, Track Status: 1.0)
ERROR:__main__:Lap 4.0: 103.517s (Stint: 1.0, Track Status: 1.0)
ERROR:__main__:Lap 7.0: 102.133s (Stint: 2.0, Trac

[{'year': 2018,
  'driver': 'K RAIKKONEN',
  'team': 'Ferrari',
  'events': [{'round': 'Abu Dhabi Grand Prix',
    'driver': 'K RAIKKONEN',
    'position': 4.0,
    'gapToPole': 0.570999999999998,
    'teammateGap': 0.23999999999999488,
    'hasTeammateData': True},
   {'round': 'Australian Grand Prix',
    'driver': 'K RAIKKONEN',
    'position': 2.0,
    'gapToPole': 0.6640000000000015,
    'teammateGap': -0.009999999999990905,
    'hasTeammateData': True},
   {'round': 'Austrian Grand Prix',
    'driver': 'K RAIKKONEN',
    'position': 4.0,
    'gapToPole': 0.529999999999994,
    'teammateGap': 0.19599999999999795,
    'hasTeammateData': True},
   {'round': 'Azerbaijan Grand Prix',
    'driver': 'K RAIKKONEN',
    'position': 6.0,
    'gapToPole': 0.9919999999999902,
    'teammateGap': 0.9919999999999902,
    'hasTeammateData': True},
   {'round': 'Bahrain Grand Prix',
    'driver': 'K RAIKKONEN',
    'position': 2.0,
    'gapToPole': 0.14300000000000068,
    'teammateGap': 0.143000