In [1]:
import pandas as pd

## Read the dataset

In [3]:
df = pd.read_csv('../../../sample_data/qualifying.csv')

In [4]:
df.head()

Unnamed: 0,qualifyId,raceId,driverId,constructorId,number,position,q1,q2,q3
0,1,18,1,1,22,1,1:26.572,1:25.187,1:26.714
1,2,18,9,2,4,2,1:26.103,1:25.315,1:26.869
2,3,18,5,1,23,3,1:25.664,1:25.452,1:27.079
3,4,18,13,6,2,4,1:25.994,1:25.691,1:27.178
4,5,18,2,2,3,5,1:25.960,1:25.518,1:27.236


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10254 entries, 0 to 10253
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   qualifyId      10254 non-null  int64 
 1   raceId         10254 non-null  int64 
 2   driverId       10254 non-null  int64 
 3   constructorId  10254 non-null  int64 
 4   number         10254 non-null  int64 
 5   position       10254 non-null  int64 
 6   q1             10254 non-null  object
 7   q2             10241 non-null  object
 8   q3             10227 non-null  object
dtypes: int64(6), object(3)
memory usage: 721.1+ KB


## Split a column

In [7]:
df[['q1_minute', 'q1_second', 'q1_millisecond']] = df['q1'].str.extract(r'(\d+):(\d+)\.(\d+)')
df['q1_minute'] = pd.to_numeric(df['q1_minute'], errors='coerce')
df['q1_second'] = pd.to_numeric(df['q1_second'], errors='coerce')
df['q1_millisecond'] = pd.to_numeric(df['q1_millisecond'], errors='coerce')

In [8]:
df.drop(columns='q1',inplace=True)

In [9]:
df[['q2_minute', 'q2_second', 'q2_millisecond']] = df['q2'].str.extract(r'(\d+):(\d+)\.(\d+)')
df['q2_minute'] = pd.to_numeric(df['q2_minute'], errors='coerce')
df['q2_second'] = pd.to_numeric(df['q2_second'], errors='coerce')
df['q2_millisecond'] = pd.to_numeric(df['q2_millisecond'], errors='coerce')

In [10]:
df.drop(columns='q2',inplace=True)

In [11]:
df[['q3_minute', 'q3_second', 'q3_millisecond']] = df['q3'].str.extract(r'(\d+):(\d+)\.(\d+)')
df['q3_minute'] = pd.to_numeric(df['q3_minute'], errors='coerce')
df['q3_second'] = pd.to_numeric(df['q3_second'], errors='coerce')
df['q3_millisecond'] = pd.to_numeric(df['q3_millisecond'], errors='coerce')

In [12]:
df.drop(columns='q3',inplace=True)

## Missing values

In [14]:
df.interpolate(method='linear', inplace=True)

In [15]:
df

Unnamed: 0,qualifyId,raceId,driverId,constructorId,number,position,q1_minute,q1_second,q1_millisecond,q2_minute,q2_second,q2_millisecond,q3_minute,q3_second,q3_millisecond
0,1,18,1,1,22,1,1.0,26.0,572.0,1.0,25.0,187.0,1.0,26.0,714.0
1,2,18,9,2,4,2,1.0,26.0,103.0,1.0,25.0,315.0,1.0,26.0,869.0
2,3,18,5,1,23,3,1.0,25.0,664.0,1.0,25.0,452.0,1.0,27.0,79.0
3,4,18,13,6,2,4,1.0,25.0,994.0,1.0,25.0,691.0,1.0,27.0,178.0
4,5,18,2,2,3,5,1.0,25.0,960.0,1.0,25.0,518.0,1.0,27.0,236.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10249,10307,1132,822,15,77,16,1.0,32.0,431.0,1.0,27.0,949.0,1.0,26.0,917.0
10250,10308,1132,825,210,20,17,1.0,32.0,905.0,1.0,27.0,949.0,1.0,26.0,917.0
10251,10309,1132,839,214,31,18,1.0,34.0,557.0,1.0,27.0,949.0,1.0,26.0,917.0
10252,10310,1132,815,9,11,19,1.0,38.0,348.0,1.0,27.0,949.0,1.0,26.0,917.0


## Race statistics

In [17]:
df['q1_total_milliseconds'] = (
    df['q1_minute'] * 60 * 1000 +
    df['q1_second'] * 1000 +
    df['q1_millisecond']
)

df['q2_total_milliseconds'] = (
    df['q2_minute'] * 60 * 1000 +
    df['q2_second'] * 1000 +
    df['q2_millisecond']
)

df['q3_total_milliseconds'] = (
    df['q3_minute'] * 60 * 1000 +
    df['q3_second'] * 1000 +
    df['q3_millisecond']
)


In [18]:
raceid_grp = df.groupby('raceId')

In [19]:
x = raceid_grp['q1_total_milliseconds'].mean()
x

raceId
1       86041.75
2       95295.75
3       96447.30
4       93403.20
5       81080.40
          ...   
1128    71852.70
1129    73164.50
1130    72681.05
1131    65598.25
1132    92170.25
Name: q1_total_milliseconds, Length: 482, dtype: float64

## Driver positions

In [21]:
driverid_grp = df.groupby('driverId')

In [22]:
driverid_grp['position'].rolling(window=3, min_periods=1).mean()

driverId       
1         0         1.000000
          25        2.500000
          46        2.666667
          70        4.000000
          90        3.666667
                     ...    
859       9846     16.000000
          9864     14.000000
          9885     11.000000
          9912     13.000000
860       10045    11.000000
Name: position, Length: 10254, dtype: float64

## Constructor standings

In [24]:
constr_grp = df.groupby('constructorId')

In [25]:
def points(position):
    if position == 1:
        return 25
    elif position == 2:
        return 18
    elif position == 3:
        return 15
    elif position == 4:
        return 10
    else:
        return 1

In [26]:
df['points'] = df['position'].apply(points)

In [27]:
constr_grp['points'].sum()

constructorId
1      5103
2       345
3      3775
4      1673
5       566
6      8252
7       632
8        78
9      6622
10      577
11      265
12       34
13       28
14        8
15      755
16      449
17      377
18      238
19      141
20       73
21       52
22      803
23      337
24       77
25      111
27       78
28       42
29       78
30       40
31       37
32       39
33       30
51      207
117     282
131    6992
164     115
166      76
205      76
206     118
207     112
208     325
209      66
210     414
211     178
213     211
214     191
215      24
Name: points, dtype: int64

## Fastest lap time

In [29]:
df['fastest_lap'] = df[['q1_total_milliseconds', 'q2_total_milliseconds', 'q2_total_milliseconds']].min(axis=1)

In [30]:
fastest_lap = raceid_grp[['raceId','fastest_lap']].min()
fastest_lap

Unnamed: 0_level_0,raceId,fastest_lap
raceId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,84783.000000
2,2,93784.000000
3,3,93258.000000
4,4,82483.500000
5,5,76024.500000
...,...,...
1128,1128,70732.000000
1129,1129,71742.000000
1130,1130,65847.166667
1131,1131,64469.000000


## Driver lap time

In [32]:
df['average_time'] = df[['q1_total_milliseconds', 'q2_total_milliseconds', 'q2_total_milliseconds']].mean(axis=1)

In [33]:
driverid_grp['average_time'].apply(lambda x: x.sort_values().mean())

driverId
1      87721.558066
2      87118.286927
3      89722.524079
4      87982.860108
5      90499.098929
           ...     
856    83287.903319
857    83419.057423
858    85119.735450
859    87079.533333
860    88756.000000
Name: average_time, Length: 170, dtype: float64

In [34]:
df[df['driverId']==2]['average_time'].mean()

87118.28692748949