In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the output of the initial-exploration step.
df = pd.read_csv('../data/processed/0.1-initial-exploration.csv', index_col=False)
# Discard any column whose name starts with "Unnamed"
# (presumably a saved index from an earlier to_csv call -- verify against the producer).
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Coerce these columns to numeric; values that cannot be parsed become NaN.
# Plain column assignment is used on purpose: `df.loc[:, col] = ...` writes into the
# existing column and, on pandas >= 2.0, tries to keep its old dtype, so the
# conversion may not actually change the column's dtype.
for numeric_col in ('positionOrder', 'ageAtRace', 'ageAtDebut', 'yearsExperience'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

**Sort by year and round**

In [3]:
# Order rows by season, then by round within the season; the per-group
# forward-fills in later cells depend on this row order.
sort_keys = ['year', 'round']
df = df.sort_values(by=sort_keys)

In [4]:
# A row counts as "finished" when its position is anything other than the '\N' placeholder.
df['finished'] = df['position'].ne('\\N')

**Carry forward last Driver Standings statistics**

In [5]:
# For each driver, propagate the last known standings value forward over missing rows.
driver_standings_cols = ['driverStandingsPoints', 'driverStandingsPosition', 'driverStandingsWins']
for standings_col in driver_standings_cols:
    per_driver = df.groupby(['driverId'])[standings_col]
    df[standings_col] = per_driver.transform(lambda values: values.ffill())

**Carry forward last Constructor Standings statistics**

In [6]:
# For each constructor, propagate the last known standings value forward over missing rows.
constructor_standings_cols = [
    'constructorStandingsPoints',
    'constructorStandingsPosition',
    'constructorStandingsWins',
]
for standings_col in constructor_standings_cols:
    per_constructor = df.groupby(['constructorId'])[standings_col]
    df[standings_col] = per_constructor.transform(lambda values: values.ffill())

**Drop unwanted columns**

In [7]:
# Columns removed from the working frame.
# `columns=` replaces the positional axis argument (`, 1`), which is deprecated
# and no longer accepted by pandas >= 2.0. 'driverStandingsId' was listed twice
# in the original list; it appears once here.
unwanted_columns = [
    'resultId', 'raceId', 'circuitId', 'rank', 'driverStandingsId', 'constructorStandingsId',
    'position', 'positionText', 'pointsGained', 'laps', 'nationality', 'time_x', 'milliseconds',
    'fastestLapSpeed', 'fastestLapTime', 'fastestLap', 'statusId', 'dateOfDebut', 'status',
    'driverStandingsPoints', 'constructorStandingsPoints',
]
df = df.drop(columns=unwanted_columns)

  df = df.drop(['resultId','raceId','circuitId','rank','driverStandingsId','constructorStandingsId','position','positionText','pointsGained',


**Columns with null values**

In [8]:
# List every column that contains at least one missing value.
null_columns = df.columns[df.isna().any()].tolist()
print('Columns with null values:\n', null_columns)

Columns with null values:
 ['lastRaceRank', 'constructorStandingsPosition', 'constructorStandingsWins', 'previousRaceGridStart', 'previousRacePosition', 'racesWon', 'racesRetired', 'racesFinished', 'polePositions', 'racesWonByConstructor', 'racesRetiredByConstructor', 'percentageOfBestQuali']


**Replace NaNs with 0 or -1, depending on the column**

In [9]:
# Career/constructor tallies: a missing value is filled with 0.
tally_cols = ['racesWon', 'racesRetired', 'racesFinished', 'polePositions',
              'racesWonByConstructor', 'racesRetiredByConstructor']
# Previous-race columns: a missing value is filled with the sentinel -1.
previous_race_cols = ['lastRaceRank', 'previousRaceGridStart', 'previousRacePosition']

df[tally_cols] = df[tally_cols].fillna(value=0)
df[previous_race_cols] = df[previous_race_cols].fillna(value=-1)

**Replace missing percentage of best qualifying time**

In [10]:
# Fill missing qualifying percentages in three passes, most specific first.
# 1. Carry forward the driver's last known value within the same season.
df['percentageOfBestQuali'] = df.groupby(['driverId','year'])['percentageOfBestQuali'].transform(lambda x: x.ffill())
# 2. Fall back to the mean of the other drivers in the same race (year + round).
df['percentageOfBestQuali'] = df.groupby(['year','round'])['percentageOfBestQuali'].transform(lambda x: x.fillna(x.mean()))
# 3. Anything still missing (races with no value at all) gets 100.
#    A constant fill does not depend on the group, so no groupby is needed.
df['percentageOfBestQuali'] = df['percentageOfBestQuali'].fillna(100)

**Check which columns still have missing values**

In [11]:
# Report the columns that still contain NaNs, then preview the affected rows.
remaining_null_columns = df.columns[df.isna().any()].tolist()
print('Columns with null values:\n', remaining_null_columns)

rows_with_nulls = df[df.isnull().any(axis=1)]
rows_with_nulls.head(20)

Columns with null values:
 ['constructorStandingsPosition', 'constructorStandingsWins']


Unnamed: 0,driverId,constructorId,gridStart,positionOrder,driverRef,year,round,circuitName,date,ageAtRace,...,previousRaceGridStart,previousRacePosition,racesWon,racesRetired,racesFinished,polePositions,racesWonByConstructor,racesRetiredByConstructor,percentageOfBestQuali,finished
21179,529,113,9,5,bettenhausen,1950,3,Indianapolis 500,1950-05-30,33,...,8.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,True
21180,529,113,9,5,bettenhausen,1950,3,Indianapolis 500,1950-05-30,33,...,8.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,True
22445,697,163,23,27,bonetto,1950,7,Italian Grand Prix,1950-09-03,47,...,10.0,10.0,0.0,1.0,1.0,0.0,0.0,0.0,100.0,False
21988,633,132,5,20,villoresi,1954,9,Spanish Grand Prix,1954-10-24,45,...,6.0,13.0,0.0,9.0,15.0,1.0,0.0,0.0,100.0,False
22146,647,132,1,19,ascari,1954,9,Spanish Grand Prix,1954-10-24,36,...,2.0,12.0,13.0,11.0,22.0,13.0,0.0,0.0,100.0,False
20242,427,130,18,17,trintignant,1956,5,French Grand Prix,1956-07-01,38,...,7.0,10.0,1.0,21.0,16.0,1.0,0.0,0.0,100.0,False


Only 6 rows have missing constructor standings information now, all of which are from 1956 or before.

**Fill missing rank (ordering of drivers by previous race fastest lap during race) values with 0**

In [12]:
# Replace the '\N' placeholder with 0, then convert the column to integers.
# pd.to_numeric is used instead of astype(str).astype(int) because the string
# round-trip raises on float-formatted values (e.g. 5.0 -> '5.0' -> ValueError).
rank_without_placeholder = df['lastRaceRank'].replace({'\\N': 0})
df['lastRaceRank'] = pd.to_numeric(rank_without_placeholder).astype(int)

**Replace grid starting position 0 (pit lane start) with the last starting position of that race +1**

In [13]:
def _move_pit_lane_to_back(race_grid):
    """Replace grid value 0 with one more than the largest grid value in the same race."""
    return race_grid.replace(to_replace=0, value=race_grid.max() + 1)


# Apply the replacement race by race (a race is identified by year + round).
df['gridStart'] = df.groupby(['year', 'round'])['gridStart'].transform(_move_pit_lane_to_back)

In [14]:
# Store the home-race flag as an integer column.
home_flag = df['racingAtHome']
df['racingAtHome'] = home_flag.astype(int)

In [15]:
# Print the column dtypes and non-null counts of the processed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25593 entries, 21554 to 25571
Data columns (total 28 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   driverId                      25593 non-null  int64  
 1   constructorId                 25593 non-null  int64  
 2   gridStart                     25593 non-null  int64  
 3   positionOrder                 25593 non-null  int64  
 4   driverRef                     25593 non-null  object 
 5   year                          25593 non-null  int64  
 6   round                         25593 non-null  int64  
 7   circuitName                   25593 non-null  object 
 8   date                          25593 non-null  object 
 9   ageAtRace                     25593 non-null  int64  
 10  ageAtDebut                    25593 non-null  int64  
 11  yearsExperience               25593 non-null  int64  
 12  racingAtHome                  25593 non-null  int64  
 1

**Add an artificial entry for the last race (round 22) of 2021 for Mazepin. He couldn't race because of COVID at the time.**

In [16]:
# Hand-written row for Mazepin in round 22 of 2021 (all values are supplied manually).
new_row = {'driverId':853, 'constructorId':210, 'gridStart':20, 'positionOrder':20, 'driverRef': 'mazepin',
 'year':2021, 'round':22, 'circuitName':'Abu Dhabi Grand Prix', 'date':'2021-12-12', 'ageAtRace':22, 'ageAtDebut':22,
  'yearsExperience':0, 'racingAtHome':0, 'driverStandingsPosition':20,
  'driverStandingsWins':0, 'lastRaceRank':19, 'constructorStandingsPosition':10,
  'constructorStandingsWins':0, 'previousRaceGridStart':19, 'previousRacePosition': 18, 'racesWon':0,
  'racesRetired':4, 'racesFinished': 17, 'polePositions': 0, 'racesWonByConstructor': 0, 'racesRetiredByConstructor': 46,
  'percentageOfBestQuali': 115, 'finished':True}
# Add the row at the end of the frame and renumber the index from 0.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported replacement and works on older versions too.
df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

In [17]:
# Ratio of retirements to finishes; infinite results (zero finishes) and NaN (0/0) become 0.
retired_per_finish = df['racesRetired'] / df['racesFinished']
retired_per_finish = retired_per_finish.replace(np.inf, 0)
df['percentRetired'] = retired_per_finish.replace({np.nan: 0})

In [18]:
# Positions gained in the previous race: grid slot minus finishing position
# (positive means the driver finished ahead of where they started).
df['posGainedLastRace'] = df['previousRaceGridStart'].sub(df['previousRacePosition'])

In [19]:
# The numeric driver/constructor ids are removed from the frame at this point.
id_columns = ['driverId', 'constructorId']
df = df.drop(columns=id_columns)

In [20]:
# reset_index returns a new frame; the original call discarded the result, so it
# had no effect. Assign it back so the index is a clean 0..n-1 range before saving.
df = df.reset_index(drop=True)
# The index is still written as the first (unnamed) CSV column, as before, so the
# file layout that any downstream reader sees is unchanged.
df.to_csv('../data/processed/0.3-feature-processing.csv')

In [21]:
# Show the last rows of the frame, which include the manually appended row.
df.tail()

Unnamed: 0,gridStart,positionOrder,driverRef,year,round,circuitName,date,ageAtRace,ageAtDebut,yearsExperience,...,racesWon,racesRetired,racesFinished,polePositions,racesWonByConstructor,racesRetiredByConstructor,percentageOfBestQuali,finished,percentRetired,posGainedLastRace
25589,17,18,russell,2021,22,Abu Dhabi Grand Prix,2021-12-12,23,21,2,...,0.0,10.0,49.0,0.0,114.0,399.0,102.818205,False,0.204082,-5.0
25590,16,16,latifi,2021,22,Abu Dhabi Grand Prix,2021-12-12,26,25,1,...,0.0,5.0,33.0,0.0,114.0,399.0,102.714684,False,0.151515,4.0
25591,8,4,tsunoda,2021,22,Abu Dhabi Grand Prix,2021-12-12,21,20,0,...,0.0,3.0,17.0,0.0,1.0,10.0,101.353079,True,0.176471,-6.0
25592,19,14,mick_schumacher,2021,22,Abu Dhabi Grand Prix,2021-12-12,22,22,0,...,0.0,3.0,18.0,0.0,0.0,48.0,103.406448,True,0.166667,-1.0
25593,20,20,mazepin,2021,22,Abu Dhabi Grand Prix,2021-12-12,22,22,0,...,0.0,4.0,17.0,0.0,0.0,46.0,115.0,True,0.235294,1.0
