In [1]:
import pandas as pd

In [2]:
# Load the output of the initial-exploration step and discard every column
# whose name starts with "Unnamed" (presumably a saved index - verify against
# the notebook that wrote this file).
df = pd.read_csv('../data/processed/0.1-initial-exploration.csv', index_col=False)
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Coerce these columns to a numeric dtype; entries that cannot be parsed become NaN.
# Assign through df[col] rather than df.loc[:, col]: since pandas 2.0 the .loc
# form writes into the existing column in place and keeps its old dtype, so the
# conversion would not stick for a column that was read as object.
for numeric_col in ('positionOrder', 'ageAtRace', 'ageAtDebut', 'yearsExperience'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

**Sort by year and round**

In [3]:
# Order the rows by year first, then by round within each year.
chronological_keys = ['year', 'round']
df = df.sort_values(chronological_keys)

**Carry forward last Driver Standings statistics**

In [4]:
# For each driver, replace a missing standings value with the closest earlier
# non-missing value in row order. One built-in GroupBy.ffill over all three
# columns replaces three copy-pasted per-group Python lambdas.
driver_standings_cols = [
    'driverStandingsPoints',
    'driverStandingsPosition',
    'driverStandingsWins',
]
df[driver_standings_cols] = df.groupby('driverId')[driver_standings_cols].ffill()

**Carry forward last Constructor Standings statistics**

In [5]:
# For each constructor, replace a missing standings value with the closest
# earlier non-missing value in row order. One built-in GroupBy.ffill over all
# three columns replaces three copy-pasted per-group Python lambdas.
constructor_standings_cols = [
    'constructorStandingsPoints',
    'constructorStandingsPosition',
    'constructorStandingsWins',
]
df[constructor_standings_cols] = df.groupby('constructorId')[constructor_standings_cols].ffill()

**Drop unwanted columns**

In [6]:
# Columns removed from the frame at this stage.
# `columns=` is used instead of a positional axis argument: pandas 2.0 removed
# positional `axis` from DataFrame.drop, so `df.drop([...], 1)` raises TypeError.
# 'driverStandingsId' was listed twice in the original list; it appears once here.
unwanted_columns = [
    'driverRef', 'resultId', 'raceId', 'rank', 'driverStandingsId', 'constructorStandingsId',
    'position', 'positionText', 'pointsGained', 'laps', 'nationality', 'time_x', 'milliseconds',
    'fastestLapSpeed', 'fastestLapTime', 'fastestLap', 'statusId', 'circuitName', 'dateOfDebut',
    'status',
]
df = df.drop(columns=unwanted_columns)

**Columns with null values**

In [7]:
# Names of all columns that contain at least one missing value.
columns_with_nulls = df.columns[df.isna().any()].tolist()
print('Columns with null values:\n', columns_with_nulls)

Columns with null values:
 ['lastRaceRank', 'constructorStandingsPoints', 'constructorStandingsPosition', 'constructorStandingsWins', 'previousRaceGridStart', 'previousRacePosition', 'racesWon', 'racesRetired', 'racesFinished', 'polePositions', 'racesWonByConstructor', 'racesRetiredByConstructor', 'percentageOfBestQuali']


**Replace NaNs with 0 or -1, depending on the column**

In [8]:
# Missing values in these columns are replaced with 0.
zero_fill_cols = [
    'racesWon', 'racesRetired', 'racesFinished', 'polePositions',
    'racesWonByConstructor', 'racesRetiredByConstructor',
]
df[zero_fill_cols] = df[zero_fill_cols].fillna(0)

# Missing values in these columns are replaced with -1.
minus_one_fill_cols = ['lastRaceRank', 'previousRaceGridStart', 'previousRacePosition']
df[minus_one_fill_cols] = df[minus_one_fill_cols].fillna(-1)

**Replace missing percentage of best qualifying time**

In [9]:
quali_col = 'percentageOfBestQuali'

# Step 1: within each (driver, year) group, carry the previous value forward.
df[quali_col] = df.groupby(['driverId', 'year'])[quali_col].ffill()

# Step 2: any value still missing takes the mean of its (year, round) group.
# A group with no values at all has a NaN mean and is left untouched here.
race_mean = df.groupby(['year', 'round'])[quali_col].transform('mean')
df[quali_col] = df[quali_col].fillna(race_mean)

# Step 3: whatever is left falls back to 100. A constant fill does not depend
# on the group, so no groupby is needed for it.
df[quali_col] = df[quali_col].fillna(100)

**Check which columns still have missing values**

In [10]:
# Repeat the null-column report, then preview up to 20 rows that still have
# at least one missing value.
remaining_null_columns = df.columns[df.isna().any()].tolist()
print('Columns with null values:\n', remaining_null_columns)

row_has_missing_value = df.isnull().any(axis=1)
df[row_has_missing_value].head(20)

Columns with null values:
 ['constructorStandingsPoints', 'constructorStandingsPosition', 'constructorStandingsWins']


Unnamed: 0,driverId,constructorId,gridStart,positionOrder,year,round,circuitId,date,ageAtRace,ageAtDebut,...,constructorStandingsWins,previousRaceGridStart,previousRacePosition,racesWon,racesRetired,racesFinished,polePositions,racesWonByConstructor,racesRetiredByConstructor,percentageOfBestQuali
21179,529,113,9,5,1950,3,19,1950-05-30,33,33,...,,8.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
21180,529,113,9,5,1950,3,19,1950-05-30,33,33,...,,8.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
22445,697,163,23,27,1950,7,14,1950-09-03,47,47,...,,10.0,10.0,0.0,1.0,0.0,0.0,0.0,0.0,100.0
21988,633,132,5,20,1954,9,67,1954-10-24,45,41,...,,6.0,13.0,0.0,9.0,6.0,1.0,0.0,0.0,100.0
22146,647,132,1,19,1954,9,67,1954-10-24,36,31,...,,2.0,12.0,13.0,11.0,17.0,13.0,0.0,0.0,100.0
20242,427,130,18,17,1956,5,55,1956-07-01,38,32,...,,7.0,10.0,1.0,21.0,4.0,1.0,0.0,0.0,100.0


Only 6 rows have missing constructor standings information now, all of which are from 1956 or before.

**Fill missing rank values (ordering of drivers by fastest lap in the previous race) with 0**

In [11]:
# Swap the '\N' placeholder for 0, then convert every entry to an integer by
# going through its string form.
rank_without_placeholder = df['lastRaceRank'].replace({'\\N': 0})
df['lastRaceRank'] = rank_without_placeholder.astype(str).astype(int)

**Replace grid starting position 0 (pit lane start) with the last starting position of that race +1**

In [12]:
def _move_zero_grid_to_back(race_grid):
    """Replace grid value 0 with one more than the largest grid value in the group."""
    slot_after_last = race_grid.max() + 1
    return race_grid.replace(to_replace=0, value=slot_after_last)


# Applied separately to every (year, round) group.
grid_by_race = df.groupby(['year', 'round'])['gridStart']
df['gridStart'] = grid_by_race.transform(_move_zero_grid_to_back)

In [13]:
# Store the racingAtHome flag as an integer column.
df = df.astype({'racingAtHome': int})

In [14]:
# Print a summary of the processed frame: index range, column count, and the
# non-null count and dtype of each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25593 entries, 21554 to 25571
Data columns (total 28 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   driverId                      25593 non-null  int64  
 1   constructorId                 25593 non-null  int64  
 2   gridStart                     25593 non-null  int64  
 3   positionOrder                 25593 non-null  int64  
 4   year                          25593 non-null  int64  
 5   round                         25593 non-null  int64  
 6   circuitId                     25593 non-null  int64  
 7   date                          25593 non-null  object 
 8   ageAtRace                     25593 non-null  int64  
 9   ageAtDebut                    25593 non-null  int64  
 10  yearsExperience               25593 non-null  int64  
 11  racingAtHome                  25593 non-null  int64  
 12  driverStandingsPoints         25593 non-null  float64
 1

In [15]:
# reset_index returns a new frame rather than modifying df, so the result must
# be assigned back; the bare call left the index unchanged.
df = df.reset_index(drop=True)

# The index is still written out (now a clean 0..n-1 range) so the CSV keeps
# the same column layout as before for anything that reads it.
df.to_csv('../data/processed/0.3-feature-processing.csv')