In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np

In [2]:
# Read the input CSV into the working DataFrame used by every later cell.
# The path is relative, so it resolves against the kernel's working directory.
INTERIM_CSV = '../data/interim/data_clean.csv'
data = pd.read_csv(INTERIM_CSV)

In [3]:
# Derive per-row performance metrics from columns already present in `data`.
# Each entry maps the new column name to the (left, right) source columns.
difference_metrics = {
    'gdiff': ('gf', 'ga'),     # goal difference
    'xgdiff': ('xg', 'xga'),   # expected goal difference
}
ratio_metrics = {
    'sacc': ('sot', 'sh'),       # shot accuracy
    'shtconv': ('gf', 'sh'),     # shot conversion rate
    'sotconv': ('gf', 'sot'),    # shot-on-target conversion rate
    'pconv': ('pk', 'pkatt'),    # penalty conversion rate
    'epr': ('xg', 'poss'),       # efficient possession ratio
}

for metric, (minuend, subtrahend) in difference_metrics.items():
    data[metric] = data[minuend] - data[subtrahend]

# NOTE: a zero denominator is not guarded here; with numeric columns pandas
# yields inf (or NaN for 0/0) rather than raising.
for metric, (numerator, denominator) in ratio_metrics.items():
    data[metric] = data[numerator] / data[denominator]

In [4]:
# Clean up the results of dividing by zero in the ratio columns.
# A non-zero numerator over a zero denominator produces +inf / -inf (0/0 is
# already NaN); both signs of infinity are replaced with NaN so that every
# undefined ratio is represented the same way in the exported file.
# 'epr' is included because it is computed as a quotient just like the others.
ratios = ['sacc', 'shtconv', 'sotconv', 'pconv', 'epr']

# Replace on the whole sub-frame at once instead of looping column by column.
data[ratios] = data[ratios].replace([np.inf, -np.inf], np.nan)

In [5]:
# Add the 'points' column from the 'result' code:
# 'W' -> 3, 'D' -> 1, and any other value (including a missing one) -> 0.
match_result = data['result']
points = np.select(
    [match_result == 'W', match_result == 'D'],
    [3, 1],
    default=0,
)

data['points'] = points

In [6]:
# Create the 'expected points' measure from the expected goal difference,
# rounded to the nearest whole goal: positive -> 3, zero -> 1, otherwise 0.
# A missing xgdiff satisfies neither comparison and therefore scores 0.
rounded_xgdiff = data['xgdiff'].round(0)
exppoints = np.select(
    [rounded_xgdiff > 0, rounded_xgdiff == 0],
    [3, 1],
    default=0,
)

data['exppoints'] = exppoints

In [7]:
# Rolling form: sum of 'points' over a 5-row window ending at the current row.
# min_periods=1 means the first rows use however many rows exist so far.
# The rolling sum is called directly; wrapping it in Series.transform with a
# lambda added nothing to the result.
# NOTE(review): the window follows row order across the whole frame and includes
# the current row. If the file holds several teams, or if this is meant as a
# pre-match feature, a groupby and/or shift(1) is probably needed - confirm.
data['last_5'] = data['points'].rolling(5, min_periods=1).sum()

In [8]:
# Export the enriched DataFrame to a CSV file, leaving out the index column.
# The path is relative, so it resolves against the kernel's working directory.
PROCESSED_CSV = '../data/processed/data_processed.csv'
data.to_csv(PROCESSED_CSV, index=False)