In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.tseries.offsets import DateOffset

# Load the five raw tables from CSV files in the working directory.
# NOTE(review): the recorded run of this cell ended in
# "ParserError: ... Expected 2 fields in line 4, saw 3", which is presumably
# raised by one of these reads -- check that file's delimiter/header rows.
(
    ExerciseTrainingData,
    Jumps,
    Wellness,
    PlayerTrainingData,
    StrengthTraining,
) = map(
    pd.read_csv,
    (
        'ExerciseTrainingData.csv',
        'Jumps.csv',
        'Wellness.csv',
        'PlayerTrainingData.csv',
        'StrengthTraining.csv',
    ),
)

# Parse the day-month-year 'Date' strings of the four dated tables into
# pandas datetimes, overwriting the column in place on each frame.
for dated_frame in (ExerciseTrainingData, Jumps, Wellness, StrengthTraining):
    dated_frame['Date'] = pd.to_datetime(dated_frame['Date'], format='%d-%m-%Y')

# Merge into one dataframe
# Stack the four dated tables row-wise; a column missing from a table is
# filled with NaN for that table's rows. ignore_index=True yields a plain
# RangeIndex. (keys='Date' was previously passed here; a string is iterated
# character by character, so the frames ended up labelled 'D', 'a', 't', 'e'.)
data_frames = [ExerciseTrainingData, Jumps, StrengthTraining, Wellness]
df = pd.concat(data_frames, join='outer', ignore_index=True)

# Attach the per-player training data on (TrainingID, PlayerID), keeping
# unmatched rows from both sides. Overlapping non-key columns get '.x' / '.y'
# suffixes because the duration handling below reads 'Duration.x' and
# 'Duration.y'; the pandas default suffixes would be '_x' / '_y'.
df = pd.merge(
    df,
    PlayerTrainingData,
    on=['TrainingID', 'PlayerID'],
    how='outer',
    suffixes=('.x', '.y'),
)
df = df.drop(columns='DateTime')

# Turn the two merged duration columns into numeric seconds.
#
# Each raw string loses its first five and last three characters before being
# parsed as a timedelta. The original author's note says the strings are
# assumed to look like 'days hh:mm:ss.ms' -- confirm against the data and
# adjust the slice bounds if the format differs.
for seconds_col, raw_col in (
    ('Duration.player', 'Duration.y'),
    ('Duration.exercise', 'Duration.x'),
):
    trimmed = df[raw_col].str.slice(5, -3)
    df[seconds_col] = pd.to_timedelta(trimmed).dt.total_seconds()

# The raw duration strings and the start/end timestamps are not used further.
df = df.drop(columns=['Duration.y', 'Duration.x', 'DateEndTime', 'DateStartTime'])

# Create missing date rows to complete time series
# Build every (Date, PlayerID) combination between the first and last observed
# date, then left-join the data onto that grid. Combinations with no data
# become rows of NaN; rows whose Date/PlayerID is not in the grid are dropped.
#
# A join is used instead of DataFrame.reindex because reindex raises when the
# (Date, PlayerID) index contains duplicate labels, which happens as soon as a
# player has more than one row on a date.
# NOTE(review): such duplicate rows are kept as-is here; if exactly one row
# per player per day is required, aggregate before this step.
date_span = pd.date_range(start=df['Date'].min(), end=df['Date'].max())
all_players = df['PlayerID'].unique()
all_dates = pd.MultiIndex.from_product(
    [date_span, all_players], names=['Date', 'PlayerID']
)
df = all_dates.to_frame(index=False).merge(df, on=['Date', 'PlayerID'], how='left')

# Report missing values: overall count first, then the count per column.
na_mask = df.isna()
col_na = na_mask.sum()
total_na = col_na.sum()

print(total_na)
print(col_na)

# Plot the same boolean mask as a heatmap (rows x columns, no colour bar).
sns.heatmap(na_mask, cbar=False)
plt.show()

# Feature engineering

# Create lagged versions of numeric variables
# Every numeric column except the grouping key (PlayerID) and Injury is
# lagged. The names are filtered out rather than list.remove()'d so that a
# column which is absent or not numeric does not raise ValueError.
excluded_vars = {'PlayerID', 'Injury'}
numeric_vars = [
    col
    for col in df.select_dtypes(include=[np.number]).columns
    if col not in excluded_vars
]

# shift(lag) moves values down by `lag` rows within each player, in the
# current row order; grouping on the PlayerID column needs no index change.
# NOTE(review): this equals a lag in days only if each player has one row per
# consecutive date, in date order -- confirm.
for var in numeric_vars:
    player_series = df.groupby('PlayerID')[var]
    for lag in range(1, 4):
        df[f'{var}_lag_{lag}'] = player_series.shift(lag)

# Rolling sum variables
# For each numeric variable, sum over a window of roll + 1 rows within each
# player (the current row plus the `roll` rows before it).
#
# groupby(...).rolling(...) prepends the group key to the result's index, so
# that level is dropped to get back the frame's own row labels before the
# result is assigned; otherwise it cannot be aligned with df. This relies on
# df having unique row labels, as it does after a reset_index().
for var in numeric_vars:
    player_series = df.groupby('PlayerID')[var]
    for roll in [3, 7]:
        rolled = player_series.rolling(roll + 1).sum()
        df[f'{var}_roll_sum_{roll}'] = rolled.reset_index(level=0, drop=True)

# Keep only the engineered features: drop the raw numeric columns and, if
# present, Injury.
df_final = df.drop(columns=numeric_vars).drop(columns='Injury', errors='ignore')


ParserError: Error tokenizing data. C error: Expected 2 fields in line 4, saw 3
