# Generate student sports data

In [495]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# For testing our work
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [496]:
###############################################################################
#                                                                             #
# Settings. By all means change these.                                        #
#                                                                             #
###############################################################################

num_years = 3 # How many years to generate data for?

first_date = '2022-01-01' # What is the first date to generate (yyyy-mm-dd)

holiday_weeks = [52, 1, 8, 18, 42, 27, 28, 29, 30, 31, 32] # Weeks that are considered to be holidays.

first_school_week = 35

exam_weeks = [4, 8] # Weeks that are considered to be exam weeks

week_effect_coefficient = 0.4 # Strength of the weekly effect (school weeks at the start of a block have higher attendance than subsequent weeks)

ny_resolution_coefficient = 0.04 # Strength of the new years resolution effect (higher attendance in january, drops to 0 in june)

hourly_effect_coefficient = 0.2 # Strength of the hourly effect (hourly attendance increases as the day progresses, except during exam weeks)

holiday_multiplier = 0.5 # Multiplier for holiday weeks

baseline_attendance = 100.0 # Should be a float!!

hour_first = 10 # What is the first hour the gyms are open?
hour_last = 21 # What is the last hour the gyms are open?


## Generate dates

Include info on holidays and weekdays

In [497]:
df = pd.DataFrame()
df['date'] = pd.date_range(start=first_date, periods=365 * num_years, freq="D")
df['weekday'] = df['date'].dt.weekday
df['weeknum'] = df['date'].dt.isocalendar().week
df['is_holiday'] = df['weeknum'].map(lambda x : True if x in holiday_weeks else False )
df['is_weekday'] = df['weekday'].map(lambda x : True if x in [5, 6] else False)


## Generate school weeks


In [498]:
# Generate a map of week numbers to school weeks
week_school_week_map = {}
for weeknum in range (1, 53):
    week_school_week_map[weeknum] = 0
current_week = first_school_week - 1
current_school_week = 1
while True:
    current_week += 1
    if(current_week in holiday_weeks):
        continue

    week_school_week_map[current_week] = current_school_week
    current_school_week += 1
    if current_school_week > 10:
        current_school_week = 1
    if current_week > 52:
        current_week = 1
    if current_week == first_school_week - 1:
        break
# Hack: it's easier to do this manually (weeks before the start of the schoolyear)
week_school_week_map[33] = 0
week_school_week_map[34] = 0
# And this:
week_school_week_map[53] = 0
week_school_week_map[2] = 7
week_school_week_map[3] = 8
week_school_week_map[4] = 9

df['weeknum'] = df['weeknum'].map(int)
df['school_week'] = df['weeknum'].map(lambda x : week_school_week_map[x + 1])
df.loc[df['is_holiday'], 'school_week'] = 0







## Generate exam info

In [499]:
# Generate exam weeks
df['exam_week'] = df.apply(lambda row : True if(row['school_week'] in exam_weeks) else False , axis = 1)


# Generate attendance

Do this for one gym and then copy the data over to multiple gyms (if desired)

In [500]:
df['attendance'] = baseline_attendance

In [501]:
# Generate a linear effect for school weeks where attendance gradually drops as the schoolweek number increases,
# Only do this for non-exam weeks.

df['attendance_week_effect'] = 0.0
df.loc[df['exam_week'] == False, 'attendance_week_effect'] = ((10 - df['school_week']) / 10) * week_effect_coefficient

print("Values for weekly effect (excluding exam weeks):")
df['attendance_week_effect'].unique()



Values for weekly effect (excluding exam weeks):


array([0.4 , 0.  , 0.04, 0.36, 0.32, 0.28, 0.2 , 0.16, 0.12])

In [502]:
# Generate a new year's resolution effect: attendance is highest in january (in non-exam weeks). This effect drops to 0 over 26 weeks
df['attendance_ny_effect'] = 0.0
df['ny_mult'] = (abs(df['weeknum'] - 53) - 26)
df['ny_mult'] = df['ny_mult'] * df['ny_mult'] / 26
df.loc[(df['exam_week'] == False) & (df['weeknum'] < 26), 'attendance_ny_effect'] = ny_resolution_coefficient * df['ny_mult']

print("Values for New Years' Resolution effect (excluding exam weeks):")
df['attendance_ny_effect'].unique()


Values for New Years' Resolution effect (excluding exam weeks):


array([0.        , 1.04      , 0.88615385, 0.81384615, 0.74461538,
       0.67846154, 0.61538462, 0.55538462, 0.49846154, 0.44461538,
       0.39384615, 0.30153846, 0.26      , 0.22153846, 0.18615385,
       0.15384615, 0.12461538, 0.07538462, 0.05538462, 0.03846154,
       0.01384615, 0.00615385])

In [503]:
# TODO TEMP: trim the dataset to make experimenting easier
df = df[(df['date'] >= '2022-01-31') & (df['date'] <= '2022-02-03')]
df

Unnamed: 0,date,weekday,weeknum,is_holiday,is_weekday,school_week,exam_week,attendance,attendance_week_effect,attendance_ny_effect,ny_mult
30,2022-01-31,0,5,False,False,2,False,100.0,0.32,0.744615,18.615385
31,2022-02-01,1,5,False,False,2,False,100.0,0.32,0.744615,18.615385
32,2022-02-02,2,5,False,False,2,False,100.0,0.32,0.744615,18.615385
33,2022-02-03,3,5,False,False,2,False,100.0,0.32,0.744615,18.615385


In [504]:
# Generate hourly data
pd.options.mode.chained_assignment = None
df['hour'] = hour_first

# It turns out copying the whole mess to a regular array and updating that is much, much easier and more efficient than letting pandas handle this.
new_rows = []
for date in df['date'].unique():
    row = df[df['date'] == date].iloc[0]
    new_rows.append(row)
    for hour in range(hour_first + 1, hour_last + 1):
        new_row = row.copy(deep=True)
        # print(f"Adding hour {hour} to date {new_row['date']}")
        new_row.at['hour'] = hour
        new_rows.append(new_row)
pd.options.mode.chained_assignment = "warn"
df = pd.DataFrame(new_rows)



In [505]:
# Generate a linear effect, increasing attendance every hour except during exam weeks
#hourly_effect_coefficient
df['attendance_hourly_effect'] = 0.0
df.loc[df['exam_week'] == False, 'attendance_hourly_effect'] = (24 / df['hour']) * hourly_effect_coefficient
print("Values for hourly effect (excluding exam weeks):")
df['attendance_hourly_effect'].unique()

Values for hourly effect (excluding exam weeks):


array([0.48      , 0.43636364, 0.4       , 0.36923077, 0.34285714,
       0.32      , 0.3       , 0.28235294, 0.26666667, 0.25263158,
       0.24      , 0.22857143])

In [506]:
# Generate holiday multiplier
df['holiday_multiplier'] = 1.0
df.loc[df['is_holiday'], 'holiday_multiplier'] = holiday_multiplier

In [None]:
# Now generate the final attendance
# df['']

In [None]:

# TODO apply a linear effect to hourly data, but only for non-exam weeks
# TODO halve the attendance during holidays
# TODO add a bit of noise
# TODO set school week to NaN when it is 0