In [1]:
import pandas as pd
from lib.data_utils import *
from tqdm import tqdm

In [2]:
def convert_to_string(row):
    return f'{row["subtype_name"]} {row["goal"]} {row["accurate"]} {row["is_home_team"]} {row["period"]} {row["minute"]} {row["second"]} {row["x"]} {row["y"]} {row["home_score"]} {row["away_score"]}'

def convert_to_string_label(row):
    return f'{row["subtype_name"]} {row["goal"]} {row["accurate"]} {row["is_home_team"]} {row["time_elapsed"]} {row["x"]} {row["y"]}'

In [3]:
data = []
for dataset_fname in ['data/wyscout/csv/events/Germany.csv', 'data/wyscout/csv/events/France.csv','data/wyscout/csv/events/Italy.csv']:
    df = load_data(dataset_fname)
    df['is_home_team'] = df['team_id'] == df['home_team_id']

    df['home_score'] = (
        ((df.subtype_name == 'free_kick_shot') & (df.goal == 1) & (df.team_id == df.home_team_id)) |
        ((df.subtype_name == 'penalty') & (df.goal == 1) & (df.team_id == df.home_team_id)) |
        ((df.subtype_name == 'shot') & (df.goal == 1) & (df.team_id == df.home_team_id)) |
        ((df.type_name.isin(['others_on_the_ball', 'pass'])) & (df.own_goal == 1) & (df.team_id == df.away_team_id))
        ).cumsum()
    df['home_score'] = df['home_score'] - df['match_id'].map(df.groupby('match_id')['home_score'].min())
    df['away_score'] = (
        ((df.subtype_name == 'free_kick_shot') & (df.goal == 1) & (df.team_id == df.away_team_id)) |
        ((df.subtype_name == 'penalty') & (df.goal == 1) & (df.team_id == df.away_team_id)) |
        ((df.subtype_name == 'shot') & (df.goal == 1) & (df.team_id == df.away_team_id)) |
        ((df.type_name.isin(['others_on_the_ball', 'pass'])) & (df.own_goal == 1) & (df.team_id == df.home_team_id))
        ).cumsum()
    df['away_score'] = df['away_score'] - df['match_id'].map(df.groupby('match_id')['away_score'].min())

    data.append(df[['match_id', 'subtype_name', 'period', 'minute', 'second', 'x', 'y', 'is_home_team', 'accurate', 'goal', 'home_score', 'away_score']])

df = pd.concat(data)
df['goal'] = df['goal'].astype(int)
df['accurate'] = df['accurate'].astype(int)
df['is_home_team'] = df['is_home_team'].astype(int)
df.loc[df.subtype_name == 0, 'subtype_name'] = 'none'

df['time_elapsed'] = (((df['minute'] * 60 + df['second']) - (df['minute'].shift(1) * 60 + df['second'].shift(1))) * (df['period'] == df['period'].shift(1))).clip(0, 100).fillna(0).astype(int)
 
df.shape

(1799586, 13)

In [4]:
out = open('data/llm/events_train.txt', 'w')
for i in tqdm(range(len(df))):
    if i == len(df)-1:
        out.write(convert_to_string(df.iloc[i]) + ' <GAME_OVER> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN>\n')
    elif df.iloc[i]['match_id'] != df.iloc[i+1]['match_id']:
        out.write(convert_to_string(df.iloc[i]) + ' <GAME_OVER> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN>\n')
    elif df.iloc[i]['period'] != df.iloc[i+1]['period']:
        out.write(convert_to_string(df.iloc[i]) + ' <PERIOD_OVER> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN>\n')
    else:
        out.write(convert_to_string(df.iloc[i]) + ' ' + convert_to_string_label(df.iloc[i+1]) + '\n')
out.close()

100%|██████████| 1799586/1799586 [06:26<00:00, 4651.89it/s]


In [17]:
out = open('data/llm/events_train_k3.txt', 'w')
k = 3
for i in tqdm(range(len(df))):
    if i == len(df)-k:
        for j in range(k):
            out.write(convert_to_string(df.iloc[i+j]) + ' ')
        out.write('<GAME_OVER> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN>\n')
        break
    elif df.iloc[i+k-1]['match_id'] != df.iloc[i+k]['match_id']:
        for j in range(k):
            out.write(convert_to_string(df.iloc[i+j]) + ' ')
        out.write('<GAME_OVER> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN>\n')
    elif df.iloc[i+k-1]['period'] != df.iloc[i+k]['period']:
        for j in range(k):
            out.write(convert_to_string(df.iloc[i+j]) + ' ')
        out.write('<PERIOD_OVER> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN>\n')
    else:
        match_id = df.iloc[i]['match_id']

        string = convert_to_string(df.iloc[i])
        for j in range(k):
            if j == (k-1):
                string += ' ' + convert_to_string_label(df.iloc[i+j+1])
            else:
                string += ' ' + convert_to_string(df.iloc[i+j+1])
            if match_id != df.iloc[i+j+1]['match_id']:
                match_id = df.iloc[i+j+1]['match_id']
                string = ''
                for w in range(j+2):
                    string += '<NaN> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN>'
                    if w != j+1:
                        string += ' '
        
        out.write(string + '\n')
out.close()

100%|█████████▉| 1799583/1799586 [14:26<00:00, 2078.03it/s]


In [4]:
data = []
for dataset_fname in ['data/wyscout/csv/events/England.csv', 'data/wyscout/csv/events/Spain.csv']:
    df = load_data(dataset_fname)
    df['is_home_team'] = df['team_id'] == df['home_team_id']

    df['home_score'] = (
        ((df.subtype_name == 'free_kick_shot') & (df.goal == 1) & (df.team_id == df.home_team_id)) |
        ((df.subtype_name == 'penalty') & (df.goal == 1) & (df.team_id == df.home_team_id)) |
        ((df.subtype_name == 'shot') & (df.goal == 1) & (df.team_id == df.home_team_id)) |
        ((df.type_name.isin(['others_on_the_ball', 'pass'])) & (df.own_goal == 1) & (df.team_id == df.away_team_id))
        ).cumsum()
    df['home_score'] = df['home_score'] - df['match_id'].map(df.groupby('match_id')['home_score'].min())
    df['away_score'] = (
        ((df.subtype_name == 'free_kick_shot') & (df.goal == 1) & (df.team_id == df.away_team_id)) |
        ((df.subtype_name == 'penalty') & (df.goal == 1) & (df.team_id == df.away_team_id)) |
        ((df.subtype_name == 'shot') & (df.goal == 1) & (df.team_id == df.away_team_id)) |
        ((df.type_name.isin(['others_on_the_ball', 'pass'])) & (df.own_goal == 1) & (df.team_id == df.home_team_id))
        ).cumsum()
    df['away_score'] = df['away_score'] - df['match_id'].map(df.groupby('match_id')['away_score'].min())

    data.append(df[['match_id', 'subtype_name', 'period', 'minute', 'second', 'x', 'y', 'is_home_team', 'accurate', 'goal', 'home_score', 'away_score']])

df = pd.concat(data)
df['goal'] = df['goal'].astype(int)
df['accurate'] = df['accurate'].astype(int)
df['is_home_team'] = df['is_home_team'].astype(int)
df.loc[df.subtype_name == 0, 'subtype_name'] = 'none'

df['time_elapsed'] = (((df['minute'] * 60 + df['second']) - (df['minute'].shift(1) * 60 + df['second'].shift(1))) * (df['period'] == df['period'].shift(1))).clip(0, 100).fillna(0).astype(int)

df.shape

(1271809, 13)

In [8]:
out = open('data/llm/events_test.txt', 'w')
for i in tqdm(range(len(df))):
    if i == len(df)-1:
        out.write(convert_to_string(df.iloc[i]) + ' <GAME_OVER> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN>\n')
    elif df.iloc[i]['match_id'] != df.iloc[i+1]['match_id']:
        out.write(convert_to_string(df.iloc[i]) + ' <GAME_OVER> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN>\n')
    elif df.iloc[i]['period'] != df.iloc[i+1]['period']:
        out.write(convert_to_string(df.iloc[i]) + ' <PERIOD_OVER> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN>\n')
    else:
        out.write(convert_to_string(df.iloc[i]) + ' ' + convert_to_string_label(df.iloc[i+1]) + '\n')
out.close()

100%|██████████| 1271809/1271809 [05:11<00:00, 4079.94it/s]


In [5]:
out = open('data/llm/events_test_k3.txt', 'w')
k = 3
for i in tqdm(range(len(df))):
    if i == len(df)-k:
        for j in range(k):
            out.write(convert_to_string(df.iloc[i+j]) + ' ')
        out.write('<GAME_OVER> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN>\n')
        break
    elif df.iloc[i+k-1]['match_id'] != df.iloc[i+k]['match_id']:
        for j in range(k):
            out.write(convert_to_string(df.iloc[i+j]) + ' ')
        out.write('<GAME_OVER> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN>\n')
    elif df.iloc[i+k-1]['period'] != df.iloc[i+k]['period']:
        for j in range(k):
            out.write(convert_to_string(df.iloc[i+j]) + ' ')
        out.write('<PERIOD_OVER> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN>\n')
    else:
        match_id = df.iloc[i]['match_id']

        string = convert_to_string(df.iloc[i])
        for j in range(k):
            if j == (k-1):
                string += ' ' + convert_to_string_label(df.iloc[i+j+1])
            else:
                string += ' ' + convert_to_string(df.iloc[i+j+1])
            if match_id != df.iloc[i+j+1]['match_id']:
                match_id = df.iloc[i+j+1]['match_id']
                string = ''
                for w in range(j+2):
                    string += '<NaN> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN> <NaN>'
                    if w != j+1:
                        string += ' '
        
        out.write(string + '\n')
out.close()

100%|█████████▉| 1271806/1271809 [09:05<00:00, 2329.85it/s]


In [10]:
df = pd.read_csv('data/llm/events_test.txt', sep=' ', header=None, dtype=str)
df = df.fillna('<NaN>')

In [11]:
train_df = []
for i in range(11, 22):
    _df = df.copy()
    _df['target'] = _df[i]

    train_df.append(_df.sample(1000, random_state=42))
    train_df[-1].to_csv(f'data/llm/samples/events_test_k1_{i}.txt', sep=' ', header=False, index=False)

KeyError: 18

In [None]:
df = pd.read_csv('data/llm/events_test_k3.txt', sep=' ', header=None, dtype=str)
df = df.fillna('<NaN>')

In [None]:
for i in range(33, 44):
    _df = df.copy()
    _df['target'] = _df[i]

    _df.loc[train_df[i-33].index].to_csv(f'data/llm/samples/events_test_k3_{i}.txt', sep=' ', header=False, index=False)