<a href="https://colab.research.google.com/github/mioackerman/DS3000-25fall/blob/main/ML_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Cleaning

In [None]:
# Mount Google Drive at /content/drive (Colab only); the notebook's input and
# output paths all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
from collections import defaultdict
import numpy as np
from datetime import timedelta
import random
import pickle
from sklearn.metrics import accuracy_score
import torch
import torch.nn as nn
import ast
from torch.utils.data import Dataset, DataLoader


In [None]:
# Import data
# Folder on the mounted Drive that holds the yearly match-data exports
DRIVE_PATH = '/content/drive/My Drive/OE Data 2020-2025/'

# One CSV per year, 2020 through 2025 inclusive
years = range(2020, 2026)

files = []
for year in years:
    csv_name = f"{year}_LoL_esports_match_data_from_OraclesElixir.csv"
    files.append(os.path.join(DRIVE_PATH, csv_name))

In [None]:
# Create combined dataframe
# Read only the first 30 columns of each yearly CSV; column 2 is forced to str.
dfs = [pd.read_csv(file, usecols=range(30), dtype={2: str}) for file in files]

# Stack the yearly frames; the original per-file row indices are kept as-is
df = pd.concat(dfs)

In [None]:
# Select relevant columns
relevant_columns = ['league', 'teamname', 'date', 'patch', 'game', 'side',
                    'ban1', 'ban2', 'ban3', 'ban4', 'ban5',
                    'pick1', 'pick2', 'pick3', 'pick4', 'pick5']

# Keep only the draft-related columns, drop rows without a fourth pick,
# parse the timestamps and renumber the rows from 0.
df = (
    df[relevant_columns]
    .dropna(subset=['pick4'])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .reset_index(drop=True)
)

In [None]:
# Preview the cleaned frame at this stage: one row per team per game
# (pandas abbreviates the middle rows of a long frame in the rendered output).
df

Unnamed: 0,league,teamname,date,patch,game,side,ban1,ban2,ban3,ban4,ban5,pick1,pick2,pick3,pick4,pick5
0,KeSPA,Liiv SANDBOX,2020-01-03 07:33:26,9.24,1.0,Blue,LeBlanc,Irelia,Rek'Sai,Yasuo,Renekton,Miss Fortune,Rumble,Nautilus,Qiyana,Elise
1,KeSPA,T1,2020-01-03 07:33:26,9.24,1.0,Red,Lucian,Akali,Lee Sin,Olaf,Jarvan IV,Gragas,Xayah,Rakan,Aatrox,Mordekaiser
2,KeSPA,T1,2020-01-03 09:00:58,9.24,2.0,Blue,Syndra,LeBlanc,Rumble,Braum,Leona,Miss Fortune,Jarvan IV,Nautilus,Orianna,Jayce
3,KeSPA,Liiv SANDBOX,2020-01-03 09:00:58,9.24,2.0,Red,Lucian,Akali,Rek'Sai,Irelia,Camille,Varus,Elise,Aatrox,Tahm Kench,Qiyana
4,KeSPA,T1,2020-01-03 10:05:17,9.24,3.0,Blue,LeBlanc,Rumble,Lee Sin,Gragas,Olaf,Rek'Sai,Xayah,Rakan,Mordekaiser,Renekton
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111173,IC,Ramboot Club,2025-11-13 17:57:49,15.22,2.0,Red,Karma,Vi,Ambessa,Lulu,Naafiri,K'Sante,Corki,LeBlanc,Skarner,Nami
111174,IC,Los Heretics,2025-11-13 19:23:13,15.22,1.0,Blue,Zoe,Rumble,Poppy,Rakan,Neeko,Wukong,Ziggs,Alistar,K'Sante,Azir
111175,IC,Nocturne Gale,2025-11-13 19:23:13,15.22,1.0,Red,Xin Zhao,Ryze,Syndra,Akali,Yone,Corki,Vi,Orianna,Nautilus,Ambessa
111176,IC,Nocturne Gale,2025-11-13 20:14:20,15.22,2.0,Blue,Ryze,Xin Zhao,Gwen,Zed,Trundle,Sion,Kai'Sa,Syndra,Poppy,Rakan


In [None]:
# Combine data from both teams in match
#
# Each game is expected to contribute two consecutive rows, one per side.
# Pairing rows purely by position (index // 2) is fragile: if a single team row
# is missing, every later pair would straddle two different games while still
# containing one Blue and one Red row.  Instead, two adjacent rows are only
# merged when they share the same timestamp and game number and have opposite
# sides; a row without a valid partner is skipped on its own so that the rows
# after it stay aligned.
team_rows = df.to_dict('records')
combined_games = []

row_pos = 0
while row_pos + 1 < len(team_rows):
  first, second = team_rows[row_pos], team_rows[row_pos + 1]

  # Missing dates / game numbers never compare equal, so such rows are skipped
  is_same_game = (first['date'] == second['date']
                  and first['game'] == second['game']
                  and {first['side'], second['side']} == {'Blue', 'Red'})
  if not is_same_game:
    row_pos += 1
    continue
  row_pos += 2

  if first['side'] == 'Blue':
    blue_team, red_team = first, second
  else:
    blue_team, red_team = second, first

  combined_row = {'date': blue_team['date'],
                  'patch': blue_team['patch'],
                  'game': blue_team['game'],
                  'blue_team': str(blue_team['teamname']),
                  'red_team': str(red_team['teamname']),
                  # Order-independent key identifying the pairing of teams
                  'team_pair': tuple(sorted([str(blue_team['teamname']),
                                             str(red_team['teamname'])]))}

  for i in range(1, 6):
    combined_row[f'blue_pick{i}'] = blue_team[f'pick{i}']
    combined_row[f'red_pick{i}'] = red_team[f'pick{i}']
    combined_row[f'blue_ban{i}'] = blue_team[f'ban{i}']
    combined_row[f'red_ban{i}'] = red_team[f'ban{i}']

  combined_games.append(combined_row)

# One row per game; games with any missing field (e.g. a skipped ban) are dropped
df = pd.DataFrame(combined_games)
df = df.dropna(axis=0, how="any")

In [None]:
# Relabel significantly reworked champions
#
# Patch in which each rework shipped, as (major, minor).  Patch numbers are
# version strings, not decimals: 14.10 comes after 14.7, and the float literal
# 14.10 is simply 14.1.  Comparing floats therefore mislabels e.g. patches
# 14.2-14.9 as "after 14.10", so versions are compared as integer tuples.
rework_versions = {'Tahm Kench': (11, 13),
                   'Corki': (14, 10),
                   'Skarner': (14, 7)}

# Float view with the same values as before, kept for backward compatibility
rework_patches = {champ: float(f'{major}.{minor}')
                  for champ, (major, minor) in rework_versions.items()}


def _patch_version(patch):
  """Parse a patch such as 14.7, '14.10' or '15.22' into (major, minor) ints.

  Returns None when the value is missing or cannot be parsed.
  """
  if pd.isna(patch):
    return None
  parts = str(patch).split('.')
  try:
    major = int(parts[0])
    minor = int(parts[1]) if len(parts) > 1 else 0
  except ValueError:
    return None
  return major, minor


champion_columns = [col for col in df.columns if 'pick' in col or 'ban' in col]

# NOTE(review): if `patch` was parsed as a float when the CSV was read, the
# strings "14.1" and "14.10" are already indistinguishable (both 14.1) and are
# treated as minor version 1 here.  Reading the column as a string upstream
# would remove that ambiguity.
patch_versions = [_patch_version(patch) for patch in df['patch']]

for champ, rework_version in rework_versions.items():
  predates_rework = pd.Series(
      [version is not None and version < rework_version
       for version in patch_versions],
      index=df.index, dtype=bool)
  for col in champion_columns:
    mask = (df[col] == champ) & predates_rework
    df.loc[mask, col] = f'Old_{champ}'

In [None]:
# Assign match IDs to games.
# Two consecutive games belong to the same match only when all of these hold:
#   1. they are played by the same pair of teams,
#   2. they are at most 6 hours apart,
#   3. no champion picked earlier in the match is picked or banned again.

# Ordering by team pair and then by date keeps the games of a match adjacent
df = df.sort_values(['team_pair', 'date']).reset_index(drop=True)

series_pick_cols = [col for col in df.columns if 'pick' in col]
series_draft_cols = [col for col in df.columns if 'pick' in col or 'ban' in col]

match_ids = []
current_match_id = 1
current_series_picks = set()
previous_row = None

for i, current_row in df.iterrows():
  # The very first game always opens match 1; every other game is compared
  # with the game directly before it.
  if previous_row is not None:
    same_teams = current_row['team_pair'] == previous_row['team_pair']
    within_time = (current_row['date'] - previous_row['date']) <= timedelta(hours=6)
    fearless_violation = any(current_row[col] in current_series_picks
                             for col in series_draft_cols)

    if not (same_teams and within_time and not fearless_violation):
      # Start a new match and forget the picks of the previous one
      current_match_id += 1
      current_series_picks = set()

  match_ids.append(current_match_id)

  # Remember this game's picks for the checks on the following games
  current_series_picks.update(current_row[col] for col in series_pick_cols)
  previous_row = current_row

df['match_id'] = match_ids
# 1-based position of each game inside its match, in the sorted row order
df['game_number'] = df.groupby('match_id').cumcount() + 1


In [None]:
# Define pick order
# Every draft phase is written as the (action, side, slot) steps that happen in
# it; phases with two entries are the double-pick turns of one side.
_draft_phases = [
    [('ban', 'blue', 1)],
    [('ban', 'red', 1)],
    [('ban', 'blue', 2)],
    [('ban', 'red', 2)],
    [('ban', 'blue', 3)],
    [('ban', 'red', 3)],
    [('pick', 'blue', 1)],
    [('pick', 'red', 1), ('pick', 'red', 2)],
    [('pick', 'blue', 2), ('pick', 'blue', 3)],
    [('pick', 'red', 3)],
    [('ban', 'red', 4)],
    [('ban', 'blue', 4)],
    [('ban', 'red', 5)],
    [('ban', 'blue', 5)],
    [('pick', 'red', 4)],
    [('pick', 'blue', 4), ('pick', 'blue', 5)],
    [('pick', 'red', 5)],
]

# phase_order entries are (phase number starting at 1, list of
# (action type, side, dataframe column) tuples), e.g. ('ban', 'blue', 'blue_ban1').
phase_order = [
    (phase_num, [(kind, side, f'{side}_{kind}{slot}') for kind, side, slot in steps])
    for phase_num, steps in enumerate(_draft_phases, start=1)
]

In [None]:
# Create training data
# Every draft action of every game becomes one row: the draft state visible
# just before the action, plus the champion that was actually chosen.
training_rows = []
pick_cols = [c for c in df.columns if 'pick' in c]
df_sorted = df.sort_values(['match_id', 'game_number'])


def _draft_state_row(game, blue_picks, red_picks, bans, action_type, acting_team, target):
    """Build one training row from the current draft state and the action taken."""
    return {
        'match_id': game.match_id,
        'patch': game.patch,
        'blue_team': game.blue_team,
        'red_team': game.red_team,
        'blue_pick_space': tuple(sorted(blue_picks)),
        'red_pick_space': tuple(sorted(red_picks)),
        'ban_space': tuple(sorted(bans)),
        'action_type': action_type,
        'acting_team': acting_team,
        'target': target
    }


for _, series_games in df_sorted.groupby('match_id'):
    # Champions picked in earlier games of this match
    previous_game_picks = set()

    for game in series_games.sort_values('game_number').itertuples(index=False):
        # Earlier picks of the match start out in the ban space of each game
        ban_space = set(previous_game_picks)
        blue_pick_space = set()
        red_pick_space = set()

        for _, actions in phase_order:
            is_double_pick = len(actions) == 2 and all(a[0] == 'pick' for a in actions)

            if not is_double_pick:
                # Single action: record the state, then apply the action
                for action_type, team, column in actions:
                    champion = getattr(game, column)
                    training_rows.append(_draft_state_row(
                        game, blue_pick_space, red_pick_space, ban_space,
                        action_type, team, champion))

                    if action_type == 'ban':
                        ban_space.add(champion)
                    elif team == 'blue':
                        blue_pick_space.add(champion)
                    else:
                        red_pick_space.add(champion)
                continue

            # Two picks made in the same turn: emit rows for both possible
            # orderings, each starting from the state before the turn.
            (_, team1, col1), (_, team2, col2) = actions
            champ1 = getattr(game, col1)
            champ2 = getattr(game, col2)

            orderings = (
                ((champ1, team1), (champ2, team2)),
                ((champ2, team2), (champ1, team1)),
            )
            for (first_champ, first_team), (second_champ, second_team) in orderings:
                cur_blue = set(blue_pick_space)
                cur_red = set(red_pick_space)

                training_rows.append(_draft_state_row(
                    game, cur_blue, cur_red, ban_space,
                    'pick', first_team, first_champ))

                # The second pick of the turn sees the first one as already taken
                (cur_blue if first_team == 'blue' else cur_red).add(first_champ)

                training_rows.append(_draft_state_row(
                    game, cur_blue, cur_red, ban_space,
                    'pick', second_team, second_champ))

            # Apply both picks to the real draft state
            if team1 == 'blue':
                blue_pick_space.update([champ1, champ2])
            else:
                red_pick_space.update([champ1, champ2])

        # Carry this game's picks over to the next game of the match
        for col in pick_cols:
            previous_game_picks.add(getattr(game, col))

In [None]:
# Persist the per-action training rows to Google Drive as a pickle
file_path = '/content/drive/My Drive/training_data.pkl'
with open(file_path, mode='wb') as pickle_file:
    pickle.dump(training_rows, pickle_file)

In [None]:
# Build one record per match: its metadata plus the champions of all its games,
# concatenated game by game in draft order.
match_count = df["match_id"].nunique()

# Draft columns in the order the actions happen, taken from phase_order so the
# sequence cannot drift from the order used elsewhere in the notebook.
draft_columns = [column for _, actions in phase_order for _, _, column in actions]

match_df = []

# A single groupby pass replaces the repeated full-frame scans per match/game
# and does not require the match ids to be exactly 1..match_count.
for i, match_games in df.groupby("match_id"):
    match_games = match_games.sort_values("game_number")
    match_df.append({
      "match_id": int(i),
      # Metadata is taken from the first game of the match
      "patch": match_games["patch"].iloc[0],
      "blue_team": match_games["blue_team"].iloc[0],
      "red_team": match_games["red_team"].iloc[0],
      # Row-major flatten: all 20 draft actions of game 1, then game 2, ...
      "champion_seq": match_games[draft_columns].to_numpy().ravel().tolist()
    })


In [None]:
# Persist the per-match champion sequences to Google Drive as a pickle
file_path = '/content/drive/My Drive/training_data_sequential.pkl'
with open(file_path, mode='wb') as pickle_file:
    pickle.dump(match_df, pickle_file)

In [None]:
# Show only the first few match records; displaying the whole list would
# flood the cell output with one dict per match.
match_df[:3]

In [None]:
# NOTE(review): `i` is not defined in this cell; it is whatever value an earlier
# loop left behind.  `blue_team` holds team names built with str(...), so
# comparing it with a loop counter looks like leftover debugging -- confirm the
# intent or remove this cell.
df.loc[df["blue_team"] == i]["blue_team"]

1