In [18]:
import os
import json
import pandas as pd

# folder path
match_details_folder = "../data/raw/match_details"

# only want to grab json files with over 20MB
size_threshold = 20 * 1024 * 1024  # 20MB in bytes

all_match_data = []

# Iterate over files in the match details folder
for filename in os.listdir(match_details_folder):
    filepath = os.path.join(match_details_folder, filename)
    
    # Check if the file is a JSON file and if its size exceeds the threshold
    if filename.endswith(".json") and os.path.getsize(filepath) > size_threshold:
        # Load the JSON data
        with open(filepath, "r") as f:
            match_data = json.load(f)
            all_match_data.extend(match_data)  

df = pd.DataFrame(all_match_data)

print(df.head())

          matchId  gameDuration  championId championName  teamId  \
0  NA1_5151182315          1835          98         Shen     100   
1  NA1_5151182315          1835         104       Graves     100   
2  NA1_5151182315          1835         161       Velkoz     100   
3  NA1_5151182315          1835         147    Seraphine     100   
4  NA1_5151182315          1835          43        Karma     100   

  individualPosition  kills  deaths  assists    win  ...  item_1  \
0                TOP      3       5        9  False  ...    3748   
1             JUNGLE     17       4       10  False  ...    6676   
2             MIDDLE      7       6        7  False  ...    3158   
3             BOTTOM      3       5       16  False  ...    2503   
4            UTILITY      4       7       13  False  ...    6653   

   item_purchase_time_1  item_2  item_purchase_time_2 item_3  \
0                   882    3066                   885   3158   
1                   469    3033                  1216 

In [20]:
df['championName'].value_counts()

championName
Caitlyn         1929
Graves          1683
Jhin            1501
Ashe            1475
Corki           1421
                ... 
Nilah            157
Trundle          137
Heimerdinger     116
Rammus           106
Quinn            106
Name: count, Length: 169, dtype: int64

In [22]:
df.dtypes

matchId                 object
gameDuration             int64
championId               int64
championName            object
teamId                   int64
individualPosition      object
kills                    int64
deaths                   int64
assists                  int64
win                       bool
goldEarned               int64
totalDamageDealt         int64
totalDamageTaken         int64
totalHeal                int64
matchupChampion         object
primaryRune             object
secondaryRune           object
item_0                   int64
item_purchase_time_0    object
item_1                   int64
item_purchase_time_1    object
item_2                   int64
item_purchase_time_2    object
item_3                   int64
item_purchase_time_3    object
item_4                   int64
item_purchase_time_4    object
item_5                   int64
item_purchase_time_5    object
dtype: object

In [10]:
df.columns

Index(['matchId', 'gameDuration', 'championId', 'championName', 'teamId',
       'individualPosition', 'kills', 'deaths', 'assists', 'win', 'goldEarned',
       'totalDamageDealt', 'totalDamageTaken', 'totalHeal', 'matchupChampion',
       'primaryRune', 'secondaryRune', 'item_0', 'item_purchase_time_0',
       'item_1', 'item_purchase_time_1', 'item_2', 'item_purchase_time_2',
       'item_3', 'item_purchase_time_3', 'item_4', 'item_purchase_time_4',
       'item_5', 'item_purchase_time_5'],
      dtype='object')

In [24]:
df[['championName', 'matchupChampion']]

Unnamed: 0,championName,matchupChampion
0,Shen,Nasus
1,Graves,Udyr
2,Velkoz,Syndra
3,Seraphine,Samira
4,Karma,Nami
...,...,...
89995,Chogath,Ornn
89996,Lillia,Shaco
89997,Yone,Ahri
89998,Ashe,Caitlyn


In [26]:
df.head()

Unnamed: 0,matchId,gameDuration,championId,championName,teamId,individualPosition,kills,deaths,assists,win,...,item_1,item_purchase_time_1,item_2,item_purchase_time_2,item_3,item_purchase_time_3,item_4,item_purchase_time_4,item_5,item_purchase_time_5
0,NA1_5151182315,1835,98,Shen,100,TOP,3,5,9,False,...,3748,882,3066,885,3158,446,3076,1620,1031,1818
1,NA1_5151182315,1835,104,Graves,100,JUNGLE,17,4,10,False,...,6676,469,3033,1216,3031,1605,3009,604,3082,1753
2,NA1_5151182315,1835,161,Velkoz,100,MIDDLE,7,6,7,False,...,3158,1229,6655,736,3089,1689,4645,1033,0,Unknown Time
3,NA1_5151182315,1835,147,Seraphine,100,BOTTOM,3,5,16,False,...,2503,847,3040,Unknown Time,3158,1255,3147,1468,1052,1578
4,NA1_5151182315,1835,43,Karma,100,UTILITY,4,7,13,False,...,6653,765,3111,921,3118,1225,1058,1678,1058,Unknown Time


In [37]:
new_df = df

## Need to do handling for unknown times on core items (Legendary, Boots, Starter)

In [53]:
# Load item data to get item costs
with open("../data/raw/item_data/items.json", 'r') as f:
    items_data = json.load(f)
    items = items_data["data"]

# Function to get item cost
def get_item_cost(item_id):
    item_info = items.get(str(item_id), {})
    return item_info.get("gold", {}).get("total", 0)

# Iterate through each row to estimate missing purchase times
for index, row in new_df.iterrows():
    for i in range(6):
        item_col = f"item_{i}"
        time_col = f"item_purchase_time_{i}"

        item_id = row[item_col]
        purchase_time = row[time_col]
        item_cost = get_item_cost(item_id)

        # Fill "Unknown Time" for important items (cost > 1500 or boots or starter items)
        if purchase_time == "Unknown Time" and (item_cost > 2100 or item_id in [1001, 2422, 3111, 3158, 3006, 3020] or item_id in [1054, 1055, 1056]):
            # Get the previous known purchase times
            previous_times = [int(row[f"item_purchase_time_{j}"]) for j in range(i) if row[f"item_purchase_time_{j}"] != "Unknown Time"]
            
            # Estimate purchase time based on the previous purchases
            if len(previous_times) > 0:
                estimated_time = max(previous_times) + (item_cost // 100) * 30  # Estimate based on item cost
            else:
                estimated_time = 900  # Default to 15 minutes if no prior times are available

            # Update the DataFrame with the estimated time
            new_df.at[index, time_col] = estimated_time

print("Missing item purchase times have been estimated and updated.")

Missing item purchase times have been estimated and updated.


In [55]:
new_df.head()

Unnamed: 0,matchId,gameDuration,championId,championName,teamId,individualPosition,kills,deaths,assists,win,...,item_1,item_purchase_time_1,item_2,item_purchase_time_2,item_3,item_purchase_time_3,item_4,item_purchase_time_4,item_5,item_purchase_time_5
0,NA1_5151182315,1835,98,Shen,100,TOP,3,5,9,False,...,3748,882,3066,885,3158,446,3076,1620,1031,1818
1,NA1_5151182315,1835,104,Graves,100,JUNGLE,17,4,10,False,...,6676,469,3033,1216,3031,1605,3009,604,3082,1753
2,NA1_5151182315,1835,161,Velkoz,100,MIDDLE,7,6,7,False,...,3158,1229,6655,736,3089,1689,4645,1033,0,Unknown Time
3,NA1_5151182315,1835,147,Seraphine,100,BOTTOM,3,5,16,False,...,2503,847,3040,1717,3158,1255,3147,1468,1052,1578
4,NA1_5151182315,1835,43,Karma,100,UTILITY,4,7,13,False,...,6653,765,3111,921,3118,1225,1058,1678,1058,Unknown Time


In [81]:
# Define item IDs
boot_ids = [1001, 3009, 3111, 3158, 3006, 3020]

# Create a new DataFrame for holding only the essential items
filtered_df = new_df.copy()

# Function to find boots and two earliest legendary items (over 2100 gold)
def filter_important_items(row):
    items_with_time = []
    boots, legendary_items = None, []

    for i in range(6):
        item_col = f"item_{i}"
        time_col = f"item_purchase_time_{i}"

        item_id = row[item_col]
        item_purchase_time = row[time_col]
        item_cost = get_item_cost(item_id)

        # Boots
        if item_id in boot_ids:
            boots = (item_id, item_purchase_time)

        # Legendary items
        if item_cost > 2100:
            items_with_time.append((item_id, item_purchase_time))

    # Sort legendary items by purchase time to find the first two
    items_with_time = sorted(items_with_time, key=lambda x: int(x[1]) if isinstance(x[1], int) or str(x[1]).isdigit() else float('inf'))
    legendary_items = items_with_time[:2]

    # Updating the filtered DataFrame
    filtered_items = ([boots] if boots else []) + legendary_items
    return {
        "Boots": boots if boots else (None, None),
        "Legendary_1": legendary_items[0] if len(legendary_items) > 0 else (None, None),
        "Legendary_2": legendary_items[1] if len(legendary_items) > 1 else (None, None),
    }

# Apply the filter function to each row
filtered_items_dicts = filtered_df.apply(filter_important_items, axis=1)

# Extract the filtered items to new columns in the filtered DataFrame
important_items = ["Boots", "Legendary_1", "Legendary_2"]
for important_item in important_items:
    filtered_df[f"{important_item}_id"] = filtered_items_dicts.apply(lambda x: x[important_item][0])
    filtered_df[f"{important_item}_purchase_time"] = filtered_items_dicts.apply(lambda x: x[important_item][1])

# Drop original item columns
filtered_df.drop(columns=[f"item_{i}" for i in range(6)] + [f"item_purchase_time_{i}" for i in range(6)], inplace=True)

print("Filtered DataFrame with only the important items is ready.")

Filtered DataFrame with only the important items is ready.


In [83]:
filtered_df.head()

Unnamed: 0,matchId,gameDuration,championId,championName,teamId,individualPosition,kills,deaths,assists,win,...,totalHeal,matchupChampion,primaryRune,secondaryRune,Boots_id,Boots_purchase_time,Legendary_1_id,Legendary_1_purchase_time,Legendary_2_id,Legendary_2_purchase_time
0,NA1_5151182315,1835,98,Shen,100,TOP,3,5,9,False,...,3710,Nasus,"{'description': 'primaryStyle', 'selections': ...","{'description': 'subStyle', 'selections': [{'p...",3158.0,446.0,3748.0,882.0,2502.0,1365.0
1,NA1_5151182315,1835,104,Graves,100,JUNGLE,17,4,10,False,...,12540,Udyr,"{'description': 'primaryStyle', 'selections': ...","{'description': 'subStyle', 'selections': [{'p...",3009.0,604.0,6676.0,469.0,6673.0,906.0
2,NA1_5151182315,1835,161,Velkoz,100,MIDDLE,7,6,7,False,...,1824,Syndra,"{'description': 'primaryStyle', 'selections': ...","{'description': 'subStyle', 'selections': [{'p...",3158.0,1229.0,6655.0,736.0,4645.0,1033.0
3,NA1_5151182315,1835,147,Seraphine,100,BOTTOM,3,5,16,False,...,3550,Samira,"{'description': 'primaryStyle', 'selections': ...","{'description': 'subStyle', 'selections': [{'p...",3158.0,1255.0,2503.0,847.0,3040.0,1717.0
4,NA1_5151182315,1835,43,Karma,100,UTILITY,4,7,13,False,...,1719,Nami,"{'description': 'primaryStyle', 'selections': ...","{'description': 'subStyle', 'selections': [{'p...",3111.0,921.0,6653.0,765.0,3118.0,1225.0


In [85]:
filtered_df.dtypes

matchId                       object
gameDuration                   int64
championId                     int64
championName                  object
teamId                         int64
individualPosition            object
kills                          int64
deaths                         int64
assists                        int64
win                             bool
goldEarned                     int64
totalDamageDealt               int64
totalDamageTaken               int64
totalHeal                      int64
matchupChampion               object
primaryRune                   object
secondaryRune                 object
Boots_id                     float64
Boots_purchase_time          float64
Legendary_1_id               float64
Legendary_1_purchase_time    float64
Legendary_2_id               float64
Legendary_2_purchase_time    float64
dtype: object

In [95]:
df['secondaryRune'].head()

0       {'description': 'subStyle', 'selections': [{'perk': 8106, 'var1': 5, 'var2': 31, 'var3': 0}, {'perk': 8126, 'var1': 773, 'var2': 0, 'var3': 0}], 'style': 8100}
1    {'description': 'subStyle', 'selections': [{'perk': 9111, 'var1': 1954, 'var2': 540, 'var3': 0}, {'perk': 9104, 'var1': 8, 'var2': 30, 'var3': 0}], 'style': 8000}
2       {'description': 'subStyle', 'selections': [{'perk': 8126, 'var1': 468, 'var2': 0, 'var3': 0}, {'perk': 8106, 'var1': 5, 'var2': 31, 'var3': 0}], 'style': 8100}
3     {'description': 'subStyle', 'selections': [{'perk': 9105, 'var1': 16, 'var2': 10, 'var3': 0}, {'perk': 8009, 'var1': 5235, 'var2': 0, 'var3': 0}], 'style': 8000}
4         {'description': 'subStyle', 'selections': [{'perk': 8304, 'var1': 9, 'var2': 0, 'var3': 0}, {'perk': 8410, 'var1': 65, 'var2': 0, 'var3': 0}], 'style': 8300}
Name: secondaryRune, dtype: object

In [99]:
# Load the runes data from the JSON file
with open("../data/raw/runes_data/runes.json", "r") as f:
    runes_data = json.load(f)

# Create a dictionary to map rune IDs to their names
rune_dict = {}
for rune_tree in runes_data:
    for slot in rune_tree["slots"]:
        for rune in slot["runes"]:
            rune_dict[rune["id"]] = rune["name"]

import ast  # To parse string representations of dictionaries if needed

# Function to clean up rune data by mapping rune IDs to their names
def clean_rune_data(rune_column, rune_dict):
    readable_runes = []

    # Iterate through the data in the column
    for row in rune_column:
        # Convert the string representation to a dictionary if necessary
        if isinstance(row, str):
            row = ast.literal_eval(row)

        # Extract rune information
        rune_style = rune_dict.get(row['style'], "Unknown Style")
        selections = row.get('selections', [])
        selection_names = [
            rune_dict.get(selection['perk'], "Unknown Rune") for selection in selections
        ]

        # Create a cleaned-up representation of the runes
        readable_runes.append({
            "style": rune_style,
            "selections": selection_names
        })

    return readable_runes

# Clean up the primaryRune and secondaryRune columns
filtered_df["primaryRune_clean"] = clean_rune_data(filtered_df["primaryRune"], rune_dict)
filtered_df["secondaryRune_clean"] = clean_rune_data(filtered_df["secondaryRune"], rune_dict)

# Drop original columns if no longer needed
filtered_df.drop(columns=["primaryRune", "secondaryRune"], inplace=True)

# Display a few rows to see the cleaned data
print(filtered_df[["primaryRune_clean", "secondaryRune_clean"]].head())

                                                                                                        primaryRune_clean  \
0          {'style': 'Unknown Style', 'selections': ['Grasp of the Undying', 'Shield Bash', 'Second Wind', 'Revitalize']}   
1  {'style': 'Unknown Style', 'selections': ['Dark Harvest', 'Sudden Impact', 'Eyeball Collection', 'Relentless Hunter']}   
2                  {'style': 'Unknown Style', 'selections': ['Arcane Comet', 'Manaflow Band', 'Transcendence', 'Scorch']}   
3          {'style': 'Unknown Style', 'selections': ['Summon Aery', 'Manaflow Band', 'Transcendence', 'Gathering Storm']}   
4              {'style': 'Unknown Style', 'selections': ['Arcane Comet', 'Manaflow Band', 'Celerity', 'Gathering Storm']}   

                                                                   secondaryRune_clean  
0          {'style': 'Unknown Style', 'selections': ['Ultimate Hunter', 'Cheap Shot']}  
1            {'style': 'Unknown Style', 'selections': ['Triumph', 'Lege

In [101]:
filtered_df

Unnamed: 0,matchId,gameDuration,championId,championName,teamId,individualPosition,kills,deaths,assists,win,...,totalHeal,matchupChampion,Boots_id,Boots_purchase_time,Legendary_1_id,Legendary_1_purchase_time,Legendary_2_id,Legendary_2_purchase_time,primaryRune_clean,secondaryRune_clean
0,NA1_5151182315,1835,98,Shen,100,TOP,3,5,9,False,...,3710,Nasus,3158.0,446.0,3748.0,882.0,2502.0,1365.0,"{'style': 'Unknown Style', 'selections': ['Grasp of the Undying', 'Shield Bash', 'Second Wind', 'Revitalize']}","{'style': 'Unknown Style', 'selections': ['Ultimate Hunter', 'Cheap Shot']}"
1,NA1_5151182315,1835,104,Graves,100,JUNGLE,17,4,10,False,...,12540,Udyr,3009.0,604.0,6676.0,469.0,6673.0,906.0,"{'style': 'Unknown Style', 'selections': ['Dark Harvest', 'Sudden Impact', 'Eyeball Collection', 'Relentless Hunter']}","{'style': 'Unknown Style', 'selections': ['Triumph', 'Legend: Alacrity']}"
2,NA1_5151182315,1835,161,Velkoz,100,MIDDLE,7,6,7,False,...,1824,Syndra,3158.0,1229.0,6655.0,736.0,4645.0,1033.0,"{'style': 'Unknown Style', 'selections': ['Arcane Comet', 'Manaflow Band', 'Transcendence', 'Scorch']}","{'style': 'Unknown Style', 'selections': ['Cheap Shot', 'Ultimate Hunter']}"
3,NA1_5151182315,1835,147,Seraphine,100,BOTTOM,3,5,16,False,...,3550,Samira,3158.0,1255.0,2503.0,847.0,3040.0,1717.0,"{'style': 'Unknown Style', 'selections': ['Summon Aery', 'Manaflow Band', 'Transcendence', 'Gathering Storm']}","{'style': 'Unknown Style', 'selections': ['Legend: Haste', 'Presence of Mind']}"
4,NA1_5151182315,1835,43,Karma,100,UTILITY,4,7,13,False,...,1719,Nami,3111.0,921.0,6653.0,765.0,3118.0,1225.0,"{'style': 'Unknown Style', 'selections': ['Arcane Comet', 'Manaflow Band', 'Celerity', 'Gathering Storm']}","{'style': 'Unknown Style', 'selections': ['Magical Footwear', 'Approach Velocity']}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89995,NA1_5157346215,1540,31,Chogath,200,TOP,2,9,0,False,...,3714,Ornn,,,3084.0,828.0,6664.0,1250.0,"{'style': 'Unknown Style', 'selections': ['Grasp of the Undying', 'Demolish', 'Conditioning', 'Overgrowth']}","{'style': 'Unknown Style', 'selections': ['Legend: Alacrity', 'Cut Down']}"
89996,NA1_5157346215,1540,876,Lillia,200,JUNGLE,4,4,0,False,...,17700,Shaco,3158.0,1227.0,6653.0,719.0,3116.0,1057.0,"{'style': 'Unknown Style', 'selections': ['Conqueror', 'Triumph', 'Legend: Haste', 'Cut Down']}","{'style': 'Unknown Style', 'selections': ['Magical Footwear', 'Cosmic Insight']}"
89997,NA1_5157346215,1540,777,Yone,200,MIDDLE,0,9,2,False,...,5663,Ahri,3006.0,418.0,3153.0,906.0,6673.0,1395.0,"{'style': 'Unknown Style', 'selections': ['Fleet Footwork', 'Absorb Life', 'Legend: Alacrity', 'Cut Down']}","{'style': 'Unknown Style', 'selections': ['Second Wind', 'Shield Bash']}"
89998,NA1_5157346215,1540,22,Ashe,200,BOTTOM,7,5,3,False,...,2352,Caitlyn,3006.0,292.0,6672.0,682.0,3046.0,1073.0,"{'style': 'Unknown Style', 'selections': ['Lethal Tempo', 'Triumph', 'Legend: Alacrity', 'Coup de Grace']}","{'style': 'Unknown Style', 'selections': ['Biscuit Delivery', 'Approach Velocity']}"


In [103]:
runes_fix_df = filtered_df.copy()

# Function to split the primary and secondary rune data into individual columns
def split_rune_data(row):
    primary = row['primaryRune_clean']['selections']
    secondary = row['secondaryRune_clean']['selections']
    
    # Ensure we have the correct number of runes (handling possible missing entries)
    keystone = primary[0] if len(primary) > 0 else None
    primary_slot_1 = primary[1] if len(primary) > 1 else None
    primary_slot_2 = primary[2] if len(primary) > 2 else None
    primary_slot_3 = primary[3] if len(primary) > 3 else None
    secondary_slot_1 = secondary[0] if len(secondary) > 0 else None
    secondary_slot_2 = secondary[1] if len(secondary) > 1 else None

    return pd.Series([keystone, primary_slot_1, primary_slot_2, primary_slot_3, secondary_slot_1, secondary_slot_2])

# Apply the function to split the rune data into new columns
rune_columns = runes_fix_df.apply(split_rune_data, axis=1)

# Rename the new columns
rune_columns.columns = ['Keystone', 'PrimarySlot1', 'PrimarySlot2', 'PrimarySlot3', 'SecondarySlot1', 'SecondarySlot2']

# Concatenate the new columns with the original DataFrame
runes_fix_df = pd.concat([runes_fix_df, rune_columns], axis=1)

# Drop the original rune columns if no longer needed
runes_fix_df.drop(columns=['primaryRune_clean', 'secondaryRune_clean'], inplace=True)

# View the updated DataFrame
print(runes_fix_df)

              matchId  gameDuration  championId championName  teamId  \
0      NA1_5151182315          1835          98         Shen     100   
1      NA1_5151182315          1835         104       Graves     100   
2      NA1_5151182315          1835         161       Velkoz     100   
3      NA1_5151182315          1835         147    Seraphine     100   
4      NA1_5151182315          1835          43        Karma     100   
...               ...           ...         ...          ...     ...   
89995  NA1_5157346215          1540          31      Chogath     200   
89996  NA1_5157346215          1540         876       Lillia     200   
89997  NA1_5157346215          1540         777         Yone     200   
89998  NA1_5157346215          1540          22         Ashe     200   
89999  NA1_5157346215          1540         147    Seraphine     200   

      individualPosition  kills  deaths  assists    win  ...  Legendary_1_id  \
0                    TOP      3       5        9  False

In [105]:
runes_fix_df.head()

Unnamed: 0,matchId,gameDuration,championId,championName,teamId,individualPosition,kills,deaths,assists,win,...,Legendary_1_id,Legendary_1_purchase_time,Legendary_2_id,Legendary_2_purchase_time,Keystone,PrimarySlot1,PrimarySlot2,PrimarySlot3,SecondarySlot1,SecondarySlot2
0,NA1_5151182315,1835,98,Shen,100,TOP,3,5,9,False,...,3748.0,882.0,2502.0,1365.0,Grasp of the Undying,Shield Bash,Second Wind,Revitalize,Ultimate Hunter,Cheap Shot
1,NA1_5151182315,1835,104,Graves,100,JUNGLE,17,4,10,False,...,6676.0,469.0,6673.0,906.0,Dark Harvest,Sudden Impact,Eyeball Collection,Relentless Hunter,Triumph,Legend: Alacrity
2,NA1_5151182315,1835,161,Velkoz,100,MIDDLE,7,6,7,False,...,6655.0,736.0,4645.0,1033.0,Arcane Comet,Manaflow Band,Transcendence,Scorch,Cheap Shot,Ultimate Hunter
3,NA1_5151182315,1835,147,Seraphine,100,BOTTOM,3,5,16,False,...,2503.0,847.0,3040.0,1717.0,Summon Aery,Manaflow Band,Transcendence,Gathering Storm,Legend: Haste,Presence of Mind
4,NA1_5151182315,1835,43,Karma,100,UTILITY,4,7,13,False,...,6653.0,765.0,3118.0,1225.0,Arcane Comet,Manaflow Band,Celerity,Gathering Storm,Magical Footwear,Approach Velocity


In [115]:
rune_name_to_id = {}
for rune_tree in runes_data:
    for slot in rune_tree.get('slots', []):
        for rune in slot.get('runes', []):
            rune_name_to_id[rune['name']] = rune['id']

# Create a copy of `runes_fix_df` to work with
runes_fix_df_copy = runes_fix_df.copy()

# Replace rune names with their corresponding IDs
rune_columns = ['Keystone', 'PrimarySlot1', 'PrimarySlot2', 'PrimarySlot3', 'SecondarySlot1', 'SecondarySlot2']
for column in rune_columns:
    runes_fix_df_copy[column] = runes_fix_df_copy[column].map(rune_name_to_id)

# Fill any NaN values with 0 and convert all columns to integer type
runes_fix_df_copy[rune_columns] = runes_fix_df_copy[rune_columns].fillna(0).astype(int)

In [117]:
runes_fix_df_copy.head()

Unnamed: 0,matchId,gameDuration,championId,championName,teamId,individualPosition,kills,deaths,assists,win,...,Legendary_1_id,Legendary_1_purchase_time,Legendary_2_id,Legendary_2_purchase_time,Keystone,PrimarySlot1,PrimarySlot2,PrimarySlot3,SecondarySlot1,SecondarySlot2
0,NA1_5151182315,1835,98,Shen,100,TOP,3,5,9,False,...,3748.0,882.0,2502.0,1365.0,8437,8401,8444,8453,8106,8126
1,NA1_5151182315,1835,104,Graves,100,JUNGLE,17,4,10,False,...,6676.0,469.0,6673.0,906.0,8128,8143,8138,8105,9111,9104
2,NA1_5151182315,1835,161,Velkoz,100,MIDDLE,7,6,7,False,...,6655.0,736.0,4645.0,1033.0,8229,8226,8210,8237,8126,8106
3,NA1_5151182315,1835,147,Seraphine,100,BOTTOM,3,5,16,False,...,2503.0,847.0,3040.0,1717.0,8214,8226,8210,8236,9105,8009
4,NA1_5151182315,1835,43,Karma,100,UTILITY,4,7,13,False,...,6653.0,765.0,3118.0,1225.0,8229,8226,8234,8236,8304,8410


In [119]:
runes_fix_df_copy.dtypes

matchId                       object
gameDuration                   int64
championId                     int64
championName                  object
teamId                         int64
individualPosition            object
kills                          int64
deaths                         int64
assists                        int64
win                             bool
goldEarned                     int64
totalDamageDealt               int64
totalDamageTaken               int64
totalHeal                      int64
matchupChampion               object
Boots_id                     float64
Boots_purchase_time          float64
Legendary_1_id               float64
Legendary_1_purchase_time    float64
Legendary_2_id               float64
Legendary_2_purchase_time    float64
Keystone                       int32
PrimarySlot1                   int32
PrimarySlot2                   int32
PrimarySlot3                   int32
SecondarySlot1                 int32
SecondarySlot2                 int32
d

In [121]:
matchup_df = runes_fix_df_copy.copy()

# Load champion data from champions.json to create a mapping from champion names to champion IDs
with open("../data/raw/champion_data/champions.json", "r") as f:
    champions_data = json.load(f)

# Create a dictionary to map champion names to champion IDs
champion_name_to_id = {}
for champ_key, champ_info in champions_data["data"].items():
    champion_name_to_id[champ_info["name"]] = int(champ_info["key"])

# Create a copy of the DataFrame to work with
matchup_df = runes_fix_df_copy.copy()

# Replace matchupChampion names with their corresponding IDs
matchup_df['matchupChampion'] = matchup_df['matchupChampion'].map(champion_name_to_id)

# Fill any NaN values with -1 (or another appropriate placeholder for champions not found)
matchup_df['matchupChampion'] = matchup_df['matchupChampion'].fillna(-1).astype(int)

In [125]:
matchup_df['matchupChampion'].head()

0     75
1     77
2    134
3    360
4    267
Name: matchupChampion, dtype: int32

In [129]:
matchup_df.dtypes

matchId                       object
gameDuration                   int64
championId                     int64
championName                  object
teamId                         int64
individualPosition            object
kills                          int64
deaths                         int64
assists                        int64
win                             bool
goldEarned                     int64
totalDamageDealt               int64
totalDamageTaken               int64
totalHeal                      int64
matchupChampion                int32
Boots_id                     float64
Boots_purchase_time          float64
Legendary_1_id               float64
Legendary_1_purchase_time    float64
Legendary_2_id               float64
Legendary_2_purchase_time    float64
Keystone                       int32
PrimarySlot1                   int32
PrimarySlot2                   int32
PrimarySlot3                   int32
SecondarySlot1                 int32
SecondarySlot2                 int32
d

In [132]:
column_removal_df = matchup_df.copy()

# Remove matchId and championName columns
column_removal_df.drop(columns=['matchId', 'championName'], inplace=True)

column_removal_df.head()

Unnamed: 0,gameDuration,championId,teamId,individualPosition,kills,deaths,assists,win,goldEarned,totalDamageDealt,...,Legendary_1_id,Legendary_1_purchase_time,Legendary_2_id,Legendary_2_purchase_time,Keystone,PrimarySlot1,PrimarySlot2,PrimarySlot3,SecondarySlot1,SecondarySlot2
0,1835,98,100,TOP,3,5,9,False,10252,18691,...,3748.0,882.0,2502.0,1365.0,8437,8401,8444,8453,8106,8126
1,1835,104,100,JUNGLE,17,4,10,False,15951,29320,...,6676.0,469.0,6673.0,906.0,8128,8143,8138,8105,9111,9104
2,1835,161,100,MIDDLE,7,6,7,False,11626,19531,...,6655.0,736.0,4645.0,1033.0,8229,8226,8210,8237,8126,8106
3,1835,147,100,BOTTOM,3,5,16,False,9982,24219,...,2503.0,847.0,3040.0,1717.0,8214,8226,8210,8236,9105,8009
4,1835,43,100,UTILITY,4,7,13,False,10176,19337,...,6653.0,765.0,3118.0,1225.0,8229,8226,8234,8236,8304,8410


In [142]:
laning_df = column_removal_df.copy()

position_mapping = {
    'TOP': 0,
    'JUNGLE': 1,
    'MIDDLE': 2,
    'BOTTOM': 3,
    'UTILITY': 4
}

laning_df['individualPosition'] = laning_df['individualPosition'].map(position_mapping).fillna(-1).astype('int64')

laning_df

Unnamed: 0,gameDuration,championId,teamId,individualPosition,kills,deaths,assists,win,goldEarned,totalDamageDealt,...,Legendary_1_id,Legendary_1_purchase_time,Legendary_2_id,Legendary_2_purchase_time,Keystone,PrimarySlot1,PrimarySlot2,PrimarySlot3,SecondarySlot1,SecondarySlot2
0,1835,98,100,0,3,5,9,False,10252,18691,...,3748.0,882.0,2502.0,1365.0,8437,8401,8444,8453,8106,8126
1,1835,104,100,1,17,4,10,False,15951,29320,...,6676.0,469.0,6673.0,906.0,8128,8143,8138,8105,9111,9104
2,1835,161,100,2,7,6,7,False,11626,19531,...,6655.0,736.0,4645.0,1033.0,8229,8226,8210,8237,8126,8106
3,1835,147,100,3,3,5,16,False,9982,24219,...,2503.0,847.0,3040.0,1717.0,8214,8226,8210,8236,9105,8009
4,1835,43,100,4,4,7,13,False,10176,19337,...,6653.0,765.0,3118.0,1225.0,8229,8226,8234,8236,8304,8410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89995,1540,31,200,0,2,9,0,False,8079,13353,...,3084.0,828.0,6664.0,1250.0,8437,8446,8429,8451,9104,8017
89996,1540,876,200,1,4,4,0,False,9133,7581,...,6653.0,719.0,3116.0,1057.0,8010,9111,9105,8017,8304,8347
89997,1540,777,200,2,0,9,2,False,8937,14602,...,3153.0,906.0,6673.0,1395.0,8021,9101,9104,8017,8444,8401
89998,1540,22,200,3,7,5,3,False,10665,19257,...,6672.0,682.0,3046.0,1073.0,8008,9111,9104,8014,8345,8410


In [146]:
bool_df = laning_df.copy()

bool_columns = laning_df.select_dtypes(include='bool').columns
bool_df[bool_columns] = laning_df[bool_columns].astype(int)

In [148]:
bool_df

Unnamed: 0,gameDuration,championId,teamId,individualPosition,kills,deaths,assists,win,goldEarned,totalDamageDealt,...,Legendary_1_id,Legendary_1_purchase_time,Legendary_2_id,Legendary_2_purchase_time,Keystone,PrimarySlot1,PrimarySlot2,PrimarySlot3,SecondarySlot1,SecondarySlot2
0,1835,98,100,0,3,5,9,0,10252,18691,...,3748.0,882.0,2502.0,1365.0,8437,8401,8444,8453,8106,8126
1,1835,104,100,1,17,4,10,0,15951,29320,...,6676.0,469.0,6673.0,906.0,8128,8143,8138,8105,9111,9104
2,1835,161,100,2,7,6,7,0,11626,19531,...,6655.0,736.0,4645.0,1033.0,8229,8226,8210,8237,8126,8106
3,1835,147,100,3,3,5,16,0,9982,24219,...,2503.0,847.0,3040.0,1717.0,8214,8226,8210,8236,9105,8009
4,1835,43,100,4,4,7,13,0,10176,19337,...,6653.0,765.0,3118.0,1225.0,8229,8226,8234,8236,8304,8410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89995,1540,31,200,0,2,9,0,0,8079,13353,...,3084.0,828.0,6664.0,1250.0,8437,8446,8429,8451,9104,8017
89996,1540,876,200,1,4,4,0,0,9133,7581,...,6653.0,719.0,3116.0,1057.0,8010,9111,9105,8017,8304,8347
89997,1540,777,200,2,0,9,2,0,8937,14602,...,3153.0,906.0,6673.0,1395.0,8021,9101,9104,8017,8444,8401
89998,1540,22,200,3,7,5,3,0,10665,19257,...,6672.0,682.0,3046.0,1073.0,8008,9111,9104,8014,8345,8410


In [150]:
bool_df.dtypes

gameDuration                   int64
championId                     int64
teamId                         int64
individualPosition             int64
kills                          int64
deaths                         int64
assists                        int64
win                            int32
goldEarned                     int64
totalDamageDealt               int64
totalDamageTaken               int64
totalHeal                      int64
matchupChampion                int32
Boots_id                     float64
Boots_purchase_time          float64
Legendary_1_id               float64
Legendary_1_purchase_time    float64
Legendary_2_id               float64
Legendary_2_purchase_time    float64
Keystone                       int32
PrimarySlot1                   int32
PrimarySlot2                   int32
PrimarySlot3                   int32
SecondarySlot1                 int32
SecondarySlot2                 int32
dtype: object

In [152]:
transformed_df = bool_df.copy()

## Pipelines for Encoding