In [186]:
import os
import json
import pandas as pd

# folder path
match_details_folder = "../data/raw/match_details"

all_match_data = []

# iterate over files in the match details folder
for filename in os.listdir(match_details_folder):
    filepath = os.path.join(match_details_folder, filename)
    
    # check if the file is a JSON file and if its size exceeds the threshold
    if filename.endswith(".json"):
        # load the JSON data
        with open(filepath, "r") as f:
            match_data = json.load(f)
            all_match_data.extend(match_data)  

df = pd.DataFrame(all_match_data)

print(df.head())

          matchId  gameDuration  championId championName  teamId  \
0  NA1_5163159498           105         799      Ambessa     100   
1  NA1_5163159498           105         234        Viego     100   
2  NA1_5163159498           105         103         Ahri     100   
3  NA1_5163159498           105         202         Jhin     100   
4  NA1_5163159498           105         161       Velkoz     100   

  individualPosition  kills  deaths  assists    win  ...  item_1  \
0                TOP      0       0        0  False  ...       0   
1             JUNGLE      0       0        0  False  ...       0   
2             MIDDLE      0       0        0  False  ...    2003   
3             BOTTOM      0       0        0  False  ...    2003   
4            UTILITY      0       0        0  False  ...    2003   

   item_purchase_time_1  item_2  item_purchase_time_2 item_3  \
0          Unknown Time       0          Unknown Time      0   
1          Unknown Time       0          Unknown Time 

In [187]:
df['championName'].value_counts()

championName
Caitlyn         1836
Graves          1606
Jhin            1492
Corki           1432
Ashe            1409
                ... 
Shyvana          160
Trundle          145
Heimerdinger     127
Quinn            114
Rammus           100
Name: count, Length: 169, dtype: int64

In [188]:
df.dtypes

matchId                 object
gameDuration             int64
championId               int64
championName            object
teamId                   int64
individualPosition      object
kills                    int64
deaths                   int64
assists                  int64
win                       bool
goldEarned               int64
totalDamageDealt         int64
totalDamageTaken         int64
totalHeal                int64
matchupChampion         object
primaryRune             object
secondaryRune           object
item_0                   int64
item_purchase_time_0    object
item_1                   int64
item_purchase_time_1    object
item_2                   int64
item_purchase_time_2    object
item_3                   int64
item_purchase_time_3    object
item_4                   int64
item_purchase_time_4    object
item_5                   int64
item_purchase_time_5    object
dtype: object

In [189]:
df.columns

Index(['matchId', 'gameDuration', 'championId', 'championName', 'teamId',
       'individualPosition', 'kills', 'deaths', 'assists', 'win', 'goldEarned',
       'totalDamageDealt', 'totalDamageTaken', 'totalHeal', 'matchupChampion',
       'primaryRune', 'secondaryRune', 'item_0', 'item_purchase_time_0',
       'item_1', 'item_purchase_time_1', 'item_2', 'item_purchase_time_2',
       'item_3', 'item_purchase_time_3', 'item_4', 'item_purchase_time_4',
       'item_5', 'item_purchase_time_5'],
      dtype='object')

In [190]:
df[['championName', 'matchupChampion']]

Unnamed: 0,championName,matchupChampion
0,Ambessa,Rumble
1,Viego,Udyr
2,Ahri,Lucian
3,Jhin,Caitlyn
4,Velkoz,Hwei
...,...,...
88135,Chogath,Ornn
88136,Lillia,Shaco
88137,Yone,Ahri
88138,Ashe,Caitlyn


In [191]:
df.head()

Unnamed: 0,matchId,gameDuration,championId,championName,teamId,individualPosition,kills,deaths,assists,win,...,item_1,item_purchase_time_1,item_2,item_purchase_time_2,item_3,item_purchase_time_3,item_4,item_purchase_time_4,item_5,item_purchase_time_5
0,NA1_5163159498,105,799,Ambessa,100,TOP,0,0,0,False,...,0,Unknown Time,0,Unknown Time,0,Unknown Time,0,Unknown Time,0,Unknown Time
1,NA1_5163159498,105,234,Viego,100,JUNGLE,0,0,0,False,...,0,Unknown Time,0,Unknown Time,0,Unknown Time,0,Unknown Time,0,Unknown Time
2,NA1_5163159498,105,103,Ahri,100,MIDDLE,0,0,0,False,...,2003,8,0,Unknown Time,0,Unknown Time,0,Unknown Time,0,Unknown Time
3,NA1_5163159498,105,202,Jhin,100,BOTTOM,0,0,0,False,...,2003,22,0,Unknown Time,0,Unknown Time,0,Unknown Time,0,Unknown Time
4,NA1_5163159498,105,161,Velkoz,100,UTILITY,0,0,0,False,...,2003,11,0,Unknown Time,0,Unknown Time,0,Unknown Time,0,Unknown Time


In [192]:
new_df = df

## Need to do handling for unknown times on core items (Legendary, Boots, Starter)

In [194]:
# load item data to get item costs
with open("../data/raw/item_data/items.json", 'r') as f:
    items_data = json.load(f)
    items = items_data["data"]

# function to get item cost
def get_item_cost(item_id):
    item_info = items.get(str(item_id), {})
    return item_info.get("gold", {}).get("total", 0)

# iterate through each row to estimate missing purchase times
for index, row in new_df.iterrows():
    for i in range(6):
        item_col = f"item_{i}"
        time_col = f"item_purchase_time_{i}"

        item_id = row[item_col]
        purchase_time = row[time_col]
        item_cost = get_item_cost(item_id)

        # fill "Unknown Time" for important items (cost > 1500 or boots or starter items)
        if purchase_time == "Unknown Time" and (item_cost > 2100 or item_id in [1001, 2422, 3111, 3158, 3006, 3020] or item_id in [1054, 1055, 1056]):
            # get the previous known purchase times
            previous_times = [int(row[f"item_purchase_time_{j}"]) for j in range(i) if row[f"item_purchase_time_{j}"] != "Unknown Time"]
            
            # estimate purchase time based on the previous purchases
            if len(previous_times) > 0:
                estimated_time = max(previous_times) + (item_cost // 100) * 30  # estimate based on item cost
            else:
                estimated_time = 900  # default to 15 minutes if no prior times are available

            # update the DataFrame with the estimated time
            new_df.at[index, time_col] = estimated_time

print("Missing item purchase times have been estimated and updated.")

Missing item purchase times have been estimated and updated.


In [195]:
new_df.head()

Unnamed: 0,matchId,gameDuration,championId,championName,teamId,individualPosition,kills,deaths,assists,win,...,item_1,item_purchase_time_1,item_2,item_purchase_time_2,item_3,item_purchase_time_3,item_4,item_purchase_time_4,item_5,item_purchase_time_5
0,NA1_5163159498,105,799,Ambessa,100,TOP,0,0,0,False,...,0,Unknown Time,0,Unknown Time,0,Unknown Time,0,Unknown Time,0,Unknown Time
1,NA1_5163159498,105,234,Viego,100,JUNGLE,0,0,0,False,...,0,Unknown Time,0,Unknown Time,0,Unknown Time,0,Unknown Time,0,Unknown Time
2,NA1_5163159498,105,103,Ahri,100,MIDDLE,0,0,0,False,...,2003,8,0,Unknown Time,0,Unknown Time,0,Unknown Time,0,Unknown Time
3,NA1_5163159498,105,202,Jhin,100,BOTTOM,0,0,0,False,...,2003,22,0,Unknown Time,0,Unknown Time,0,Unknown Time,0,Unknown Time
4,NA1_5163159498,105,161,Velkoz,100,UTILITY,0,0,0,False,...,2003,11,0,Unknown Time,0,Unknown Time,0,Unknown Time,0,Unknown Time


In [247]:
# define item IDs
boot_ids = [1001, 3009, 3111, 3158, 3006, 3020]

# create a new DataFrame for holding only the essential items
filtered_df = new_df.copy()

# function to find boots and two earliest legendary items (over 2100 gold)
def filter_important_items(row):
    items_with_time = []
    boots, legendary_items = None, []

    for i in range(6):
        item_col = f"item_{i}"
        time_col = f"item_purchase_time_{i}"

        item_id = row[item_col]
        item_purchase_time = row[time_col]
        item_cost = get_item_cost(item_id)

        # boots
        if item_id in boot_ids:
            boots = (item_id, item_purchase_time)

        # legendary items
        if item_cost > 2100:
            items_with_time.append((item_id, item_purchase_time))

    # sort legendary items by purchase time to find the first two
    items_with_time = sorted(items_with_time, key=lambda x: int(x[1]) if isinstance(x[1], int) or str(x[1]).isdigit() else float('inf'))
    legendary_items = items_with_time[:2]

    # updating the filtered DataFrame
    filtered_items = ([boots] if boots else []) + legendary_items
    return {
        "Boots": boots if boots else (None, None),
        "Legendary_1": legendary_items[0] if len(legendary_items) > 0 else (None, None),
        "Legendary_2": legendary_items[1] if len(legendary_items) > 1 else (None, None),
    }

# apply the filter function to each row
filtered_items_dicts = filtered_df.apply(filter_important_items, axis=1)

# extract the filtered items to new columns in the filtered DataFrame
important_items = ["Boots", "Legendary_1", "Legendary_2"]
for important_item in important_items:
    filtered_df[f"{important_item}_id"] = filtered_items_dicts.apply(lambda x: x[important_item][0])
    filtered_df[f"{important_item}_purchase_time"] = filtered_items_dicts.apply(lambda x: x[important_item][1])

# drop original item columns
filtered_df.drop(columns=[f"item_{i}" for i in range(6)] + [f"item_purchase_time_{i}" for i in range(6)], inplace=True)

print("Filtered DataFrame with only the important items is ready.")

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [197]:
filtered_df.head()

Unnamed: 0,matchId,gameDuration,championId,championName,teamId,individualPosition,kills,deaths,assists,win,...,totalHeal,matchupChampion,primaryRune,secondaryRune,Boots_id,Boots_purchase_time,Legendary_1_id,Legendary_1_purchase_time,Legendary_2_id,Legendary_2_purchase_time
0,NA1_5163159498,105,799,Ambessa,100,TOP,0,0,0,False,...,0,Rumble,"{'description': 'primaryStyle', 'selections': ...","{'description': 'subStyle', 'selections': [{'p...",,,,,,
1,NA1_5163159498,105,234,Viego,100,JUNGLE,0,0,0,False,...,167,Udyr,"{'description': 'primaryStyle', 'selections': ...","{'description': 'subStyle', 'selections': [{'p...",,,,,,
2,NA1_5163159498,105,103,Ahri,100,MIDDLE,0,0,0,False,...,17,Lucian,"{'description': 'primaryStyle', 'selections': ...","{'description': 'subStyle', 'selections': [{'p...",,,,,,
3,NA1_5163159498,105,202,Jhin,100,BOTTOM,0,0,0,False,...,0,Caitlyn,"{'description': 'primaryStyle', 'selections': ...","{'description': 'subStyle', 'selections': [{'p...",,,,,,
4,NA1_5163159498,105,161,Velkoz,100,UTILITY,0,0,0,False,...,0,Hwei,"{'description': 'primaryStyle', 'selections': ...","{'description': 'subStyle', 'selections': [{'p...",,,,,,


In [198]:
filtered_df.dtypes

matchId                       object
gameDuration                   int64
championId                     int64
championName                  object
teamId                         int64
individualPosition            object
kills                          int64
deaths                         int64
assists                        int64
win                             bool
goldEarned                     int64
totalDamageDealt               int64
totalDamageTaken               int64
totalHeal                      int64
matchupChampion               object
primaryRune                   object
secondaryRune                 object
Boots_id                     float64
Boots_purchase_time          float64
Legendary_1_id               float64
Legendary_1_purchase_time    float64
Legendary_2_id               float64
Legendary_2_purchase_time    float64
dtype: object

In [199]:
df['secondaryRune'].head()

0    {'description': 'subStyle', 'selections': [{'p...
1    {'description': 'subStyle', 'selections': [{'p...
2    {'description': 'subStyle', 'selections': [{'p...
3    {'description': 'subStyle', 'selections': [{'p...
4    {'description': 'subStyle', 'selections': [{'p...
Name: secondaryRune, dtype: object

In [200]:
# load the runes data from the JSON file
with open("../data/raw/runes_data/runes.json", "r") as f:
    runes_data = json.load(f)

# create a dictionary to map rune IDs to their names
rune_dict = {}
for rune_tree in runes_data:
    for slot in rune_tree["slots"]:
        for rune in slot["runes"]:
            rune_dict[rune["id"]] = rune["name"]

import ast  # to parse string representations of dictionaries if needed

# function to clean up rune data by mapping rune IDs to their names
def clean_rune_data(rune_column, rune_dict):
    readable_runes = []

    # iterate through the data in the column
    for row in rune_column:
        # convert the string representation to a dictionary if necessary
        if isinstance(row, str):
            row = ast.literal_eval(row)

        # extract rune information
        rune_style = rune_dict.get(row['style'], "Unknown Style")
        selections = row.get('selections', [])
        selection_names = [
            rune_dict.get(selection['perk'], "Unknown Rune") for selection in selections
        ]

        # create a cleaned-up representation of the runes
        readable_runes.append({
            "style": rune_style,
            "selections": selection_names
        })

    return readable_runes

# clean up the primaryRune and secondaryRune columns
filtered_df["primaryRune_clean"] = clean_rune_data(filtered_df["primaryRune"], rune_dict)
filtered_df["secondaryRune_clean"] = clean_rune_data(filtered_df["secondaryRune"], rune_dict)

# drop original columns if no longer needed
filtered_df.drop(columns=["primaryRune", "secondaryRune"], inplace=True)

# display a few rows to see the cleaned data
print(filtered_df[["primaryRune_clean", "secondaryRune_clean"]].head())

                                   primaryRune_clean  \
0  {'style': 'Unknown Style', 'selections': ['Con...   
1  {'style': 'Unknown Style', 'selections': ['Con...   
2  {'style': 'Unknown Style', 'selections': ['Ele...   
3  {'style': 'Unknown Style', 'selections': ['Dar...   
4  {'style': 'Unknown Style', 'selections': ['Sum...   

                                 secondaryRune_clean  
0  {'style': 'Unknown Style', 'selections': ['Sud...  
1  {'style': 'Unknown Style', 'selections': ['Cos...  
2  {'style': 'Unknown Style', 'selections': ['Man...  
3  {'style': 'Unknown Style', 'selections': ['Cou...  
4  {'style': 'Unknown Style', 'selections': ['Mag...  


In [201]:
filtered_df

Unnamed: 0,matchId,gameDuration,championId,championName,teamId,individualPosition,kills,deaths,assists,win,...,totalHeal,matchupChampion,Boots_id,Boots_purchase_time,Legendary_1_id,Legendary_1_purchase_time,Legendary_2_id,Legendary_2_purchase_time,primaryRune_clean,secondaryRune_clean
0,NA1_5163159498,105,799,Ambessa,100,TOP,0,0,0,False,...,0,Rumble,,,,,,,"{'style': 'Unknown Style', 'selections': ['Con...","{'style': 'Unknown Style', 'selections': ['Sud..."
1,NA1_5163159498,105,234,Viego,100,JUNGLE,0,0,0,False,...,167,Udyr,,,,,,,"{'style': 'Unknown Style', 'selections': ['Con...","{'style': 'Unknown Style', 'selections': ['Cos..."
2,NA1_5163159498,105,103,Ahri,100,MIDDLE,0,0,0,False,...,17,Lucian,,,,,,,"{'style': 'Unknown Style', 'selections': ['Ele...","{'style': 'Unknown Style', 'selections': ['Man..."
3,NA1_5163159498,105,202,Jhin,100,BOTTOM,0,0,0,False,...,0,Caitlyn,,,,,,,"{'style': 'Unknown Style', 'selections': ['Dar...","{'style': 'Unknown Style', 'selections': ['Cou..."
4,NA1_5163159498,105,161,Velkoz,100,UTILITY,0,0,0,False,...,0,Hwei,,,,,,,"{'style': 'Unknown Style', 'selections': ['Sum...","{'style': 'Unknown Style', 'selections': ['Mag..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88135,NA1_5157346215,1540,31,Chogath,200,TOP,2,9,0,False,...,3714,Ornn,,,3084.0,828.0,6664.0,1250.0,"{'style': 'Unknown Style', 'selections': ['Gra...","{'style': 'Unknown Style', 'selections': ['Leg..."
88136,NA1_5157346215,1540,876,Lillia,200,JUNGLE,4,4,0,False,...,17700,Shaco,3158.0,1227.0,6653.0,719.0,3116.0,1057.0,"{'style': 'Unknown Style', 'selections': ['Con...","{'style': 'Unknown Style', 'selections': ['Mag..."
88137,NA1_5157346215,1540,777,Yone,200,MIDDLE,0,9,2,False,...,5663,Ahri,3006.0,418.0,3153.0,906.0,6673.0,1395.0,"{'style': 'Unknown Style', 'selections': ['Fle...","{'style': 'Unknown Style', 'selections': ['Sec..."
88138,NA1_5157346215,1540,22,Ashe,200,BOTTOM,7,5,3,False,...,2352,Caitlyn,3006.0,292.0,6672.0,682.0,3046.0,1073.0,"{'style': 'Unknown Style', 'selections': ['Let...","{'style': 'Unknown Style', 'selections': ['Bis..."


In [202]:
runes_fix_df = filtered_df.copy()

# function to split the primary and secondary rune data into individual columns
def split_rune_data(row):
    primary = row['primaryRune_clean']['selections']
    secondary = row['secondaryRune_clean']['selections']
    
    # ensure we have the correct number of runes (handling possible missing entries)
    keystone = primary[0] if len(primary) > 0 else None
    primary_slot_1 = primary[1] if len(primary) > 1 else None
    primary_slot_2 = primary[2] if len(primary) > 2 else None
    primary_slot_3 = primary[3] if len(primary) > 3 else None
    secondary_slot_1 = secondary[0] if len(secondary) > 0 else None
    secondary_slot_2 = secondary[1] if len(secondary) > 1 else None

    return pd.Series([keystone, primary_slot_1, primary_slot_2, primary_slot_3, secondary_slot_1, secondary_slot_2])

# apply the function to split the rune data into new columns
rune_columns = runes_fix_df.apply(split_rune_data, axis=1)

# rename the new columns
rune_columns.columns = ['Keystone', 'PrimarySlot1', 'PrimarySlot2', 'PrimarySlot3', 'SecondarySlot1', 'SecondarySlot2']

# concatenate the new columns with the original DataFrame
runes_fix_df = pd.concat([runes_fix_df, rune_columns], axis=1)

# drop the original rune columns if no longer needed
runes_fix_df.drop(columns=['primaryRune_clean', 'secondaryRune_clean'], inplace=True)

# view the updated DataFrame
print(runes_fix_df)

              matchId  gameDuration  championId championName  teamId  \
0      NA1_5163159498           105         799      Ambessa     100   
1      NA1_5163159498           105         234        Viego     100   
2      NA1_5163159498           105         103         Ahri     100   
3      NA1_5163159498           105         202         Jhin     100   
4      NA1_5163159498           105         161       Velkoz     100   
...               ...           ...         ...          ...     ...   
88135  NA1_5157346215          1540          31      Chogath     200   
88136  NA1_5157346215          1540         876       Lillia     200   
88137  NA1_5157346215          1540         777         Yone     200   
88138  NA1_5157346215          1540          22         Ashe     200   
88139  NA1_5157346215          1540         147    Seraphine     200   

      individualPosition  kills  deaths  assists    win  ...  Legendary_1_id  \
0                    TOP      0       0        0  False

In [203]:
runes_fix_df.head()

Unnamed: 0,matchId,gameDuration,championId,championName,teamId,individualPosition,kills,deaths,assists,win,...,Legendary_1_id,Legendary_1_purchase_time,Legendary_2_id,Legendary_2_purchase_time,Keystone,PrimarySlot1,PrimarySlot2,PrimarySlot3,SecondarySlot1,SecondarySlot2
0,NA1_5163159498,105,799,Ambessa,100,TOP,0,0,0,False,...,,,,,Conqueror,Triumph,Legend: Haste,Last Stand,Sudden Impact,Ultimate Hunter
1,NA1_5163159498,105,234,Viego,100,JUNGLE,0,0,0,False,...,,,,,Conqueror,Triumph,Legend: Alacrity,Coup de Grace,Cosmic Insight,Magical Footwear
2,NA1_5163159498,105,103,Ahri,100,MIDDLE,0,0,0,False,...,,,,,Electrocute,Taste of Blood,Eyeball Collection,Ultimate Hunter,Manaflow Band,Scorch
3,NA1_5163159498,105,202,Jhin,100,BOTTOM,0,0,0,False,...,,,,,Dark Harvest,Taste of Blood,Eyeball Collection,Treasure Hunter,Coup de Grace,Presence of Mind
4,NA1_5163159498,105,161,Velkoz,100,UTILITY,0,0,0,False,...,,,,,Summon Aery,Manaflow Band,Transcendence,Gathering Storm,Magical Footwear,Approach Velocity


In [204]:
rune_name_to_id = {}
for rune_tree in runes_data:
    for slot in rune_tree.get('slots', []):
        for rune in slot.get('runes', []):
            rune_name_to_id[rune['name']] = rune['id']

# create a copy of `runes_fix_df` to work with
runes_fix_df_copy = runes_fix_df.copy()

# replace rune names with their corresponding IDs
rune_columns = ['Keystone', 'PrimarySlot1', 'PrimarySlot2', 'PrimarySlot3', 'SecondarySlot1', 'SecondarySlot2']
for column in rune_columns:
    runes_fix_df_copy[column] = runes_fix_df_copy[column].map(rune_name_to_id)

# fill any NaN values with 0 and convert all columns to integer type
runes_fix_df_copy[rune_columns] = runes_fix_df_copy[rune_columns].fillna(0).astype(int)

In [205]:
runes_fix_df_copy.head()

Unnamed: 0,matchId,gameDuration,championId,championName,teamId,individualPosition,kills,deaths,assists,win,...,Legendary_1_id,Legendary_1_purchase_time,Legendary_2_id,Legendary_2_purchase_time,Keystone,PrimarySlot1,PrimarySlot2,PrimarySlot3,SecondarySlot1,SecondarySlot2
0,NA1_5163159498,105,799,Ambessa,100,TOP,0,0,0,False,...,,,,,8010,9111,9105,8299,8143,8106
1,NA1_5163159498,105,234,Viego,100,JUNGLE,0,0,0,False,...,,,,,8010,9111,9104,8014,8347,8304
2,NA1_5163159498,105,103,Ahri,100,MIDDLE,0,0,0,False,...,,,,,8112,8139,8138,8106,8226,8237
3,NA1_5163159498,105,202,Jhin,100,BOTTOM,0,0,0,False,...,,,,,8128,8139,8138,8135,8014,8009
4,NA1_5163159498,105,161,Velkoz,100,UTILITY,0,0,0,False,...,,,,,8214,8226,8210,8236,8304,8410


In [206]:
runes_fix_df_copy.dtypes

matchId                       object
gameDuration                   int64
championId                     int64
championName                  object
teamId                         int64
individualPosition            object
kills                          int64
deaths                         int64
assists                        int64
win                             bool
goldEarned                     int64
totalDamageDealt               int64
totalDamageTaken               int64
totalHeal                      int64
matchupChampion               object
Boots_id                     float64
Boots_purchase_time          float64
Legendary_1_id               float64
Legendary_1_purchase_time    float64
Legendary_2_id               float64
Legendary_2_purchase_time    float64
Keystone                       int32
PrimarySlot1                   int32
PrimarySlot2                   int32
PrimarySlot3                   int32
SecondarySlot1                 int32
SecondarySlot2                 int32
d

In [207]:
matchup_df = runes_fix_df_copy.copy()

# load champion data from champions.json to create a mapping from champion names to champion IDs
with open("../data/raw/champion_data/champions.json", "r") as f:
    champions_data = json.load(f)

# create a dictionary to map champion names to champion IDs
champion_name_to_id = {}
for champ_key, champ_info in champions_data["data"].items():
    champion_name_to_id[champ_info["name"]] = int(champ_info["key"])

# create a copy of the DataFrame to work with
matchup_df = runes_fix_df_copy.copy()

# replace matchupChampion names with their corresponding IDs
matchup_df['matchupChampion'] = matchup_df['matchupChampion'].map(champion_name_to_id)

# fill any NaN values with -1 
matchup_df['matchupChampion'] = matchup_df['matchupChampion'].fillna(-1).astype(int)

In [208]:
matchup_df['matchupChampion'].head()

0     68
1     77
2    236
3     51
4    910
Name: matchupChampion, dtype: int32

In [209]:
matchup_df.dtypes

matchId                       object
gameDuration                   int64
championId                     int64
championName                  object
teamId                         int64
individualPosition            object
kills                          int64
deaths                         int64
assists                        int64
win                             bool
goldEarned                     int64
totalDamageDealt               int64
totalDamageTaken               int64
totalHeal                      int64
matchupChampion                int32
Boots_id                     float64
Boots_purchase_time          float64
Legendary_1_id               float64
Legendary_1_purchase_time    float64
Legendary_2_id               float64
Legendary_2_purchase_time    float64
Keystone                       int32
PrimarySlot1                   int32
PrimarySlot2                   int32
PrimarySlot3                   int32
SecondarySlot1                 int32
SecondarySlot2                 int32
d

In [210]:
column_removal_df = matchup_df.copy()

# remove matchId and championName columns
column_removal_df.drop(columns=['matchId', 'championName'], inplace=True)

column_removal_df.head()

Unnamed: 0,gameDuration,championId,teamId,individualPosition,kills,deaths,assists,win,goldEarned,totalDamageDealt,...,Legendary_1_id,Legendary_1_purchase_time,Legendary_2_id,Legendary_2_purchase_time,Keystone,PrimarySlot1,PrimarySlot2,PrimarySlot3,SecondarySlot1,SecondarySlot2
0,105,799,100,TOP,0,0,0,False,500,0,...,,,,,8010,9111,9105,8299,8143,8106
1,105,234,100,JUNGLE,0,0,0,False,500,0,...,,,,,8010,9111,9104,8014,8347,8304
2,105,103,100,MIDDLE,0,0,0,False,500,41,...,,,,,8112,8139,8138,8106,8226,8237
3,105,202,100,BOTTOM,0,0,0,False,500,0,...,,,,,8128,8139,8138,8135,8014,8009
4,105,161,100,UTILITY,0,0,0,False,500,0,...,,,,,8214,8226,8210,8236,8304,8410


In [211]:
laning_df = column_removal_df.copy()

position_mapping = {
    'TOP': 0,
    'JUNGLE': 1,
    'MIDDLE': 2,
    'BOTTOM': 3,
    'UTILITY': 4
}

laning_df['individualPosition'] = laning_df['individualPosition'].map(position_mapping).fillna(-1).astype('int64')

laning_df

Unnamed: 0,gameDuration,championId,teamId,individualPosition,kills,deaths,assists,win,goldEarned,totalDamageDealt,...,Legendary_1_id,Legendary_1_purchase_time,Legendary_2_id,Legendary_2_purchase_time,Keystone,PrimarySlot1,PrimarySlot2,PrimarySlot3,SecondarySlot1,SecondarySlot2
0,105,799,100,0,0,0,0,False,500,0,...,,,,,8010,9111,9105,8299,8143,8106
1,105,234,100,1,0,0,0,False,500,0,...,,,,,8010,9111,9104,8014,8347,8304
2,105,103,100,2,0,0,0,False,500,41,...,,,,,8112,8139,8138,8106,8226,8237
3,105,202,100,3,0,0,0,False,500,0,...,,,,,8128,8139,8138,8135,8014,8009
4,105,161,100,4,0,0,0,False,500,0,...,,,,,8214,8226,8210,8236,8304,8410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88135,1540,31,200,0,2,9,0,False,8079,13353,...,3084.0,828.0,6664.0,1250.0,8437,8446,8429,8451,9104,8017
88136,1540,876,200,1,4,4,0,False,9133,7581,...,6653.0,719.0,3116.0,1057.0,8010,9111,9105,8017,8304,8347
88137,1540,777,200,2,0,9,2,False,8937,14602,...,3153.0,906.0,6673.0,1395.0,8021,9101,9104,8017,8444,8401
88138,1540,22,200,3,7,5,3,False,10665,19257,...,6672.0,682.0,3046.0,1073.0,8008,9111,9104,8014,8345,8410


In [212]:
bool_df = laning_df.copy()

bool_columns = laning_df.select_dtypes(include='bool').columns
bool_df[bool_columns] = laning_df[bool_columns].astype(int)

In [213]:
bool_df

Unnamed: 0,gameDuration,championId,teamId,individualPosition,kills,deaths,assists,win,goldEarned,totalDamageDealt,...,Legendary_1_id,Legendary_1_purchase_time,Legendary_2_id,Legendary_2_purchase_time,Keystone,PrimarySlot1,PrimarySlot2,PrimarySlot3,SecondarySlot1,SecondarySlot2
0,105,799,100,0,0,0,0,0,500,0,...,,,,,8010,9111,9105,8299,8143,8106
1,105,234,100,1,0,0,0,0,500,0,...,,,,,8010,9111,9104,8014,8347,8304
2,105,103,100,2,0,0,0,0,500,41,...,,,,,8112,8139,8138,8106,8226,8237
3,105,202,100,3,0,0,0,0,500,0,...,,,,,8128,8139,8138,8135,8014,8009
4,105,161,100,4,0,0,0,0,500,0,...,,,,,8214,8226,8210,8236,8304,8410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88135,1540,31,200,0,2,9,0,0,8079,13353,...,3084.0,828.0,6664.0,1250.0,8437,8446,8429,8451,9104,8017
88136,1540,876,200,1,4,4,0,0,9133,7581,...,6653.0,719.0,3116.0,1057.0,8010,9111,9105,8017,8304,8347
88137,1540,777,200,2,0,9,2,0,8937,14602,...,3153.0,906.0,6673.0,1395.0,8021,9101,9104,8017,8444,8401
88138,1540,22,200,3,7,5,3,0,10665,19257,...,6672.0,682.0,3046.0,1073.0,8008,9111,9104,8014,8345,8410


In [214]:
bool_df.dtypes

gameDuration                   int64
championId                     int64
teamId                         int64
individualPosition             int64
kills                          int64
deaths                         int64
assists                        int64
win                            int32
goldEarned                     int64
totalDamageDealt               int64
totalDamageTaken               int64
totalHeal                      int64
matchupChampion                int32
Boots_id                     float64
Boots_purchase_time          float64
Legendary_1_id               float64
Legendary_1_purchase_time    float64
Legendary_2_id               float64
Legendary_2_purchase_time    float64
Keystone                       int32
PrimarySlot1                   int32
PrimarySlot2                   int32
PrimarySlot3                   int32
SecondarySlot1                 int32
SecondarySlot2                 int32
dtype: object

In [215]:
transformed_df = bool_df.copy()

In [216]:
transformed_df.head()

Unnamed: 0,gameDuration,championId,teamId,individualPosition,kills,deaths,assists,win,goldEarned,totalDamageDealt,...,Legendary_1_id,Legendary_1_purchase_time,Legendary_2_id,Legendary_2_purchase_time,Keystone,PrimarySlot1,PrimarySlot2,PrimarySlot3,SecondarySlot1,SecondarySlot2
0,105,799,100,0,0,0,0,0,500,0,...,,,,,8010,9111,9105,8299,8143,8106
1,105,234,100,1,0,0,0,0,500,0,...,,,,,8010,9111,9104,8014,8347,8304
2,105,103,100,2,0,0,0,0,500,41,...,,,,,8112,8139,8138,8106,8226,8237
3,105,202,100,3,0,0,0,0,500,0,...,,,,,8128,8139,8138,8135,8014,8009
4,105,161,100,4,0,0,0,0,500,0,...,,,,,8214,8226,8210,8236,8304,8410


In [251]:
transformed_df.dtypes

gameDuration                   int64
championId                     int64
teamId                         int64
individualPosition             int64
kills                          int64
deaths                         int64
assists                        int64
win                            int32
goldEarned                     int64
totalDamageDealt               int64
totalDamageTaken               int64
totalHeal                      int64
matchupChampion                int32
Boots_id                     float64
Boots_purchase_time          float64
Legendary_1_id               float64
Legendary_1_purchase_time    float64
Legendary_2_id               float64
Legendary_2_purchase_time    float64
Keystone                       int32
PrimarySlot1                   int32
PrimarySlot2                   int32
PrimarySlot3                   int32
SecondarySlot1                 int32
SecondarySlot2                 int32
dtype: object

In [253]:
columns_to_convert = [
    'Boots_id', 'Boots_purchase_time',
    'Legendary_1_id', 'Legendary_1_purchase_time',
    'Legendary_2_id', 'Legendary_2_purchase_time'
]
for column in columns_to_convert:
    transformed_df[column] = transformed_df[column].fillna(0).astype('int64')

In [255]:
transformed_df.dtypes

gameDuration                 int64
championId                   int64
teamId                       int64
individualPosition           int64
kills                        int64
deaths                       int64
assists                      int64
win                          int32
goldEarned                   int64
totalDamageDealt             int64
totalDamageTaken             int64
totalHeal                    int64
matchupChampion              int32
Boots_id                     int64
Boots_purchase_time          int64
Legendary_1_id               int64
Legendary_1_purchase_time    int64
Legendary_2_id               int64
Legendary_2_purchase_time    int64
Keystone                     int32
PrimarySlot1                 int32
PrimarySlot2                 int32
PrimarySlot3                 int32
SecondarySlot1               int32
SecondarySlot2               int32
dtype: object

In [257]:
transformed_df.head()

Unnamed: 0,gameDuration,championId,teamId,individualPosition,kills,deaths,assists,win,goldEarned,totalDamageDealt,...,Legendary_1_id,Legendary_1_purchase_time,Legendary_2_id,Legendary_2_purchase_time,Keystone,PrimarySlot1,PrimarySlot2,PrimarySlot3,SecondarySlot1,SecondarySlot2
0,105,799,100,0,0,0,0,0,500,0,...,0,0,0,0,8010,9111,9105,8299,8143,8106
1,105,234,100,1,0,0,0,0,500,0,...,0,0,0,0,8010,9111,9104,8014,8347,8304
2,105,103,100,2,0,0,0,0,500,41,...,0,0,0,0,8112,8139,8138,8106,8226,8237
3,105,202,100,3,0,0,0,0,500,0,...,0,0,0,0,8128,8139,8138,8135,8014,8009
4,105,161,100,4,0,0,0,0,500,0,...,0,0,0,0,8214,8226,8210,8236,8304,8410


In [259]:
output_path = "../data/processed/transformed_data.csv"
transformed_df.to_csv(output_path, index=False)

In [261]:
loaded_df = pd.read_csv("../data/processed/transformed_data.csv")

In [265]:
loaded_df

Unnamed: 0,gameDuration,championId,teamId,individualPosition,kills,deaths,assists,win,goldEarned,totalDamageDealt,...,Legendary_1_id,Legendary_1_purchase_time,Legendary_2_id,Legendary_2_purchase_time,Keystone,PrimarySlot1,PrimarySlot2,PrimarySlot3,SecondarySlot1,SecondarySlot2
0,105,799,100,0,0,0,0,0,500,0,...,0,0,0,0,8010,9111,9105,8299,8143,8106
1,105,234,100,1,0,0,0,0,500,0,...,0,0,0,0,8010,9111,9104,8014,8347,8304
2,105,103,100,2,0,0,0,0,500,41,...,0,0,0,0,8112,8139,8138,8106,8226,8237
3,105,202,100,3,0,0,0,0,500,0,...,0,0,0,0,8128,8139,8138,8135,8014,8009
4,105,161,100,4,0,0,0,0,500,0,...,0,0,0,0,8214,8226,8210,8236,8304,8410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88135,1540,31,200,0,2,9,0,0,8079,13353,...,3084,828,6664,1250,8437,8446,8429,8451,9104,8017
88136,1540,876,200,1,4,4,0,0,9133,7581,...,6653,719,3116,1057,8010,9111,9105,8017,8304,8347
88137,1540,777,200,2,0,9,2,0,8937,14602,...,3153,906,6673,1395,8021,9101,9104,8017,8444,8401
88138,1540,22,200,3,7,5,3,0,10665,19257,...,6672,682,3046,1073,8008,9111,9104,8014,8345,8410


In [267]:
loaded_df = loaded_df[loaded_df['gameDuration'] >= 1200]

loaded_df.reset_index(drop=True, inplace=True)

print(loaded_df.head())

   gameDuration  championId  teamId  individualPosition  kills  deaths  \
0          2203         516     100                   0      5       6   
1          2203         254     100                   1     12       9   
2          2203         202     100                   3      5      10   
3          2203         134     100                   2     11       8   
4          2203          40     100                   4      1       5   

   assists  win  goldEarned  totalDamageDealt  ...  Legendary_1_id  \
0       12    0       14278             39863  ...            3068   
1       12    0       14699             40145  ...            3078   
2       16    0       13550             38251  ...            3031   
3       14    0       17384             54600  ...            6655   
4       22    0        9047              9208  ...            3222   

   Legendary_1_purchase_time  Legendary_2_id  Legendary_2_purchase_time  \
0                        819            6665               

In [269]:
loaded_df

Unnamed: 0,gameDuration,championId,teamId,individualPosition,kills,deaths,assists,win,goldEarned,totalDamageDealt,...,Legendary_1_id,Legendary_1_purchase_time,Legendary_2_id,Legendary_2_purchase_time,Keystone,PrimarySlot1,PrimarySlot2,PrimarySlot3,SecondarySlot1,SecondarySlot2
0,2203,516,100,0,5,6,12,0,14278,39863,...,3068,819,6665,1262,8437,8446,8473,8451,8009,9105
1,2203,254,100,1,12,9,12,0,14699,40145,...,3078,740,6610,1122,8010,9111,9104,8014,8304,8347
2,2203,202,100,3,5,10,16,0,13550,38251,...,3031,893,3094,1309,8021,8009,9103,8017,8304,8347
3,2203,134,100,2,11,8,14,0,17384,54600,...,6655,505,4628,1082,8369,8321,8313,8347,8210,8236
4,2203,40,100,4,1,5,22,0,9047,9208,...,3222,1074,6620,1590,8214,8226,8234,8232,8136,8105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74545,1540,31,200,0,2,9,0,0,8079,13353,...,3084,828,6664,1250,8437,8446,8429,8451,9104,8017
74546,1540,876,200,1,4,4,0,0,9133,7581,...,6653,719,3116,1057,8010,9111,9105,8017,8304,8347
74547,1540,777,200,2,0,9,2,0,8937,14602,...,3153,906,6673,1395,8021,9101,9104,8017,8444,8401
74548,1540,22,200,3,7,5,3,0,10665,19257,...,6672,682,3046,1073,8008,9111,9104,8014,8345,8410


In [284]:
output_path = "../data/processed/transformed_data.csv"
loaded_df.to_csv(output_path, index=False)