# Data preprocessing

In [1]:
import pandas as pd
import ast
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Accounting for items, stars and characters
def expand_units(dct):  
    string = ""         
    for elem in dct.keys():
        if len(dct[elem]['items']) > 0:
            for item in dct[elem]['items']:
                col_name=str(dct[elem]['stars'])+'_'+str(elem)+'_'+('_'.join(str(item) for item in dct[elem]['items'])) #1
            string+=col_name+', '   
        else:
            col_name=str(dct[elem]['stars'])+'_'+str(elem)
            string+=col_name+', '
    return string

# Accounting for items and characters
def expand_units2(dct):
    string = ""
    for elem in dct.keys():
        if len(dct[elem]['items'])>0:
            for item in dct[elem]['items']:
                col_name=str(elem)+'_'+('_'.join(str(item) for item in dct[elem]['items']))
            string+=col_name+', ' 
        else:
            col_name=str(elem)
            string+=col_name+', '
    return string

# Accounting for characters
def expand_units3(dct):
    string = ""
  #  sep = ','
    for elem in dct.keys():
        col_name=str(elem)
        string+=col_name+', '
    return string[:-2]

# Stars
def expand_level(dct, sep=','):
    string = ""
    for elem in dct.keys():
        how_many=len(dct[elem]['items'])
        if how_many>0:
            col_name=str(dct[elem]['stars'])
            string+=(col_name+', ')   
        else:
            col_name=str(dct[elem]['stars'])
            string+=col_name+', '
    return string[:-2]

# Traits name
def expand_traits(dct, sep=','):
    string = ""
    for elem in dct.keys():
        string+=elem+', '
    return string[:-2]

# Traits level
def expand_traits_level(dct, sep=','):
    string = ""
    for elem in dct.values():
        string+=str(elem)+', '
    return string[:-2]

# Link tuple key-value in a single row
def explode_units(df):
    df['tmp']=df.apply(lambda row: list(zip(row['levels'],row['units_lst'])), axis=1) 
    df=df.explode('tmp')
    df[['levels','units_lst']]=pd.DataFrame(df['tmp'].tolist(), index=df.index)
    df.drop(columns='tmp', inplace=True)
    return df

def explode_traits(df):
    df['tmp']=df.apply(lambda row: list(zip(row['traits_level'],row['traits_lst'])), axis=1) 
    df=df.explode('tmp')
    df[['traits_level','traits_lst']]=pd.DataFrame(df['tmp'].tolist(), index=df.index)
    df.drop(columns='tmp', inplace=True)
    return df

In [3]:
data=pd.read_csv('TFT_set4_euw_challanger_games.csv')

In [4]:
data["patch_version"].value_counts()

10.25    258800
10.24    154336
10.23    102560
10.22     97336
10.21     61168
10.20     42872
10.19     22696
11.10      9160
Name: patch_version, dtype: int64

In [5]:
# this is the reason why we have to employ Pandas: pySpark is not able to work with nested dicts
data['units']=data.units.apply(ast.literal_eval)
data['traits']=data.traits.apply(ast.literal_eval)

In [6]:
# Unpack dictionaries
data['units_lst']=data.units.apply(expand_units3)
data['levels']=data.units.apply(expand_level)
data['traits_lst']=data.traits.apply(expand_traits)
data['traits_level']=data.traits.apply(expand_traits_level)

In [7]:
data.to_csv('data_modificato.csv') # A copy as checkpoint

## Partial encoding

We do not actually employ this strategy... (but it gives us a grasp of what is going on)

In [None]:
x=[]
for elem in data.units.apply(lambda x: x.keys()):
    x+=elem
len(set(x))

There are 58 characters...

In [None]:
x=[]
for elem in data.units_lst.apply(lambda x: x.split(', ')):
    x+=elem
len(set(x))

... Which can be combined obtaining 9212 combinations --> This number is too big. It leads to a MemoryError. What can we do? We do not account for the level of characters.

In [None]:
x=[]
for elem in data.units_lst2.apply(lambda x: x.split(', ')):
    x+=elem
len(set(x))

And indeed we have 3140 combinations, which are still too many...

## Full encoding

In [8]:
data=pd.read_csv('data_modificato.csv', index_col=0)

In [9]:
data['units_lst']=data.units_lst.apply(lambda x: str(x).split(', '))
data['levels']=data.levels.apply(lambda x: str(x).split(', '))
data['traits_lst']=data.traits_lst.apply(lambda x: str(x).split(', '))
data['traits_level']=data.traits_level.apply(lambda x: str(x).split(', '))

In [10]:
df1=explode_units(data)
df1 = df1.loc[:, ['levels', 'units_lst']]
df1['levels']=df1.levels.apply(lambda x: int(x) if x!='nan' else 0)
df1.to_csv('data_full_encoded_units.csv')

In [11]:
data

Unnamed: 0,patch_version,match_id,puuid,placement,level,gold_left,last_round,time_in_game,total_dmg_to_players,players_eliminated,chosen_unit,chosen_trait,units,traits,units_lst,levels,traits_lst,traits_level,tmp
0,10.19,EUW1_4817498958,KzseHzU3hoWVccjzPLRHVynRYxLT_Ww27nQpZSMXsa4YFp...,1,9,22,40,2461.936768,181,2,,,"{'Nidalee': {'items': [99, 5, 36], 'stars': 2}...","{'Divine': 1, 'Emperor': 1, 'Fortune': 3, 'Kee...","[Nidalee, Teemo, Irelia, Yuumi, Jinx, Katarina...","[2, 2, 2, 2, 2, 2, 2, 2, 2]","[Divine, Emperor, Fortune, Keeper, Adept, Assa...","[1, 1, 3, 1, 2, 1, 1, 2, 1, 2, 1, 4, 3]","[(2, Nidalee), (2, Teemo), (2, Irelia), (2, Yu..."
1,10.19,EUW1_4817498958,0i6nCStgF5A3Bg-xmHvcS9Htjxo3t-BMcGBTfgG_VDc8H0...,2,9,1,40,2461.936768,161,0,,,"{'Elise': {'items': [], 'stars': 2}, 'TwistedF...","{'Cultist': 8, 'Divine': 1, 'Duelist': 2, 'Dus...","[Elise, TwistedFate, Pyke, Kalista, Evelynn, A...","[2, 2, 2, 3, 2, 2, 2, 2, 2]","[Cultist, Divine, Duelist, Dusk, Keeper, Assas...","[8, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1]","[(2, Elise), (2, TwistedFate), (2, Pyke), (3, ..."
2,10.19,EUW1_4817498958,xbmh7BXRjtNjZaUGkiY7kPAbx85q6CqkmR2CpijLu9Icz7...,3,9,1,37,2279.858154,116,1,,,"{'Elise': {'items': [77, 57, 17], 'stars': 2},...","{'Cultist': 6, 'Emperor': 1, 'Keeper': 2, 'Ass...","[Elise, Zed, Pyke, Evelynn, Aatrox, Jhin, Azir...","[2, 2, 1, 2, 2, 2, 1, 1, 2]","[Cultist, Emperor, Keeper, Assassin, Mystic, N...","[6, 1, 2, 1, 1, 1, 4, 1, 1, 1, 1]","[(2, Elise), (2, Zed), (1, Pyke), (2, Evelynn)..."
3,10.19,EUW1_4817498958,O7P3r5EcNYdWWWaXR_pRtrE92G-Recxed8nwTCe5FsNBNG...,4,8,1,35,2189.656006,125,3,,,"{'Maokai': {'items': [], 'stars': 2}, 'Hecarim...","{'Boss': 1, 'Dusk': 1, 'Assassin': 1, 'Brawler...","[Maokai, Hecarim, Lulu, Nunu, Akali, Veigar, S...","[2, 2, 3, 2, 2, 2, 2, 2]","[Boss, Dusk, Assassin, Brawler, Elderwood, Mag...","[1, 1, 1, 3, 6, 3, 1, 1]","[(2, Maokai), (2, Hecarim), (3, Lulu), (2, Nun..."
4,10.19,EUW1_4817498958,OpwDkmPPsNS2P-1XKuR_jpDkScZFzPYYbk4eUJ8gMTfVV-...,5,8,2,34,2123.929199,80,0,,,"{'Maokai': {'items': [], 'stars': 2}, 'Irelia'...","{'Divine': 2, 'Hunter': 3, 'Adept': 2, 'Brawle...","[Maokai, Irelia, Yuumi, Kindred, Warwick, Ashe...","[2, 2, 2, 3, 2, 2, 2, 2]","[Divine, Hunter, Adept, Brawler, Dazzler, Elde...","[2, 3, 2, 2, 1, 2, 2, 2, 1, 2]","[(2, Maokai), (2, Irelia), (2, Yuumi), (3, Kin..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
748923,11.10,EUW1_5020121764,d4DgJxwivxJGLjjGyd7B1pnNXWwK6nl04IVXRHnF0Br9m6...,4,8,1,34,1878.187378,97,1,Kindred,Hunter,"{'Garen': {'items': [67], 'stars': 1}, 'Jarvan...","{'Cultist': 2, 'Emperor': 1, 'Fortune': 1, 'Hu...","[Garen, JarvanIV, Vi, Pyke, Kindred, Katarina,...","[1, 2, 2, 1, 2, 2, 2, 1]","[Cultist, Emperor, Fortune, Hunter, Keeper, As...","[2, 1, 1, 2, 2, 2, 1, 1, 2, 6]","[(1, Garen), (2, JarvanIV), (2, Vi), (1, Pyke)..."
748924,11.10,EUW1_5020121764,gcIoQY-mb-OIYwzkQfB9pDvCrmsCofUCi0PUSFNxcMX4Ij...,5,8,1,33,1821.966309,81,0,Zed,Shade,"{'Zed': {'items': [22, 69, 16], 'stars': 3}, '...","{'Cultist': 2, 'Dusk': 1, 'Keeper': 1, 'Adept'...","[Zed, Pyke, Akali, Kennen, Evelynn, Cassiopeia...","[3, 2, 2, 2, 2, 2, 1, 2]","[Cultist, Dusk, Keeper, Adept, Assassin, Mysti...","[2, 1, 1, 1, 2, 2, 4, 4, 1]","[(3, Zed), (2, Pyke), (2, Akali), (2, Kennen),..."
748925,11.10,EUW1_5020121764,DbeBQYfXc2j06t_l6yRjLmjuabo1To_OWys6hhRb5gzM_u...,6,9,0,31,1726.449097,120,0,Lillia,Mage,"{'JarvanIV': {'items': [], 'stars': 2}, 'Ireli...","{'Divine': 1, 'Duelist': 1, 'Dusk': 1, 'Fortun...","[JarvanIV, Irelia, Yuumi, XinZhao, Ahri, Sejua...","[2, 1, 2, 2, 1, 2, 2, 2, 1]","[Divine, Duelist, Dusk, Fortune, Keeper, Adept...","[1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 2, 1, 2, 2, 3]","[(2, JarvanIV), (1, Irelia), (2, Yuumi), (2, X..."
748926,11.10,EUW1_5020121764,iXwIYX5PjES_IudRXmnID8jOrAPUr6XjDb1JNvra3anRVw...,7,8,1,31,1729.049438,80,0,Riven,Dusk,"{'Vayne': {'items': [], 'stars': 1}, 'Jhin': {...","{'Cultist': 2, 'Dusk': 4, 'Emperor': 1, 'Keepe...","[Vayne, Jhin, Riven, Shen, Azir, Yone, Lillia,...","[1, 1, 2, 1, 1, 1, 2, 1]","[Cultist, Dusk, Emperor, Keeper, Adept, Exile,...","[2, 4, 1, 2, 2, 1, 1, 2, 1, 2, 1]","[(1, Vayne), (1, Jhin), (2, Riven), (1, Shen),..."


In [12]:
df1.head()

Unnamed: 0,levels,units_lst
0,2,Nidalee
0,2,Teemo
0,2,Irelia
0,2,Yuumi
0,2,Jinx


In [13]:
df2=explode_traits(data)
df2 = df2.loc[:, ['traits_level','traits_lst']]
df2['traits_level']=df2.traits_level.apply(lambda x: int(x) if x!='nan' else 0)

In [14]:
df2.head()

Unnamed: 0,traits_level,traits_lst
0,1,Divine
0,1,Emperor
0,3,Fortune
0,1,Keeper
0,2,Adept


In [15]:
df2.to_csv('data_full_encoded_traits.csv')