# Importing Libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")
from getMatchData import get_match_data
import numpy as np
import pandas as pd
import getPlayersData as gpd

In [2]:
# Load the raw match data through the project helper get_match_data
# (imported from getMatchData; where it reads from is not shown in this notebook).
match_data = get_match_data()

Getting data from yesterdays existing file


In [3]:
# Preview the last rows of the loaded frame as a quick sanity check
match_data.tail()

Unnamed: 0,gameid,datacompleteness,url,league,year,split,playoffs,date,game,patch,...,opp_csat15,golddiffat15,xpdiffat15,csdiffat15,killsat15,assistsat15,deathsat15,opp_killsat15,opp_assistsat15,opp_deathsat15
108691,ESPORTSTMNT06_2540953,complete,,CBLOLA,2022,Split 2,0,2022-07-26 23:23:27,1,12.13,...,140.0,-1767.0,57.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0
108692,ESPORTSTMNT06_2540953,complete,,CBLOLA,2022,Split 2,0,2022-07-26 23:23:27,1,12.13,...,118.0,735.0,24.0,4.0,2.0,1.0,0.0,1.0,1.0,1.0
108693,ESPORTSTMNT06_2540953,complete,,CBLOLA,2022,Split 2,0,2022-07-26 23:23:27,1,12.13,...,22.0,-3.0,-651.0,-4.0,1.0,2.0,1.0,0.0,3.0,2.0
108694,ESPORTSTMNT06_2540953,complete,,CBLOLA,2022,Split 2,0,2022-07-26 23:23:27,1,12.13,...,464.0,2985.0,1331.0,32.0,4.0,8.0,3.0,3.0,6.0,4.0
108695,ESPORTSTMNT06_2540953,complete,,CBLOLA,2022,Split 2,0,2022-07-26 23:23:27,1,12.13,...,496.0,-2985.0,-1331.0,-32.0,3.0,6.0,4.0,4.0,8.0,3.0


In [4]:
# Look at the four identifier columns as a raw NumPy array.
# In the displayed rows, playerid is nan where position is "team".
match_data[["gameid", "teamid", "playerid", "position"]].values

array([['ESPORTSTMNT01_2690210',
        'oe:team:68911b3329146587617ab2973106e23',
        'oe:player:38e0af7278d6769d0c81d7c4b47ac1e', 'top'],
       ['ESPORTSTMNT01_2690210',
        'oe:team:68911b3329146587617ab2973106e23',
        'oe:player:637ed20b1e41be1c51bd1a4cb211357', 'jng'],
       ['ESPORTSTMNT01_2690210',
        'oe:team:68911b3329146587617ab2973106e23',
        'oe:player:d1ae0e2f9f3ac1e0e0cdcb86504ca77', 'mid'],
       ...,
       ['ESPORTSTMNT06_2540953',
        'oe:team:391e50581a00cea9f06b2d845b4de05',
        'oe:player:a5a630a5d4f0d3151f3c98d2d98834b', 'sup'],
       ['ESPORTSTMNT06_2540953',
        'oe:team:4f9786e33291cc490057a93110cbee5', nan, 'team'],
       ['ESPORTSTMNT06_2540953',
        'oe:team:391e50581a00cea9f06b2d845b4de05', nan, 'team']],
      dtype=object)

In [5]:
def invalid_games_checker(invalid_str="NA", data=None):
    """Report whether any identifier cell contains a given placeholder.

    The columns ``gameid``, ``teamid``, ``playerid`` and ``position`` are
    scanned. If at least one string cell matches, a single line is printed:
    ``Has <invalid_str>``, or ``Has empty string`` when ``invalid_str`` is
    ``""``. Nothing is printed when there is no match.

    Parameters
    ----------
    invalid_str : str, default "NA"
        Placeholder to look for. A non-empty value is matched as a
        case-sensitive substring of the cell. The empty string is matched
        against cells that are empty or whitespace-only, because ``"" in s``
        is true for every string and would report a hit on any data.
    data : pandas.DataFrame, optional
        Frame to scan. Defaults to the notebook-level ``match_data``.

    Returns
    -------
    None
    """
    if data is None:
        # Fall back to the frame loaded earlier in the notebook.
        data = match_data

    id_columns = ["gameid", "teamid", "playerid", "position"]

    if invalid_str == "":
        def is_invalid(text):
            return text.strip() == ""
    else:
        def is_invalid(text):
            return invalid_str in text

    # Only string cells are tested; missing values (float nan) and any other
    # non-string objects are skipped. any() stops at the first hit.
    found = any(
        isinstance(item, str) and is_invalid(item)
        for row in data[id_columns].values
        for item in row
    )

    if found:
        if invalid_str:
            print(f"Has {invalid_str}")
        else:
            print("Has empty string")

In [6]:
# Scan the identifier columns for each placeholder in turn:
# "NA", the empty string, "nan" and "null".
for placeholder in ("NA", "", "nan", "null"):
    invalid_games_checker(invalid_str=placeholder)



Has NA
Has empty string


In [7]:
# (rows, columns) of the frame before any cleaning step has run
match_data.shape

(108696, 123)

# Cleaning and formatting data

In [8]:
# Drop every row that belongs to a game whose id contains the marker "NA".
# NOTE(review): this is a case-sensitive substring match, so any gameid that
# merely contains the letters "NA" is dropped too — confirm that only
# placeholder ids match.
# na=False leaves rows with a missing gameid in place at this step.
has_na_marker = match_data["gameid"].str.contains("NA", regex=False, na=False)

# One entry per offending row, in row order.
invalid_game_ids = match_data.loc[has_na_marker, "gameid"].tolist()

match_data = match_data[~has_na_marker].copy()

In [9]:
# (rows, columns) after removing the games whose id contains "NA"
match_data.shape

(102840, 123)

In [10]:
# Turn blank or placeholder identifier values into real missing values
# (np.nan) so that the NA-aware pandas methods can see them.
format_dict = {
    "": np.nan,
    "nan": np.nan,
    "null": np.nan,
}
id_columns = ["gameid", "teamid", "playerid", "position"]

# Trim surrounding whitespace first so that values such as " " or "null "
# match the placeholders exactly. .str.strip() does not cast to str: entries
# that are not strings come back as NaN.
# position is stripped as well, since it is remapped below like the others.
for column in id_columns:
    match_data[column] = match_data[column].str.strip()

# Per-column, exact-match replacement of the placeholders with np.nan
# https://sparkbyexamples.com/pandas/pandas-remap-values-in-column-with-a-dictionary-dict/#:~:text=Using%20Pandas%20DataFrame.-,replace(),regular%20expressions%20for%20regex%20substitutions.
match_data = match_data.replace({column: format_dict for column in id_columns})

In [11]:
# (rows, columns) after normalising placeholders; no rows are dropped by that step
match_data.shape

(102840, 123)

In [12]:
# Discard rows that are missing either the game id or the position
match_data = match_data.dropna(subset=["gameid", "position"])

In [13]:
# (rows, columns) after dropping rows with a missing gameid or position
match_data.shape

(102816, 123)

In [14]:
# Remove fully identical rows, keeping the first occurrence of each
match_data = match_data.drop_duplicates(keep="first")

In [15]:
# (rows, columns) after de-duplication
match_data.shape

(102816, 123)

- The shape is unchanged after `drop_duplicates()` (102816 rows before and after), so the data contains no duplicate rows.

# Separating players data from match data

In [16]:
# Keep only the per-player rows: every row whose position is not "team"
is_player_row = match_data["position"] != "team"
player_data = match_data.loc[is_player_row]
player_data.head()

Unnamed: 0,gameid,datacompleteness,url,league,year,split,playoffs,date,game,patch,...,opp_csat15,golddiffat15,xpdiffat15,csdiffat15,killsat15,assistsat15,deathsat15,opp_killsat15,opp_assistsat15,opp_deathsat15
0,ESPORTSTMNT01_2690210,complete,,LCK CL,2022,Spring,0,2022-01-10 07:44:08,1,12.01,...,121.0,391.0,345.0,14.0,0.0,1.0,0.0,0.0,1.0,0.0
1,ESPORTSTMNT01_2690210,complete,,LCK CL,2022,Spring,0,2022-01-10 07:44:08,1,12.01,...,100.0,541.0,-275.0,-11.0,2.0,3.0,2.0,0.0,5.0,1.0
2,ESPORTSTMNT01_2690210,complete,,LCK CL,2022,Spring,0,2022-01-10 07:44:08,1,12.01,...,119.0,-475.0,153.0,1.0,0.0,3.0,0.0,3.0,3.0,2.0
3,ESPORTSTMNT01_2690210,complete,,LCK CL,2022,Spring,0,2022-01-10 07:44:08,1,12.01,...,149.0,-793.0,-1343.0,-34.0,2.0,1.0,2.0,3.0,3.0,0.0
4,ESPORTSTMNT01_2690210,complete,,LCK CL,2022,Spring,0,2022-01-10 07:44:08,1,12.01,...,21.0,443.0,-497.0,7.0,1.0,2.0,2.0,0.0,6.0,2.0


In [17]:
# Summarise the player rows: index range, column count, dtypes and memory usage
player_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85680 entries, 0 to 108693
Columns: 123 entries, gameid to opp_deathsat15
dtypes: float64(93), int64(12), object(18)
memory usage: 81.1+ MB


In [18]:
# Check how the date column is stored before converting it
# (the displayed output shows plain Python str values).
print(type(player_data["date"].values[0]))

<class 'str'>


In [19]:
# Convert the date column from strings to datetimes.
# The unit is spelled out ("datetime64[ns]") because pandas 2.x refuses to
# cast to the unit-less "datetime64" alias.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.astype.html
# https://www.geeksforgeeks.org/convert-the-column-type-from-string-to-datetime-format-in-pandas-dataframe/
player_data = player_data.astype({"date": "datetime64[ns]"})

In [20]:
# Confirm the conversion: the first date value should now be a numpy.datetime64
print(type(player_data.date.values[0]))

<class 'numpy.datetime64'>


# Downloading the full players data

In [21]:
# Hand the cleaned player rows to the project helper in getPlayersData
# (imported as gpd — the project module, not geopandas).
# NOTE(review): presumably writes the frame to a CSV file, judging by the name;
# the output path and any extra processing are defined in that module — confirm.
gpd.players_data_to_csv(player_data)