# Pre-Processing [REDACTED]

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as scp
import plotly.express as px
import matplotlib.pyplot as plt

In [3]:
def cek_null(df):
    col_na = df.isnull().sum().sort_values(ascending=False)
    percent = col_na / len(df) * 100

    missing_data = pd.concat([col_na, percent], axis=1, keys=['Total', 'Percent'])
    print(missing_data[missing_data['Total'] > 0])

def cek_duplikat(df):
    dup = df.duplicated()
    print("Jumlah duplikasi data : " + str(dup.sum()))

Pada kasus pre-processing sebelumnya, ternyata ditemukan masalah di mana informasi Patch 0.0 menghilang dari `final_dataset.csv`

In [4]:
df_final = pd.read_csv('dataset final/final_dataset.csv')
df_final[df_final['Patch'] == 0.0]

Unnamed: 0,GameID,PlayerName,TeamAbbreviation,Agent,ACS,Kills,Deaths,Assists,PlusMinus,ADR,...,Patch,EventID,EventName,EventStage,Team1ID,Team2ID,Team1,Team2,Team1_MapScore,Team2_MapScore


Padahal di akhir processing untuk `matches.csv` informasi Patch 0.0 masih ada

In [5]:
df_matches_final = pd.read_csv('dataset final/matches_final.csv')
df_matches_final[df_matches_final['Patch'] == 0.0]

Unnamed: 0,MatchID,Date,Patch,EventID,EventName,EventStage,Team1ID,Team2ID,Team1,Team2,Team1_MapScore,Team2_MapScore
6196,209,2020-06-14 14:00:00,0.0,8,Absolute Masters,Group Stage: Group H,3,63,Ninjas in Pyjamas,InetGamer,2,0
6212,205,2020-06-13 14:30:00,0.0,8,Absolute Masters,Group Stage: Group G,21,101,Prodigy,Paraplegic Buffalos,2,0
6304,178,2020-06-04 14:00:00,0.0,8,Absolute Masters,Group Stage: Group B,86,85,SimpleMinecraftPlayers,Absolute Legends,1,2
6308,413,2020-05-25 08:00:00,0.0,20,Take the Throne,#4: Grand Final,25,20,fish123,StartedFromCS,2,1
6309,174,2020-05-24 14:30:00,0.0,8,Absolute Masters,Group Stage: Group H,102,63,KK VALORANT,InetGamer,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...
6396,10,2020-05-02 15:00:00,0.0,2,Valhalla Invitational,Week 1: Group A,4,11,smoke,Obey Alliance,1,0
6397,12,2020-05-02 15:00:00,0.0,2,Valhalla Invitational,Week 1: Group B,10,8,Highground,JSD,1,0
6398,8,2020-05-02 14:00:00,0.0,2,Valhalla Invitational,Week 1: Group A,4,6,smoke,Orgless,1,0
6399,7,2020-05-02 13:00:00,0.0,2,Valhalla Invitational,Week 1: Group A,4,5,smoke,Last Minute Heroes,1,0


Kami menemukan bahwa hal ini terjadi karena pada pemrosesan `games.csv` terdapat penghilangan row data yang kolomnya berisikan value NaN, seperti yang ditujukan kode berikut, misalnya untuk `MatchID` `209`

In [6]:
df_games = pd.read_csv('dataset/games.csv')
df_games[df_games['MatchID'] == 209]

Unnamed: 0,No,GameID,MatchID,Map,Team1ID,Team2ID,Team1,Team2,Winner,Team1_Eco,Team1_SemiEco,Team1_SemiBuy,Team1_FullBuy,Team1_TotalRounds,Team2_Eco,Team2_SemiEco,Team2_SemiBuy,Team2_FullBuy,Team2_TotalRounds
12708,15635,351,209,Bind,3,63,Ninjas in Pyjamas,InetGamer,1,,,,,13,,,,,2
12709,15636,352,209,Haven,3,63,Ninjas in Pyjamas,InetGamer,1,,,,,13,,,,,9


In [7]:
df_final_games = pd.read_csv('dataset final/games_final.csv') # dataset games.csv setelah diproses
df_final_games[df_final_games['MatchID'] == 209]

Unnamed: 0,GameID,MatchID,Map,Team1ID,Team2ID,Team1,Team2,Winner,Team1_Eco,Team1_SemiEco,Team1_SemiBuy,Team1_FullBuy,Team1_TotalRounds,Team2_Eco,Team2_SemiEco,Team2_SemiBuy,Team2_FullBuy,Team2_TotalRounds


Akhirnya, kami memutuskan untuk menetapkan 2 dataset final berbeda di mana untuk dataset final yang baru ini akan ditujukan untuk analisis data yang membutuhkan informasi Patch 0.0.

## Tabel Games

In [8]:
df_games

Unnamed: 0,No,GameID,MatchID,Map,Team1ID,Team2ID,Team1,Team2,Winner,Team1_Eco,Team1_SemiEco,Team1_SemiBuy,Team1_FullBuy,Team1_TotalRounds,Team2_Eco,Team2_SemiEco,Team2_SemiBuy,Team2_FullBuy,Team2_TotalRounds
0,0,60894,62393,Breeze,6903,6020,Booster Seat Gaming,Pho Real,1,2.0,0.0,5.0,13.0,13,4.0,2.0,4.0,10.0,7
1,1,60895,62393,Bind,6903,6020,Booster Seat Gaming,Pho Real,2,3.0,3.0,5.0,4.0,2,2.0,0.0,4.0,9.0,13
2,2,60896,62393,Haven,6903,6020,Booster Seat Gaming,Pho Real,1,2.0,2.0,5.0,12.0,13,2.0,2.0,6.0,11.0,8
3,3,60924,62403,Icebox,7046,7047,Bjor's Kittens,Mugiwara,1,2.0,2.0,4.0,11.0,13,4.0,1.0,2.0,12.0,6
4,4,60925,62403,Haven,7046,7047,Bjor's Kittens,Mugiwara,1,4.0,2.0,3.0,13.0,13,3.0,3.0,4.0,12.0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12956,15883,6,10,Haven,4,11,smoke,Obey Alliance,2,,,,,0,,,,,0
12957,15884,8,12,Bind,10,8,Highground,JSD,2,,,,,0,,,,,0
12958,15885,4,8,Haven,4,6,smoke,Orgless,2,,,,,0,,,,,0
12959,15886,3,7,Split,4,5,smoke,Last Minute Heroes,2,,,,,0,,,,,0


In [9]:
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12961 entries, 0 to 12960
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   No                 12961 non-null  int64  
 1   GameID             12961 non-null  int64  
 2   MatchID            12961 non-null  int64  
 3   Map                12961 non-null  object 
 4   Team1ID            12961 non-null  int64  
 5   Team2ID            12961 non-null  int64  
 6   Team1              12961 non-null  object 
 7   Team2              12961 non-null  object 
 8   Winner             12961 non-null  int64  
 9   Team1_Eco          11927 non-null  float64
 10  Team1_SemiEco      11927 non-null  float64
 11  Team1_SemiBuy      11927 non-null  float64
 12  Team1_FullBuy      11927 non-null  float64
 13  Team1_TotalRounds  12961 non-null  int64  
 14  Team2_Eco          11927 non-null  float64
 15  Team2_SemiEco      11927 non-null  float64
 16  Team2_SemiBuy      119

Setelah mengkaji ulang, terdapat beberapa kolom yang kurang begitu penting untuk analisis data yang melibatkan informasi `Patch`, yakni:  
`No`, karena hanya sebagai identifier  
`Team1_Eco`, `Team1_SemiEco`, `Team1_SemiBuy`, `Team1_FullBuy`, `Team2_Eco`,`Team2_SemiEco`, `Team2_SemiBuy`, `Team2_FullBuy`, semua kolom ini berisikan informasi ekonomi tim selama games. Informasi ini tidak relevan untuk analisis data yang melibatkan informasi `Patch`

In [10]:
col_to_drop = ['No', 'Team1_Eco', 'Team1_SemiEco', 'Team1_SemiBuy', 'Team1_FullBuy', 'Team2_Eco', 'Team2_SemiEco', 'Team2_SemiBuy', 'Team2_FullBuy']
df_games_col_drop = df_games.drop(col_to_drop, axis=1)
df_games_col_drop

Unnamed: 0,GameID,MatchID,Map,Team1ID,Team2ID,Team1,Team2,Winner,Team1_TotalRounds,Team2_TotalRounds
0,60894,62393,Breeze,6903,6020,Booster Seat Gaming,Pho Real,1,13,7
1,60895,62393,Bind,6903,6020,Booster Seat Gaming,Pho Real,2,2,13
2,60896,62393,Haven,6903,6020,Booster Seat Gaming,Pho Real,1,13,8
3,60924,62403,Icebox,7046,7047,Bjor's Kittens,Mugiwara,1,13,6
4,60925,62403,Haven,7046,7047,Bjor's Kittens,Mugiwara,1,13,9
...,...,...,...,...,...,...,...,...,...,...
12956,6,10,Haven,4,11,smoke,Obey Alliance,2,0,0
12957,8,12,Bind,10,8,Highground,JSD,2,0,0
12958,4,8,Haven,4,6,smoke,Orgless,2,0,0
12959,3,7,Split,4,5,smoke,Last Minute Heroes,2,0,0


In [11]:
cek_null(df_games_col_drop)

Empty DataFrame
Columns: [Total, Percent]
Index: []


Perbedaannya, Map dengan value `TBD` ikut terbawa. Informasi ini kurang relevan untuk membantu analisis kami untuk poin eksplorasi, sehingga dianggap sebagai nilai NaN dan dihilangkan

In [12]:
df_games_col_drop['Map'].unique()

array(['Breeze', 'Bind', 'Haven', 'Icebox', 'Ascent', 'Split', 'Fracture',
       'TBD'], dtype=object)

In [13]:
tbd_count = df_games_col_drop[df_games_col_drop['Map'] == 'TBD'].shape[0] # jumlah row dengan nilai TBD
tbd_count_percent = tbd_count / df_games_col_drop.shape[0] * 100
tbd_count_percent

0.46292724326826634

In [14]:
df_games_drop_tbd = df_games_col_drop[df_games_col_drop['Map'] != 'TBD']
df_games_drop_tbd['Map'].unique()

array(['Breeze', 'Bind', 'Haven', 'Icebox', 'Ascent', 'Split', 'Fracture'],
      dtype=object)

## Cek Duplikat

In [15]:
cek_duplikat(df_games_drop_tbd)

Jumlah duplikasi data : 0


In [16]:
df_games_final = df_games_drop_tbd

In [17]:
df_games_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12901 entries, 0 to 12960
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   GameID             12901 non-null  int64 
 1   MatchID            12901 non-null  int64 
 2   Map                12901 non-null  object
 3   Team1ID            12901 non-null  int64 
 4   Team2ID            12901 non-null  int64 
 5   Team1              12901 non-null  object
 6   Team2              12901 non-null  object
 7   Winner             12901 non-null  int64 
 8   Team1_TotalRounds  12901 non-null  int64 
 9   Team2_TotalRounds  12901 non-null  int64 
dtypes: int64(7), object(3)
memory usage: 1.1+ MB


# Tabel Scores

In [18]:
df_scores = pd.read_csv('dataset/scores.csv')
df_scores

Unnamed: 0,No,GameID,PlayerID,PlayerName,TeamAbbreviation,Agent,ACS,Kills,Deaths,Assists,...,Num_4Ks,Num_5Ks,OnevOne,OnevTwo,OnevThree,OnevFour,OnevFive,Econ,Plants,Defuses
0,0,60894,8419.0,Reduxx,Boos,jett,313.0,24.0,10.0,3.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,74.0,0.0,0.0
1,1,60894,466.0,ChurmZ,Boos,chamber,227.0,16.0,10.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67.0,2.0,0.0
2,2,60894,3712.0,diaamond,Boos,sova,226.0,17.0,9.0,8.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,58.0,3.0,0.0
3,3,60894,5099.0,Boltzy,Boos,viper,218.0,17.0,12.0,2.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,48.0,0.0,0.0
4,4,60894,3983.0,Virtyy,Boos,skye,80.0,5.0,13.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128774,157934,13,24.0,Gover,,,0.0,0.0,0.0,0.0,...,,,,,,,,,,
128775,157935,13,25.0,Jack1,,,0.0,0.0,0.0,0.0,...,,,,,,,,,,
128776,157936,13,26.0,Rewind,,,0.0,0.0,0.0,0.0,...,,,,,,,,,,
128777,157937,13,27.0,Woo1y,,,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [19]:
df_scores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128779 entries, 0 to 128778
Data columns (total 29 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   No                128779 non-null  int64  
 1   GameID            128779 non-null  int64  
 2   PlayerID          128692 non-null  float64
 3   PlayerName        128779 non-null  object 
 4   TeamAbbreviation  126763 non-null  object 
 5   Agent             124679 non-null  object 
 6   ACS               128249 non-null  float64
 7   Kills             128289 non-null  float64
 8   Deaths            128289 non-null  float64
 9   Assists           128289 non-null  float64
 10  PlusMinus         127026 non-null  float64
 11  KAST_Percent      2637 non-null    float64
 12  ADR               119904 non-null  float64
 13  HS_Percent        119307 non-null  float64
 14  FirstKills        128249 non-null  float64
 15  FirstDeaths       119314 non-null  float64
 16  FKFD_PlusMinus    11

In [20]:
cek_null(df_scores)

                   Total    Percent
KAST_Percent      126142  97.952306
Defuses            10195   7.916663
OnevThree          10195   7.916663
Num_3Ks            10195   7.916663
Num_2Ks            10195   7.916663
OnevOne            10195   7.916663
Num_5Ks            10195   7.916663
OnevTwo            10195   7.916663
Num_4Ks            10195   7.916663
OnevFour           10195   7.916663
OnevFive           10195   7.916663
Econ               10195   7.916663
Plants             10195   7.916663
HS_Percent          9472   7.355236
FirstDeaths         9465   7.349801
FKFD_PlusMinus      9465   7.349801
ADR                 8875   6.891652
Agent               4100   3.183749
TeamAbbreviation    2016   1.565473
PlusMinus           1753   1.361247
FirstKills           530   0.411558
ACS                  530   0.411558
Assists              490   0.380497
Deaths               490   0.380497
Kills                490   0.380497
PlayerID              87   0.067558


In [21]:
df_scores[df_scores['PlayerName'] == 'Bob']['PlayerID'].unique()

array([ 873., 7854.])

In [22]:
cek_null(df_scores[df_scores['PlayerID'] == 873])

                Total     Percent
KAST_Percent      195  100.000000
Defuses             9    4.615385
Num_3Ks             9    4.615385
FKFD_PlusMinus      9    4.615385
FirstDeaths         9    4.615385
Num_4Ks             9    4.615385
HS_Percent          9    4.615385
ADR                 9    4.615385
Num_5Ks             9    4.615385
OnevOne             9    4.615385
OnevTwo             9    4.615385
OnevThree           9    4.615385
OnevFour            9    4.615385
OnevFive            9    4.615385
Econ                9    4.615385
Plants              9    4.615385
Num_2Ks             9    4.615385
Agent               1    0.512821


In [23]:
df_scores_col_drop = df_scores.drop(['KAST_Percent', 'PlayerID', 'No'], axis=1)
df_scores_col_drop

Unnamed: 0,GameID,PlayerName,TeamAbbreviation,Agent,ACS,Kills,Deaths,Assists,PlusMinus,ADR,...,Num_4Ks,Num_5Ks,OnevOne,OnevTwo,OnevThree,OnevFour,OnevFive,Econ,Plants,Defuses
0,60894,Reduxx,Boos,jett,313.0,24.0,10.0,3.0,14.0,195.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,74.0,0.0,0.0
1,60894,ChurmZ,Boos,chamber,227.0,16.0,10.0,7.0,6.0,161.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67.0,2.0,0.0
2,60894,diaamond,Boos,sova,226.0,17.0,9.0,8.0,8.0,148.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,58.0,3.0,0.0
3,60894,Boltzy,Boos,viper,218.0,17.0,12.0,2.0,5.0,141.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,48.0,0.0,0.0
4,60894,Virtyy,Boos,skye,80.0,5.0,13.0,3.0,-8.0,55.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128774,13,Gover,,,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,
128775,13,Jack1,,,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,
128776,13,Rewind,,,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,
128777,13,Woo1y,,,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,


In [24]:
# # Untuk melihat beberapa GameID yang seharusnya tidak hilang setelah diproses

# match_patch0 = df_matches_final[df_matches_final['Patch'] == 0.0]

# matchID = []

# for label, value in match_patch0['MatchID'].items():
#     matchID.append(value)

# gameID = []

# for id in matchID:
#     game_patch0 = df_games_final[df_games_final['MatchID'] == id]
#     for game in game_patch0['GameID'].unique():
#         if game not in gameID:
#             gameID.append(game)

# print(f'gameID size = {len(gameID)}')

# patch0_agent = []

# for id in gameID:
#     data_patch0 = df_scores[df_scores['GameID'] == id]
#     agent = data_patch0['Agent'].unique()
#     if len(agent) == 1:
#         if pd.isnull(agent[0]) == False:
#             patch0_agent.append(id)
#     else:
#         for item in agent:
#             if pd.isnull(item) == False:
#                 if id not in patch0_agent:
#                     patch0_agent.append(id)

# print(f'Patch 0.0 with Agent = {len(patch0_agent)}')
# print(patch0_agent)

Pada tahapan ini hanya dilakukan penghilangan pada row dengan value Agent NaN.

In [25]:
df_scores_null_drop = df_scores_col_drop.dropna(subset=['Agent'])
cek_null(df_scores_null_drop)

                  Total   Percent
Defuses            6095  4.888554
Num_2Ks            6095  4.888554
Plants             6095  4.888554
Econ               6095  4.888554
OnevFive           6095  4.888554
OnevFour           6095  4.888554
OnevThree          6095  4.888554
OnevTwo            6095  4.888554
OnevOne            6095  4.888554
Num_5Ks            6095  4.888554
Num_4Ks            6095  4.888554
Num_3Ks            6095  4.888554
HS_Percent         5372  4.308665
FKFD_PlusMinus     5365  4.303050
FirstDeaths        5365  4.303050
ADR                4915  3.942123
TeamAbbreviation   1757  1.409219
FirstKills          165  0.132340
ACS                 165  0.132340
PlusMinus           145  0.116299
Assists             145  0.116299
Deaths              145  0.116299
Kills               145  0.116299


In [26]:
df_scores_final = df_scores_null_drop

# Menggabungkan Tabel

In [27]:
print(df_scores_final.columns)
print(df_games_final.columns)
print(df_matches_final.columns)

Index(['GameID', 'PlayerName', 'TeamAbbreviation', 'Agent', 'ACS', 'Kills',
       'Deaths', 'Assists', 'PlusMinus', 'ADR', 'HS_Percent', 'FirstKills',
       'FirstDeaths', 'FKFD_PlusMinus', 'Num_2Ks', 'Num_3Ks', 'Num_4Ks',
       'Num_5Ks', 'OnevOne', 'OnevTwo', 'OnevThree', 'OnevFour', 'OnevFive',
       'Econ', 'Plants', 'Defuses'],
      dtype='object')
Index(['GameID', 'MatchID', 'Map', 'Team1ID', 'Team2ID', 'Team1', 'Team2',
       'Winner', 'Team1_TotalRounds', 'Team2_TotalRounds'],
      dtype='object')
Index(['MatchID', 'Date', 'Patch', 'EventID', 'EventName', 'EventStage',
       'Team1ID', 'Team2ID', 'Team1', 'Team2', 'Team1_MapScore',
       'Team2_MapScore'],
      dtype='object')


In [28]:
print(f"Jumlah baris scores: {df_scores_final.count().unique()}")
print(f"Jumlah baris scores: {df_games_final.count().unique()}")
print(f"Jumlah baris scores: {df_matches_final.count().unique()}")

Jumlah baris scores: [124679 122922 124514 124534 119764 119307 119314 118584]
Jumlah baris scores: [12901]
Jumlah baris scores: [6401]


## Menggabungkan Matches dengan Games

In [29]:
df_matches_games = df_games_final.merge(df_matches_final, left_on='MatchID', right_on='MatchID')

In [30]:
df_matches_games.columns

Index(['GameID', 'MatchID', 'Map', 'Team1ID_x', 'Team2ID_x', 'Team1_x',
       'Team2_x', 'Winner', 'Team1_TotalRounds', 'Team2_TotalRounds', 'Date',
       'Patch', 'EventID', 'EventName', 'EventStage', 'Team1ID_y', 'Team2ID_y',
       'Team1_y', 'Team2_y', 'Team1_MapScore', 'Team2_MapScore'],
      dtype='object')

In [31]:
df_matches_games['Patch'].value_counts()

Patch
3.0    4748
2.0    4364
1.0    3701
0.0      88
Name: count, dtype: int64

In [32]:
df_matches_games[df_matches_games['GameID'] == 339]

Unnamed: 0,GameID,MatchID,Map,Team1ID_x,Team2ID_x,Team1_x,Team2_x,Winner,Team1_TotalRounds,Team2_TotalRounds,...,Patch,EventID,EventName,EventStage,Team1ID_y,Team2ID_y,Team1_y,Team2_y,Team1_MapScore,Team2_MapScore
12711,339,205,Bind,21,101,Prodigy,Paraplegic Buffalos,1,13,2,...,0.0,8,Absolute Masters,Group Stage: Group G,21,101,Prodigy,Paraplegic Buffalos,2,0


## Menggabungkan Tabel secara keseluruhan

In [33]:
df_final = df_scores_final.merge(df_matches_games, left_on='GameID', right_on='GameID')
df_final.columns

Index(['GameID', 'PlayerName', 'TeamAbbreviation', 'Agent', 'ACS', 'Kills',
       'Deaths', 'Assists', 'PlusMinus', 'ADR', 'HS_Percent', 'FirstKills',
       'FirstDeaths', 'FKFD_PlusMinus', 'Num_2Ks', 'Num_3Ks', 'Num_4Ks',
       'Num_5Ks', 'OnevOne', 'OnevTwo', 'OnevThree', 'OnevFour', 'OnevFive',
       'Econ', 'Plants', 'Defuses', 'MatchID', 'Map', 'Team1ID_x', 'Team2ID_x',
       'Team1_x', 'Team2_x', 'Winner', 'Team1_TotalRounds',
       'Team2_TotalRounds', 'Date', 'Patch', 'EventID', 'EventName',
       'EventStage', 'Team1ID_y', 'Team2ID_y', 'Team1_y', 'Team2_y',
       'Team1_MapScore', 'Team2_MapScore'],
      dtype='object')

Ternyata terdapat tabel dengan kolom nama yang sama namun atributnya berbeda, sehingga memunculkan beberapa kolom dengan tambahan `_x` dan `_y`

In [34]:
mask = ['Team1ID_x', 'Team1ID_y', 'Team2ID_x', 'Team2ID_y','Team1_x', 'Team1_y', 'Team2_x', 'Team2_y']
temp = df_final[mask]
temp[temp['Team1_x'] != temp['Team1_y']]

Unnamed: 0,Team1ID_x,Team1ID_y,Team2ID_x,Team2ID_y,Team1_x,Team1_y,Team2_x,Team2_y
3607,6144,6144,6478,6478,The Mafia,Next-Gen(The Mafia),ex-Serenity Black,ex-Serenity Black
3608,6144,6144,6478,6478,The Mafia,Next-Gen(The Mafia),ex-Serenity Black,ex-Serenity Black
3609,6144,6144,6478,6478,The Mafia,Next-Gen(The Mafia),ex-Serenity Black,ex-Serenity Black
3610,6144,6144,6478,6478,The Mafia,Next-Gen(The Mafia),ex-Serenity Black,ex-Serenity Black
3611,6144,6144,6478,6478,The Mafia,Next-Gen(The Mafia),ex-Serenity Black,ex-Serenity Black
...,...,...,...,...,...,...,...,...
124499,74,74,134,134,Washed Players,Team Skyyart(Washed Players),Team blackelespanolito,Team blackelespanolito
124500,74,74,134,134,Washed Players,Team Skyyart(Washed Players),Team blackelespanolito,Team blackelespanolito
124501,74,74,134,134,Washed Players,Team Skyyart(Washed Players),Team blackelespanolito,Team blackelespanolito
124502,74,74,134,134,Washed Players,Team Skyyart(Washed Players),Team blackelespanolito,Team blackelespanolito


In [35]:
mask = ['Team1ID_x', 'Team1ID_y', 'Team2ID_x', 'Team2ID_y','Team1_x', 'Team1_y', 'Team2_x', 'Team2_y']
temp = df_final[mask]
temp[temp['Team2_x'].str.len() > temp['Team2_y'].str.len()]

Unnamed: 0,Team1ID_x,Team1ID_y,Team2ID_x,Team2ID_y,Team1_x,Team1_y,Team2_x,Team2_y


Setelah mengamati data tersebut, menurut kami, data `_y` memiliki informasi yang lebih lengkap ketimbang `_x` sehingga kami memilih yang `_y`

In [36]:
col_to_drop = ['Team1ID_x', 'Team2ID_x', 'Team1_x', 'Team2_x']
df_final.drop(col_to_drop, axis=1, inplace=True)
df_final.rename(columns={'Team1ID_y': 'Team1ID', 'Team2ID_y': 'Team2ID', 'Team1_y': 'Team1', 'Team2_y': 'Team2'}, inplace=True)


In [37]:
df_final.columns

Index(['GameID', 'PlayerName', 'TeamAbbreviation', 'Agent', 'ACS', 'Kills',
       'Deaths', 'Assists', 'PlusMinus', 'ADR', 'HS_Percent', 'FirstKills',
       'FirstDeaths', 'FKFD_PlusMinus', 'Num_2Ks', 'Num_3Ks', 'Num_4Ks',
       'Num_5Ks', 'OnevOne', 'OnevTwo', 'OnevThree', 'OnevFour', 'OnevFive',
       'Econ', 'Plants', 'Defuses', 'MatchID', 'Map', 'Winner',
       'Team1_TotalRounds', 'Team2_TotalRounds', 'Date', 'Patch', 'EventID',
       'EventName', 'EventStage', 'Team1ID', 'Team2ID', 'Team1', 'Team2',
       'Team1_MapScore', 'Team2_MapScore'],
      dtype='object')

# Hasil final

In [38]:
cek_null(df_final)

                  Total   Percent
OnevFour           6095  4.890711
Num_2Ks            6095  4.890711
Defuses            6095  4.890711
Plants             6095  4.890711
Econ               6095  4.890711
OnevFive           6095  4.890711
OnevThree          6095  4.890711
OnevTwo            6095  4.890711
OnevOne            6095  4.890711
Num_5Ks            6095  4.890711
Num_3Ks            6095  4.890711
Num_4Ks            6095  4.890711
HS_Percent         5372  4.310566
FirstDeaths        5365  4.304949
FKFD_PlusMinus     5365  4.304949
ADR                4915  3.943863
TeamAbbreviation   1757  1.409841
FirstKills          165  0.132398
ACS                 165  0.132398
Deaths              145  0.116350
Kills               145  0.116350
Assists             145  0.116350
PlusMinus           145  0.116350


In [39]:
cek_duplikat(df_final)

Jumlah duplikasi data : 0


In [40]:
df_final

Unnamed: 0,GameID,PlayerName,TeamAbbreviation,Agent,ACS,Kills,Deaths,Assists,PlusMinus,ADR,...,Patch,EventID,EventName,EventStage,Team1ID,Team2ID,Team1,Team2,Team1_MapScore,Team2_MapScore
0,60894,Reduxx,Boos,jett,313.0,24.0,10.0,3.0,14.0,195.0,...,3.0,826,Nerd Street Gamers Winter Championship - Regio...,Group Stage: Decider (A),6903,6020,Booster Seat Gaming,Pho Real,2,1
1,60894,ChurmZ,Boos,chamber,227.0,16.0,10.0,7.0,6.0,161.0,...,3.0,826,Nerd Street Gamers Winter Championship - Regio...,Group Stage: Decider (A),6903,6020,Booster Seat Gaming,Pho Real,2,1
2,60894,diaamond,Boos,sova,226.0,17.0,9.0,8.0,8.0,148.0,...,3.0,826,Nerd Street Gamers Winter Championship - Regio...,Group Stage: Decider (A),6903,6020,Booster Seat Gaming,Pho Real,2,1
3,60894,Boltzy,Boos,viper,218.0,17.0,12.0,2.0,5.0,141.0,...,3.0,826,Nerd Street Gamers Winter Championship - Regio...,Group Stage: Decider (A),6903,6020,Booster Seat Gaming,Pho Real,2,1
4,60894,Virtyy,Boos,skye,80.0,5.0,13.0,3.0,-8.0,55.0,...,3.0,826,Nerd Street Gamers Winter Championship - Regio...,Group Stage: Decider (A),6903,6020,Booster Seat Gaming,Pho Real,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124619,221,chiwawa,WP,sage,312.0,28.0,14.0,11.0,14.0,,...,0.0,8,Absolute Masters,Group Stage: Group A,83,82,NeverEndingStory,Worst Players,1,2
124620,221,arch,WP,phoenix,255.0,20.0,18.0,9.0,2.0,,...,0.0,8,Absolute Masters,Group Stage: Group A,83,82,NeverEndingStory,Worst Players,1,2
124621,221,TayS,WP,cypher,240.0,23.0,17.0,6.0,6.0,,...,0.0,8,Absolute Masters,Group Stage: Group A,83,82,NeverEndingStory,Worst Players,1,2
124622,221,Duno,WP,sova,163.0,12.0,17.0,10.0,-5.0,,...,0.0,8,Absolute Masters,Group Stage: Group A,83,82,NeverEndingStory,Worst Players,1,2


Kita cek kembali informasi Patch 0.0

In [41]:
# df_final[(df_final['Patch'] == 0.0) & (df_final['Agent'] == 'reyna')] # kok ada reyna-nya

## Export dataset hasil gabungan ke csv

In [42]:
df_final.to_csv('dataset final/final_dataset_patch0_enb.csv', index=False)

Sekarang telah diperoleh dataset baru yang memiliki informasi Agent pick di Patch 0.0

In [43]:
dataset_final = pd.read_csv('dataset final/final_dataset_patch0_enb.csv')
dataset_final['Patch'].value_counts()

Patch
3.0    47073
2.0    43293
1.0    34148
0.0      110
Name: count, dtype: int64

In [46]:
test = pd.read_csv('dataset_test/matches_test.csv')
test['Patch'].value_counts()

Patch
Patch 1.12Minor updates to observing                                                 132
Patch 3.0                                                                             88
Patch 2.05                                                                            63
Patch 2.09                                                                            61
Patch 3.09                                                                            59
Patch 1.10Icebox map beta, QoL improvements                                           59
Patch 1.14Changes to Icebox, Sage, and spectating.                                    58
Patch 2.02Running accuracy changes                                                    58
Patch 3.01                                                                            56
Patch 2.11                                                                            56
Patch 2.01Split overhaul patch                                                        54
Patch 3.08     