In [18]:
from data import Instance, DataSet
from miscellaneous import initialize_data, plot_graph, plot_points
import matplotlib.pyplot as plt
import pandas as pd
import copy
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
import seaborn as sns
import warnings
import itertools
import time
from sklearn.neighbors import KNeighborsRegressor as knn
from sklearn.metrics import mean_squared_log_error
# No category or module filter is passed, so every warning is silenced from here on.
warnings.filterwarnings('ignore')

In [19]:
# Reference timestamp; the total runtime is reported at the end of this cell.
start_time = time.time()

# Load the raw CSV tables.
print('Carregando dados ...', end = '')
(champ_stats, champs, mtime,
 participants, stats1, stats2) = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../dataset/champ_stats.csv',
        '../dataset/champs.csv',
        '../dataset/matches_MODIFIED.csv',
        '../dataset/participants.csv',
        '../dataset/stats1_MODIFIED.csv',
        '../dataset/stats2_MODIFIED.csv',
    )
]
# The statistics come in two files; stack them into a single frame.
stats = pd.concat([stats1, stats2])
print('.' * 28 + ' OK')

# Lower-case every column label of the champion statistics table.
print('Ajustando nomes[1] ...', end = '')
champ_stats = champ_stats.rename(columns = str.lower)
print('.' * 26 + ' OK')

# Lower-case every column label of the champions table.
print('Ajustando nomes[2] ...', end = '')
champs = champs.rename(columns = str.lower)
print('.' * 26 + ' OK')

# Drop champion_stats rows that have no counterpart in `champs`.
print('Removendo instâncias desnecessárias ...', end = '')
# Strip apostrophes and ". " sequences so the names can be matched against
# champs['name'] below. Both patterns are plain text, so `regex = False` is
# passed explicitly: the default of `regex` changed between pandas versions,
# and relying on it makes the '. ' removal silently stop matching.
champ_stats['name'] = (
    champ_stats['name']
    .str.replace('\'', '', regex = False)
    .str.replace('. ', '', regex = False)
)
# Keep only the champions that also appear in `champs`.
champ_stats = champ_stats[champ_stats['name'].isin(champs['name'])].reset_index(drop = True)
champs = champs.sort_values('name').reset_index(drop = True)
print( 9*'.'+' OK')

# Reduce dimensionality: fold every '<stat>+' column into its base column.
print('Reduzindo dimensionalidade ...', end = '')
params = ['hp', 'hp5', 'mp', 'mp5', 'ad', 'ar', 'mr']
growth_cols = [base + '+' for base in params]
# Additive stats: base value plus ten times the '+' column
# (presumably the per-level growth -- TODO confirm against the data source).
for base, growth in zip(params, growth_cols):
    champ_stats[base] = champ_stats[base] + 10*champ_stats[growth]

# 'as' is compounded multiplicatively over the same ten steps instead.
champ_stats['as'] = champ_stats['as'] * ((1 + champ_stats['as+'])**10)

# The '+' columns are no longer needed once folded in.
champ_stats = champ_stats.drop(columns = growth_cols + ['as+'])
print('.' * 18 + ' OK')

# Remove attributes that are not used by the rest of the pipeline.
# 'id' is kept because it becomes the join key right below.
print('Removendo atributos desnecessários ...', end = '')
participants = participants.drop(columns = ['player', 'ss1', 'ss2'])
stats = stats.drop(columns = ['trinket', 'assists', 'firstblood'])
print('.' * 10 + ' OK')

# Join participants with their statistics on the shared 'id' index.
print('Juntando tabelas ...', end = '')
participants = participants.set_index('id')
stats = stats.set_index('id')
dataset = participants.copy().join(stats.copy())
print('.' * 15 + ' OK')

# One-hot encode 'role' and 'position', then drop the original columns.
print('One-hot encoding para role e position ...', end = '')
role_dummies = pd.get_dummies(dataset['role'], prefix = 'role')
pos_dummies = pd.get_dummies(dataset['position'], prefix = 'pos')
# Role indicators first, then position indicators; column order matters
# because later cells slice the columns by position.
dataset = pd.concat([dataset, role_dummies, pos_dummies], axis = 1)
dataset = dataset.drop(['role', 'position'], axis = 1)
# Guarantee the 'role_DUO' column exists even when no row has that role.
if 'role_DUO' not in dataset.columns:
    dataset['role_DUO'] = 0
print( 7*'.'+' OK        ')

# Replace champion IDs with champion names.
print('Substituindo IDs por nomes dos champions ...', end = '')
id_to_name = champs.set_index('id')['name']
dataset['championid'] = dataset['championid'].replace(id_to_name)
print( 4*'.'+' OK       ')

# Attach the champion attributes: each new column starts from the champion
# name and has the name replaced by that champion's value for the attribute.
# NOTE(review): Series.replace with a large mapping is slow; Series.map would
# likely be faster but turns unknown names into NaN -- confirm before switching.
print('Juntando tabelas ...', end = '')
params = ['hp', 'hp5', 'mp', 'mp5', 'ad', 'ar', 'as', 'mr', 'ms', 'range']
stats_by_name = champ_stats.set_index('name')
for param in params:
    dataset[param] = dataset['championid'].replace(stats_by_name[param])
# From here on rows are addressed by position, not by participant id.
dataset = dataset.reset_index(drop = True)
print('.' * 28 + ' OK')

# Min-max scale the champion attributes to the [0, 1] range, one column at a time.
# NOTE(review): the progress label says 'duração' although this step scales the
# champion attributes; it is left as-is so the logged output does not change.
# NOTE(review): 'as' is not in the list below -- confirm whether that is intentional.
print('Normalizando duração ...', end = '')
min_max_s = MinMaxScaler(feature_range = (0.0, 1.0))
scaled_cols = ['hp', 'hp5', 'mp', 'mp5', 'ad', 'ar', 'mr', 'ms', 'range']
for col in scaled_cols:
    # The scaler expects a 2-D array, hence the single-column reshape.
    column_2d = dataset[col].values.reshape(-1, 1)
    dataset[col] = min_max_s.fit_transform(column_2d)
print('.' * 20 + ' OK')

# The champion name has served its purpose; drop it.
print('Removendo o nome dos champios ...', end = '')
dataset = dataset.drop(columns = ['championid'])
print( 15*'.'+' OK      ')

# Rebuild the table so that ten consecutive participant rows become one row.
# Every 10th row starting at `offset` is taken and its columns are suffixed
# with '<i><j>', where i = offset // 5 and j = offset % 5.
# NOTE(review): this assumes rows come in groups of ten per match, five per team -- confirm.
print('Reconstruindo linhas apropriadamente ...', end = '')
columnNames = dataset.columns.values.tolist()
tmp = pd.DataFrame()
for offset in range(10):
    i, j = divmod(offset, 5)
    ij = str(i)+str(j)
    p = (
        dataset[offset::10]
        .rename(columns = {name: name+ij for name in columnNames})
        .reset_index(drop = True)
    )
    # First slice seeds the result; the others are joined side by side on
    # the fresh 0..n-1 index.
    tmp = p if tmp.empty else tmp.join(p)
dataset = tmp
print('.' * 8 + ' OK')

# Collapse the per-player outcome columns into per-team columns.
print('Removendo dados duplicados ...', end = '')
# dropna() returns a new frame, so the result must be assigned for rows with
# missing values to actually be removed. The index is deliberately NOT reset:
# the join with `mtime` further down aligns on it.
dataset = dataset.dropna()
dataset['matchid'] = dataset['matchid00']
# The first player of each team supplies the team-level 'win' value.
dataset['win0'] = dataset['win00']
dataset['win1'] = dataset['win10']
# Team totals are the sum over the team's five players.
for stat in ('kills', 'deaths'):
    for i in range(2):
        dataset[stat+str(i)] = sum(dataset[stat+str(i)+str(j)] for j in range(5))
# A team with zero deaths produces inf (or NaN for 0/0) here.
dataset['kd_ratio0'] = dataset['kills0']/dataset['deaths0']
dataset['kd_ratio1'] = dataset['kills1']/dataset['deaths1']

# Drop the per-player columns now summarised per team, plus the raw team totals.
per_player_cols = [
    prefix+str(i)+str(j)
    for i in range(2)
    for j in range(5)
    for prefix in ('win', 'kills', 'deaths', 'matchid')
]
per_team_cols = ['deaths0', 'kills0', 'deaths1', 'kills1']
dataset = dataset.drop(columns = per_player_cols + per_team_cols)

# NOTE(review): this join aligns on the row index, so it assumes `mtime` lists
# the matches in the same order as `dataset` -- confirm against the data.
dataset = dataset.join(mtime)
dataset = dataset.drop(columns = ['id', 'matchid'])
print( 8*'.'+' OK')

# Min-max scale the match duration to the [0, 1] range.
print('Normalizando duração ...', end = '')
min_max_s = MinMaxScaler(feature_range = (0.0, 1.0))
# The scaler expects a 2-D array, hence the single-column reshape.
duration_2d = dataset['duration'].values.reshape(-1, 1)
dataset['duration'] = min_max_s.fit_transform(duration_2d)
print('.' * 20 + ' OK')

# Compute the difficulty scores.
print('Calculando dificuldade ...', end = '')
# kd0 flags team 0's kill/death ratio: 1.0 where it is above 1 (a ratio of
# exactly 1.0 stays 1.0), 0.0 where it is below 1. kd1 is the complement.
kd0 = dataset['kd_ratio0'].copy()
kd0.loc[kd0 > 1.0] = 1.0
kd0.loc[kd0 < 1.0] = 0.0
kd1 = 1.0 - kd0
win0 = dataset['win0']
win1 = dataset['win1']

# Shorter matches give a larger duration term.
durationTerm = 1.0 - dataset['duration']

# Picks kd_ratio1 where kd0 is 1 and kd_ratio0 where kd0 is 0.
picked_ratio = dataset['kd_ratio0'] * kd1 + dataset['kd_ratio1'] * kd0
half_ratio = 0.5 * picked_ratio
flagged_team_won = win0 * kd0 + win1 * kd1    # team with the kd flag set also has win set
unflagged_team_won = win0 * kd1 + win1 * kd0  # the other team has win set
# 0.5 plus half the picked ratio when the flagged team won, minus it otherwise.
killTerm = 0.5 + half_ratio * flagged_team_won - half_ratio * unflagged_team_won

# Weighted blend of both terms, then mirrored around 0.5 for each team.
dataset['dificult'] = durationTerm * 0.4 + killTerm * 0.6
dataset['dificult0'] = 0.5 + 0.5*( dataset['dificult'] * dataset['win1'] - dataset['dificult'] * dataset['win0']  )
dataset['dificult1'] = 0.5 + 0.5*( dataset['dificult'] * dataset['win0'] - dataset['dificult'] * dataset['win1']  )
dataset = dataset.drop(columns = ['win0', 'win1', 'dificult', 'duration'])
print('.' * 20 + ' OK')
        
# Report the wall-clock time spent since `start_time` as MM:SS.
elapsed_time = time.time() - start_time
print('Tempo total ' + 33*'.' + ' ' + time.strftime("%M:%S", time.gmtime(elapsed_time)))

# Finally drop, for every player slot, the 'hp5', 'mr' and 'role_NONE' columns.
pruned_cols = [
    prefix+str(i)+str(j)
    for i in range(2)
    for j in range(5)
    for prefix in ('hp5', 'mr', 'role_NONE')
]
dataset = dataset.drop(columns = pruned_cols)

Carregando dados ............................... OK
Ajustando nomes[1] ............................. OK
Ajustando nomes[2] ............................. OK
Removendo instâncias desnecessárias ............ OK
Reduzindo dimensionalidade ..................... OK
Removendo atributos desnecessários ............. OK
Juntando tabelas .................. OK
One-hot encoding para role e position .......... OK        
Substituindo IDs por nomes dos champions ....... OK       
Juntando tabelas ............................... OK
Normalizando duração ....................... OK
Removendo o nome dos champios .................. OK      
Reconstruindo linhas apropriadamente ........... OK
Removendo dados duplicados ........... OK
Normalizando duração ....................... OK
Calculando dificuldade ....................... OK
Tempo total ................................. 04:24


In [20]:
# Work on a NaN-free copy so `dataset` itself stays untouched.
new_io = dataset.copy().dropna()
params = new_io.columns.values.tolist()

# Columns 0-79 are selected for rows with dificult0 < 0.25 and columns 80-159
# for rows with dificult1 < 0.25.
team0win = new_io[new_io['dificult0'] < 0.25].filter(params[0:80])
team1win = new_io[new_io['dificult1'] < 0.25].filter(params[80:160])

# Within each 80-column slice, the first 64 columns are the input and the
# remaining 16 the expected output.
finalDataset = DataSet()
for winners, first_col in ((team0win, 0), (team1win, 80)):
    x = winners.filter(params[first_col:first_col + 64], axis = 1)
    y = winners.filter(params[first_col + 64:first_col + 80], axis = 1)
    for data_x, data_y in zip(x.values.tolist(), y.values.tolist()):
        finalDataset.add(Instance(data_x, data_y))

# Sanity check: report how many missing values are left, if any.
if new_io.isnull().values.any():
    print(new_io.isnull().sum().sum())
else:
    print('Tudo OK')

Tudo OK


In [23]:
# Gather the inputs and expected outputs of every instance.
X = []
y = []

for inst in finalDataset.data():
    X.append(inst.input)
    y.append(inst.expected_output)

# The first 80% of the instances train the model, the remainder evaluates it.
# NOTE(review): nothing is shuffled; if DataSet.data() preserves insertion
# order, the hold-out tail holds only the instances added last -- confirm
# that this split is intended.
split = int(finalDataset.size()*0.8)

# 3 neighbours, distance-weighted.
neigh = knn(n_neighbors = 3, weights = 'distance', algorithm = 'kd_tree', n_jobs = 2)
neigh.fit(X[:split], y[:split])

# scikit-learn metrics take (y_true, y_pred) in that order.
print(mean_squared_log_error(y[split:], neigh.predict(X[split:])))

0.02968412978321977


In [24]:
# 5 neighbours, distance-weighted; first 80% of the instances train, the rest evaluates.
split = int(finalDataset.size()*0.8)
neigh = knn(n_neighbors = 5, weights = 'distance', algorithm = 'kd_tree', n_jobs = 2)
neigh.fit(X[:split], y[:split])

# scikit-learn metrics take (y_true, y_pred) in that order.
print(mean_squared_log_error(y[split:], neigh.predict(X[split:])))

0.027715326192116495


In [25]:
# 7 neighbours, distance-weighted; first 80% of the instances train, the rest evaluates.
split = int(finalDataset.size()*0.8)
neigh = knn(n_neighbors = 7, weights = 'distance', algorithm = 'kd_tree', n_jobs = 2)
neigh.fit(X[:split], y[:split])

# scikit-learn metrics take (y_true, y_pred) in that order.
print(mean_squared_log_error(y[split:], neigh.predict(X[split:])))

0.026879149747582327


In [26]:
# 7 neighbours, uniform weights; first 80% of the instances train, the rest evaluates.
split = int(finalDataset.size()*0.8)
neigh = knn(n_neighbors = 7, weights = 'uniform', algorithm = 'kd_tree', n_jobs = 2)
neigh.fit(X[:split], y[:split])

# scikit-learn metrics take (y_true, y_pred) in that order.
print(mean_squared_log_error(y[split:], neigh.predict(X[split:])))

0.026805896183037414


In [29]:
# 101 neighbours, distance-weighted; first 80% of the instances train, the rest evaluates.
split = int(finalDataset.size()*0.8)
neigh = knn(n_neighbors = 101, weights = 'distance', algorithm = 'kd_tree', n_jobs = 2)
neigh.fit(X[:split], y[:split])

# scikit-learn metrics take (y_true, y_pred) in that order.
print(mean_squared_log_error(y[split:], neigh.predict(X[split:])))

0.02639638293407264
