In [7]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [8]:
def _hero_slots(side):
    """Return the five hero-pick column names ``<side>_1`` .. ``<side>_5``."""
    return [f'{side}_{i}' for i in range(1, 6)]


def _drop_unused_columns(frame):
    """Return a copy of ``frame`` without the id, organisation and duration columns."""
    unused = ['r_five_players_id', 'd_five_players_id', 'r_org', 'd_org', 'duration']
    unused += [f'{side}_{i}_player_id' for side in ('r', 'd') for i in range(1, 6)]
    # Keyword form on purpose: pandas 2.0 removed the positional ``axis``
    # argument of ``DataFrame.drop``. Not using ``inplace`` also keeps the
    # caller's frame untouched.
    return frame.drop(columns=unused)


def _encode_heroes(frame, hero_columns):
    """Swap the ten hero-pick columns for signed one-hot hero columns.

    For every column in ``hero_columns`` the value is increased by 1 for each
    ``r_*`` slot holding that hero and decreased by 1 for each ``d_*`` slot
    holding it. Heroes that have no column in ``hero_columns`` are ignored.
    The result keeps ``frame``'s index and its remaining columns, followed by
    the hero columns (dtype ``int8``).
    """
    signed = np.zeros((len(frame), len(hero_columns)), dtype='int8')
    for side, sign in (('r', 1), ('d', -1)):
        for slot in _hero_slots(side):
            dummies = pd.get_dummies(frame[slot], prefix='hero')
            # Align to the given vocabulary so a hero without a column cannot
            # raise KeyError and a hero absent from this slot contributes 0.
            dummies = dummies.reindex(columns=hero_columns, fill_value=0)
            signed += sign * dummies.astype('int8').values
    # Reuse the frame's own index so the concat below lines rows up correctly
    # even when the input does not carry a default RangeIndex.
    heroes = pd.DataFrame(signed, index=frame.index, columns=hero_columns)
    pick_columns = _hero_slots('r') + _hero_slots('d')
    return pd.concat([frame.drop(columns=pick_columns), heroes], axis=1)


def get_train_valid(data, add_matches_df):
    """Build model-ready train / validation matrices from two match tables.

    Both frames go through the same steps: id / organisation / duration
    columns are removed, the ``r_1..r_5`` / ``d_1..d_5`` hero picks are
    replaced by signed one-hot ``hero_<id>`` columns (vocabulary taken from
    ``data`` only), ``r_win`` is cast to ``int8`` and every column that is not
    a hero column, a ``winrate`` column or ``r_win`` is min-max scaled with a
    scaler fitted on ``data`` and applied to ``add_matches_df``.

    The input frames are not modified, so the function can be called again
    with the same objects.

    Parameters
    ----------
    data : pandas.DataFrame
        Training matches; defines the hero vocabulary and the scaling ranges.
    add_matches_df : pandas.DataFrame
        Validation matches with the same columns as ``data``.

    Returns
    -------
    x_train, y_train, x_valid, y_valid
        Feature frames (``r_win`` removed) and the ``r_win`` values as numpy
        arrays for the training and the validation frame respectively.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from either frame.
    """
    train = _drop_unused_columns(data)
    valid = _drop_unused_columns(add_matches_df)

    # The hero vocabulary is derived from the training picks only; the
    # validation frame is encoded against the same columns.
    pick_columns = _hero_slots('r') + _hero_slots('d')
    hero_ids = sorted(set(train[pick_columns].values.ravel()))
    hero_columns = [f'hero_{hero}' for hero in hero_ids]

    train = _encode_heroes(train, hero_columns)
    valid = _encode_heroes(valid, hero_columns)

    train['r_win'] = train['r_win'].astype('int8')
    valid['r_win'] = valid['r_win'].astype('int8')

    # NOTE(review): behaviour kept from the original implementation, which
    # dropped from BOTH frames the hero columns obtained by one-hot encoding
    # the validation frame's ``d_5`` slot. This removes a data-dependent subset
    # of the hero features and looks accidental, but changing it would change
    # the feature set the model is trained on -- confirm whether all or none
    # of the hero columns should be dropped.
    seen_in_valid_d5 = set(pd.get_dummies(add_matches_df['d_5'], prefix='hero').columns)
    legacy_dropped = [col for col in hero_columns if col in seen_in_valid_d5]
    train = train.drop(columns=legacy_dropped)
    valid = valid.drop(columns=legacy_dropped)

    # Every hero column is excluded from scaling (the original excluded only
    # the columns of the last encoded slot, so the rest were scaled by mistake).
    winrate_columns = [col for col in train.columns if 'winrate' in col.split('_')]
    keep_as_is = set(hero_columns) | set(winrate_columns) | {'r_win'}
    columns_to_scale = [col for col in train.columns if col not in keep_as_is]

    if columns_to_scale:  # the scaler cannot be fitted on zero columns
        scaler = MinMaxScaler()
        scaled_train = scaler.fit_transform(train[columns_to_scale])
        scaled_valid = scaler.transform(valid[columns_to_scale])
        for position, col in enumerate(columns_to_scale):
            train[col] = scaled_train[:, position]
            valid[col] = scaled_valid[:, position]

    x_train, y_train = train.drop(columns='r_win'), train['r_win'].values
    x_valid, y_valid = valid.drop(columns='r_win'), valid['r_win'].values

    return x_train, y_train, x_valid, y_valid

In [9]:
# Read the pre-split training / validation match tables (paths are relative to
# the notebook's working directory) and turn them into model inputs.
train, valid = (
    pd.read_csv('train_df_five_man_more_30_games.csv'),
    pd.read_csv('valid_df_five_man_more_30_games.csv'),
)
x_train, y_train, x_valid, y_valid = get_train_valid(data=train, add_matches_df=valid)

In [10]:
# Fit a depth-limited random forest on the training matrix; the cell's value is
# the accuracy of its predictions on the validation matches.
forest = RandomForestClassifier(
    n_estimators=400,
    max_depth=8,
    max_features='sqrt',
    n_jobs=-1,
    random_state=17,
)
forest.fit(x_train, y_train)
accuracy_score(y_true=y_valid, y_pred=forest.predict(x_valid))

0.77952755905511806