In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.linear_model import LogisticRegression
import time

from sklearn.svm import SVC
#from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.neural_network import MLPClassifier
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation


# Import data

In [2]:
# Full pre-processed feature set.
lobby_players = pd.read_csv(
    filepath_or_buffer='data/lobby_players_explored_and_preprocessed.csv'
)
# Subset of columns kept by an earlier feature-selection step.
best_features_lobby_players = pd.read_csv(
    filepath_or_buffer='data/best_features_lobby_players.csv'
)
# Principal-component projection of the data.
pca_lobby_players = pd.read_csv(
    filepath_or_buffer='data/pca_raw_data.csv'
)

# Sampling

In [3]:
# Number of rows drawn from every dataset; the fixed random_state keeps each draw reproducible.
n_samples = 100000

sample_lobby_players = lobby_players.sample(n = n_samples, random_state = 42)
sample_beast_features = best_features_lobby_players.sample(n = n_samples, random_state = 42)
sample_pca = pca_lobby_players.sample(n = n_samples, random_state = 42)

# Report population size vs. sample size for each dataset.
print('lobby:', len(lobby_players), 'sample:', len(sample_lobby_players))
print('best features:', len(best_features_lobby_players), 'sample:', len(sample_beast_features))
# The PCA line must report the PCA frame's own row count, not the best-features one.
print('pca:', len(pca_lobby_players), 'sample:', len(sample_pca))

lobby: 180992 sample: 100000
best features: 182117 sample: 100000
pca: 182117 sample: 100000


In [4]:
def create_donut_plot(data, target_variable, title):
    """Show a donut chart of the value counts of one column.

    Args:
        data: DataFrame containing ``target_variable``.
        target_variable: Name of the column whose value frequencies are plotted.
        title: Title displayed above the chart.

    Returns:
        None. The figure is rendered through ``fig.show()``.
    """
    class_counts = data[target_variable].value_counts()

    donut = px.pie(
        class_counts,
        values=class_counts.values,
        names=class_counts.index,
        title=title,
        width=800,
    )
    # Labels go inside the slices; the hole turns the pie into a donut.
    donut.update_traces(textposition='inside', textinfo='percent+label')
    donut.update_traces(hole=0.6, hoverinfo="label+percent+name")
    donut.show()

# Check the class balance of the target column in each of the three samples.
create_donut_plot(
    data=sample_lobby_players,
    target_variable='flWinner',
    title='Sample lobby players',
)
create_donut_plot(
    data=sample_beast_features,
    target_variable='flWinner',
    title='Sample best features from lobby players',
)
create_donut_plot(
    data=sample_pca,
    target_variable='flWinner',
    title='Sample pca from lobby players',
)

# Preparing models

In [5]:
def splitData(df, dropList, target):
    """Split a frame into a feature matrix and a target column.

    Args:
        df: Source DataFrame.
        dropList: Column labels to remove from the feature matrix.
        target: Label of the column returned as the target.

    Returns:
        Tuple ``(features, target_series)``.
    """
    features = df.drop(columns=dropList)
    labels = df[target]
    return features, labels

def mergeDataframes(df1, df2):
    """Concatenate two frames side by side (column-wise) and return the result."""
    return pd.concat([df1, df2], axis='columns')

def separateFeatures(X, catVariables):
    """Partition the columns of ``X`` into categorical and numerical groups.

    Args:
        X: Feature DataFrame.
        catVariables: Labels of the categorical columns (returned unchanged).

    Returns:
        Tuple ``(catVariables, numVariables)`` where ``numVariables`` is the
        column Index of ``X`` after dropping the categorical columns.
    """
    remaining = X.drop(columns=catVariables)
    return catVariables, remaining.columns

def removeUselessColumns(df, columns):
    """Return a copy of ``df`` without the given column labels."""
    return df.drop(columns=columns)


def numHiddenLayers(num_features):
    """Pick the number of hidden layers from the number of input features.

    Returns 1 for up to 10 features, 2 for up to 20 features and 3 otherwise.
    """
    # (inclusive upper bound on the feature count, number of hidden layers)
    thresholds = ((10, 1), (20, 2))
    for upper_bound, depth in thresholds:
        if num_features <= upper_bound:
            return depth
    return 3


def neuronsPerLayer(num_features, num_hidden_layers):
    """Build the list of layer widths for the network.

    The list starts with ``num_features``, continues with
    ``num_features // 2**i`` for ``i`` in ``range(num_hidden_layers)`` (so the
    first of these equals ``num_features`` again) and ends with a single unit.
    """
    layer_sizes = [num_features]
    for depth in range(num_hidden_layers):
        layer_sizes.append(num_features // (2 ** depth))
    layer_sizes.append(1)
    return layer_sizes



def createANN(activation='relu', optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']):
    """Build and compile a feed-forward binary classifier.

    The architecture is derived from the module-level ``num_of_features``
    (set by ``run_models``): its value fixes the input width, the number of
    hidden layers (``numHiddenLayers``) and their sizes (``neuronsPerLayer``).

    Args:
        activation: Activation used by every layer except the output layer.
        optimizer: Optimizer passed to ``model.compile``.
        loss: Loss passed to ``model.compile``.
        metrics: Metrics passed to ``model.compile``.

    Returns:
        The compiled ``Sequential`` model.
    """
    depth = numHiddenLayers(num_of_features)
    first_width, *hidden_widths, output_width = neuronsPerLayer(num_of_features, depth)

    # The first Dense layer also declares the input shape.
    layers = [Dense(units=first_width, activation=activation, input_shape=(num_of_features,))]
    layers.extend(Dense(units=width, activation=activation) for width in hidden_widths)
    # Sigmoid output layer for the binary target.
    layers.append(Dense(units=output_width, activation='sigmoid'))

    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    return model


def create_baseline_models(preprocessor):
    """Create one default-hyperparameter pipeline per classifier.

    Args:
        preprocessor: Transformer placed as the first step of every pipeline
            (the same object is handed to each pipeline).

    Returns:
        Dict mapping a display name to a ``Pipeline`` with the steps
        ``'preprocessor'`` and ``'model'``.
    """
    estimators = {
        'Logistic Regression': LogisticRegression(),
        'Random Forest': RandomForestClassifier(),
        'KNN': KNeighborsClassifier(),
        'XGBoost': XGBClassifier(),
        'Naive Bayes': GaussianNB(),
        'Decision Tree': DecisionTreeClassifier(),
        'Neural Network': KerasClassifier(build_fn=createANN, epochs=10, batch_size=16, verbose=0)
    }

    # Wrap every estimator behind the shared preprocessing step.
    return {
        label: Pipeline([
            ('preprocessor', preprocessor),
            ('model', estimator)
        ])
        for label, estimator in estimators.items()
    }


def create_grid_models(models_pipeline):
    """Wrap each baseline pipeline in a ``GridSearchCV`` over its hyperparameter grid.

    Args:
        models_pipeline: Dict mapping a model display name to a pipeline whose
            estimator step is called ``'model'`` (hence the ``model__`` prefix
            on every grid key). It must contain every name listed in the grids
            below.

    Returns:
        Dict mapping the same display names to unfitted ``GridSearchCV``
        objects (``n_jobs=-1``; scoring and CV folds are left at the
        scikit-learn defaults).

    Raises:
        KeyError: If ``models_pipeline`` lacks one of the model names.
    """
    # NOTE(review): these grids are large; the logged grid run for Logistic
    # Regression alone took about 1700 seconds on the 100k sample, so the
    # tree-ensemble and neural-network grids are presumably much costlier.
    param_grids = {
        'Logistic Regression': {
            'model__C': np.logspace(-4, 4, 20),
            'model__solver': ['liblinear'],
            'model__penalty': ['l1', 'l2'],
        },
        'Random Forest': {
            'model__n_estimators': [1000,1500,2000],
            'model__max_depth': [10, 50, 100],
            'model__min_samples_split': [2, 5, 10],
            'model__min_samples_leaf': [1, 2, 4],
        },
        'KNN': {
            'model__n_neighbors': [10, 15, 20, 25],
            'model__metric': ['euclidean', 'manhattan'],
            'model__weights': ['uniform', 'distance'],
            # 'model__p': [1, 2],
            'model__leaf_size': [20, 30, 50]
        },
        'XGBoost': {
            'model__n_estimators': [500, 1000, 1500],
            'model__max_depth': [3, 4, 5],
            'model__learning_rate': [0.1, 0.01],
            'model__subsample': [0.6, 0.8, 1.0],
        },
        'Naive Bayes': {
            'model__var_smoothing': np.logspace(0, -9, num=100)
        },
        'Decision Tree': {
            'model__max_depth': [3, 5, 10],
            'model__min_samples_leaf': [20, 50, 100],
            'model__criterion': ["gini", "entropy"],
            'model__min_samples_split': [2, 5, 10]
        },
        'Neural Network': {
            'model__activation': ['relu', 'sigmoid'],
            'model__optimizer': ['adam', 'sgd'],
            'model__loss': ['binary_crossentropy'],
            'model__epochs': [10, 50, 100],
            'model__batch_size': [8, 16, 32]
        }
    }

    # The grids already enumerate every model name, so no throw-away estimators
    # need to be instantiated just to obtain the keys.
    return {
        model_name: GridSearchCV(models_pipeline[model_name], grid, n_jobs=-1)
        for model_name, grid in param_grids.items()
    }



def create_models(preprocessor, with_grid):
    """Build the model dictionary, optionally wrapped in grid searches.

    Seeds NumPy's global random generator with 42 before creating the models.

    Args:
        preprocessor: Transformer used as the first step of every pipeline.
        with_grid: When truthy, return ``GridSearchCV`` wrappers instead of
            the plain baseline pipelines.

    Returns:
        Dict mapping model display names to pipelines or grid searches.
    """
    np.random.seed(42)

    pipelines = create_baseline_models(preprocessor)
    return create_grid_models(pipelines) if with_grid else pipelines


def preprocessing_data(cat_features, num_features, scaler, encoder):
    """Assemble the column-wise preprocessing step.

    Args:
        cat_features: Columns routed through ``encoder``.
        num_features: Columns routed through ``scaler``.
        scaler: Transformer applied to the numerical columns.
        encoder: Transformer applied to the categorical columns.

    Returns:
        An unfitted ``ColumnTransformer`` with a ``'numbers'`` branch and a
        ``'categories'`` branch.
    """
    # Handling of numerical variables: scaling only.
    numeric_branch = ('numbers', Pipeline(steps=[('scaler', scaler)]), num_features)

    # Handling of categorical variables: encoding only.
    categorical_branch = ('categories', Pipeline(steps=[('encoder', encoder)]), cat_features)

    # Combine both branches into the data processor and return it.
    return ColumnTransformer(transformers=[numeric_branch, categorical_branch])

def format_metrics(scores):
    """Summarise scores as the string ``'<mean> (<std>)'``, both rounded to 2 decimals."""
    mean_text = str(round(np.mean(scores), 2))
    spread_text = str(round(np.std(scores), 2))
    return mean_text + ' (' + spread_text + ')'

def extract_metrics(dictionary):
    """Flatten per-dataset metric tables into one frame indexed by (Data, Model).

    Args:
        dictionary: Dict mapping a dataset name to a DataFrame whose columns
            are model names and whose index holds metric names. Leading and
            trailing whitespace is stripped from both.

    Returns:
        DataFrame with a ``('Data', 'Model')`` MultiIndex and one column per
        metric. An empty input yields an empty frame carrying the same index
        names.
    """
    frames = []

    for sample_name, sample_data in dictionary.items():
        # Remove leading/trailing whitespace from column (model) names.
        sample_data = sample_data.rename(columns=lambda x: x.strip())

        # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
        per_model = {
            model_name: {
                metric_name.strip(): metric_value
                for metric_name, metric_value in metrics.items()
            }
            for model_name, metrics in sample_data.items()
        }

        # One row per model, one column per metric, tagged with the dataset name.
        sample_df = pd.DataFrame.from_dict(per_model, orient='index')
        sample_df.insert(0, 'Data', sample_name)
        frames.append(sample_df)

    if not frames:
        # Nothing to concatenate: return an empty frame with the expected index.
        empty_index = pd.MultiIndex.from_arrays([[], []], names=['Data', 'Model'])
        return pd.DataFrame(index=empty_index)

    # Concatenate once instead of growing a frame inside the loop.
    df = pd.concat(frames, axis=0)
    df = df.reset_index().rename(columns={'index': 'Model'})
    df = df.set_index(['Data', 'Model'])

    return df



def train_models(X, y, models, cv, with_grid):
    """Cross-validate every model and persist its summary metrics to CSV.

    For each model the accuracy, precision, recall and F1 test scores from
    ``cross_validate`` are summarised with ``format_metrics`` and written to
    ``<output_dir>/<dataName>_<model_name>.csv``, where ``output_dir`` is
    ``grid_100k_samples`` or ``no_grid_100k_samples`` and ``dataName`` is a
    module-level variable set by ``data_pipeline``.

    Args:
        X: Feature matrix.
        y: Target vector.
        models: Dict mapping model display names to estimators; when
            ``with_grid`` is true they must be ``GridSearchCV`` objects.
        cv: Cross-validation splitter passed to ``cross_validate``.
        with_grid: When true, each grid search is first fitted on ``(X, y)``
            and its refit best estimator is the one cross-validated.

    Returns:
        Dict mapping each model name to its dict of formatted metrics.
    """
    import os

    scoring = ('accuracy', 'precision', 'recall', 'f1')
    output_dir = 'grid_100k_samples' if with_grid else 'no_grid_100k_samples'
    # Make sure the target folder exists so to_csv cannot fail after a long fit.
    os.makedirs(output_dir, exist_ok=True)

    models_scores = {}

    for model_name, model in models.items():
        start_time = time.time()

        if with_grid:
            model.fit(X, y)
            print(f"{model_name}'s best params are: {model.best_params_}")

            # best_estimator_ already carries the best parameters, so no
            # additional set_params call is required.
            # NOTE(review): the parameters are selected on the same (X, y)
            # that is cross-validated below, so the scores are presumably
            # optimistic; consider nested cross-validation.
            model = model.best_estimator_

        scores = cross_validate(model, X, y, cv=cv, scoring=scoring)
        cross_metrics = {
            metric: format_metrics(scores[f'test_{metric}']) for metric in scoring
        }

        # Single-row frame: one column per metric.
        df = pd.DataFrame(cross_metrics, index=[0])
        file_path = f"{output_dir}/{dataName}_{model_name}.csv"
        df.to_csv(file_path, index=False)

        models_scores[model_name] = cross_metrics

        end_time = time.time()
        print(f'{model_name}: {round(end_time - start_time, 2)} seconds')

    return models_scores


def run_models(X, y, cat_features, num_features, scaler, encoder, with_grid=False):
    """Build, cross-validate and score every model on one dataset.

    Sets the module-level ``num_of_features`` (read by ``createANN``) to the
    number of categorical plus numerical columns, then evaluates all models
    with a 10-fold ``StratifiedKFold``.

    Args:
        X: Feature matrix.
        y: Target vector.
        cat_features: Categorical column labels.
        num_features: Numerical column labels.
        scaler: Transformer for the numerical columns.
        encoder: Transformer for the categorical columns.
        with_grid: Whether to run a grid search for every model.

    Returns:
        Tuple ``(accuracy_values, scores_frame, model_names)`` where
        ``scores_frame`` has metrics as rows and models as columns.
    """
    # NOTE(review): this counts raw input columns; if the encoder expands
    # categorical columns, the network input width would presumably no longer
    # match - confirm before passing a non-empty cat_features.
    global num_of_features
    num_of_features = len(cat_features) + len(num_features)

    models = create_models(
        preprocessing_data(
            cat_features=cat_features,
            num_features=num_features,
            scaler=scaler,
            encoder=encoder,
        ),
        with_grid,
    )

    models_scores = pd.DataFrame(
        train_models(X, y, models, StratifiedKFold(n_splits=10), with_grid = with_grid)
    )

    return models_scores.loc['accuracy'].values, models_scores, models.keys()


def data_pipeline(data_dict, with_grid = False):
    """Run the full modelling pipeline on every dataset in ``data_dict``.

    Each dataset is split into features and the ``'flWinner'`` target, all
    columns are treated as numerical (scaled with ``StandardScaler``), and the
    models are evaluated through ``run_models``. The module-level ``dataName``
    is set to the current dataset name so ``train_models`` can use it in its
    output file names.

    Args:
        data_dict: Dict mapping a dataset name to a DataFrame that contains a
            ``'flWinner'`` column.
        with_grid: Whether to run a grid search for every model.

    Returns:
        Tuple ``(models_acc, models_metrics)``: a DataFrame of formatted
        accuracies (models as rows, datasets as columns) and a dict mapping
        each dataset name to its full metrics frame. An empty ``data_dict``
        yields an empty DataFrame and an empty dict.
    """
    global dataName

    models_acc = {}
    models_metrics = {}
    # Initialised up front so an empty data_dict does not leave the name unbound.
    models_names = []

    for data_name, data in data_dict.items():
        print(f'---{data_name} started---', end='\n\n')
        start_time = time.time()
        dataName = data_name

        X, y = splitData(data, ['flWinner'], 'flWinner')
        # No categorical columns are declared, so every feature is scaled.
        cat_features, num_features = separateFeatures(X, [])

        models_acc[data_name], models_metrics[data_name], models_names = run_models(
            X, y, cat_features, num_features, StandardScaler(), OneHotEncoder(handle_unknown='ignore'), with_grid = with_grid
        )

        end_time = time.time()
        print(f'\n---{data_name}: completed in {round(end_time - start_time, 2)} seconds---', end='\n\n')

    # list() turns the dict_keys view returned by run_models into a plain index.
    models_acc = pd.DataFrame(models_acc, index=list(models_names))
    return models_acc, models_metrics

In [6]:
# Display the column names of the full-feature sample (the last expression is rendered by the notebook).
sample_lobby_players.columns

Index(['qtKill', 'qtAssist', 'qtDeath', 'qtHs', 'qtBombeDefuse',
       'qtBombePlant', 'qtTk', 'qtTkAssist', 'qt1Kill', 'qt2Kill', 'qt3Kill',
       'qt4Kill', 'qt5Kill', 'qtFirstKill', 'vlDamage', 'qtHits', 'qtShots',
       'qtLastAlive', 'qtClutchWon', 'qtRoundsPlayed', 'vlLevel', 'qtSurvived',
       'qtTrade', 'qtFlashAssist', 'qtHitHeadshot', 'qtHitChest',
       'qtHitStomach', 'qtHitLeftAtm', 'qtHitRightArm', 'qtHitLeftLeg',
       'qtHitRightLeg', 'kpr', 'dpr', 'kd', 'diff', 'descMapName_de_ancient',
       'descMapName_de_dust2', 'descMapName_de_inferno',
       'descMapName_de_mirage', 'descMapName_de_nuke',
       'descMapName_de_overpass', 'descMapName_de_train',
       'descMapName_de_vertigo', 'flWinner'],
      dtype='object')

In [7]:
# Display the column names of the PCA sample.
sample_pca.columns

Index(['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'flWinner'], dtype='object')

In [8]:
# Display the column names of the best-features sample.
sample_beast_features.columns

Index(['dpr', 'qtSurvived', 'kd', 'diff', 'qtDeath', 'kpr', 'qtKill',
       'vlDamage', 'qt2Kill', 'qtShots', 'flWinner'],
      dtype='object')

# Test models

In [9]:
# Map a display label to each sampled dataset. data_pipeline iterates this dict and
# uses the labels in its progress messages and in the CSV file names it writes.
data = {
    'lobby stats': sample_lobby_players,
    'best features lobby': sample_beast_features,
    'pca lobby': sample_pca
}

# Baseline run: cross-validate every model with default hyperparameters (no grid search).
no_grid_accuracy, no_grid_all_metrics = data_pipeline(data, with_grid=False)
# Show the accuracy table (models as rows, datasets as columns).
no_grid_accuracy

---lobby stats started---




lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to th

Logistic Regression: 22.58 seconds
Random Forest: 290.22 seconds
KNN: 30.48 seconds
XGBoost: 82.17 seconds
Naive Bayes: 3.57 seconds
Decision Tree: 30.63 seconds
Neural Network: 460.67 seconds

---lobby stats: completed in 920.35 seconds---

---best features lobby started---

Logistic Regression: 3.74 seconds
Random Forest: 244.22 seconds
KNN: 20.09 seconds
XGBoost: 46.96 seconds
Naive Bayes: 1.11 seconds
Decision Tree: 10.06 seconds
Neural Network: 411.11 seconds

---best features lobby: completed in 737.31 seconds---

---pca lobby started---

Logistic Regression: 1.01 seconds
Random Forest: 481.13 seconds
KNN: 9.79 seconds
XGBoost: 87.4 seconds
Naive Bayes: 0.83 seconds
Decision Tree: 12.47 seconds
Neural Network: 392.55 seconds

---pca lobby: completed in 985.18 seconds---



Unnamed: 0,lobby stats,best features lobby,pca lobby
Logistic Regression,0.8 (0.0),0.77 (0.0),0.75 (0.01)
Random Forest,0.79 (0.01),0.75 (0.0),0.74 (0.0)
KNN,0.74 (0.0),0.74 (0.0),0.73 (0.0)
XGBoost,0.8 (0.0),0.77 (0.0),0.75 (0.0)
Naive Bayes,0.71 (0.01),0.73 (0.0),0.73 (0.01)
Decision Tree,0.73 (0.0),0.7 (0.0),0.68 (0.01)
Neural Network,0.8 (0.0),0.77 (0.0),0.75 (0.0)


In [10]:
# Flatten the per-dataset metric tables into one frame indexed by (Data, Model) and display it.
no_grid_metrics_df = extract_metrics(no_grid_all_metrics)
no_grid_metrics_df

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1
Data,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
lobby stats,Logistic Regression,0.8 (0.0),0.8 (0.0),0.79 (0.01),0.8 (0.0)
lobby stats,Random Forest,0.79 (0.01),0.8 (0.01),0.76 (0.01),0.78 (0.01)
lobby stats,KNN,0.74 (0.0),0.74 (0.0),0.73 (0.01),0.74 (0.0)
lobby stats,XGBoost,0.8 (0.0),0.8 (0.0),0.78 (0.01),0.79 (0.0)
lobby stats,Naive Bayes,0.71 (0.01),0.72 (0.0),0.69 (0.01),0.7 (0.01)
lobby stats,Decision Tree,0.73 (0.0),0.72 (0.0),0.73 (0.0),0.72 (0.0)
lobby stats,Neural Network,0.8 (0.0),0.81 (0.01),0.78 (0.02),0.79 (0.0)
best features lobby,Logistic Regression,0.77 (0.0),0.77 (0.0),0.76 (0.01),0.76 (0.0)
best features lobby,Random Forest,0.75 (0.0),0.76 (0.0),0.73 (0.01),0.74 (0.0)
best features lobby,KNN,0.74 (0.0),0.74 (0.0),0.73 (0.0),0.74 (0.0)


In [11]:
# Grid-search run: tune every model with GridSearchCV, then cross-validate the best estimator.
# NOTE(review): the saved output below stops after Logistic Regression (about 1700 s),
# so this cell was presumably interrupted before completing - confirm.
grid_accuracy, grid_all_metrics = data_pipeline(data, with_grid=True)
grid_accuracy

---lobby stats started---



Logistic Regression's best params are: {'model__C': 1.623776739188721, 'model__penalty': 'l2', 'model__solver': 'liblinear'}
Logistic Regression: 1701.08 seconds


In [None]:
# Flatten the grid-search metric tables into one frame indexed by (Data, Model) and display it.
# NOTE(review): this cell has no execution count and its saved output lists only
# Logistic Regression; it presumably comes from an earlier, partial run - confirm
# before relying on those numbers.
grid_metrics_df = extract_metrics(grid_all_metrics)
grid_metrics_df

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,precision,recall
Data,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
lobby stats,Logistic Regression,0.78 (0.02),0.78 (0.03),0.77 (0.03),0.79 (0.06)
best features lobby,Logistic Regression,0.77 (0.04),0.76 (0.05),0.76 (0.05),0.77 (0.07)
pca lobby,Logistic Regression,0.77 (0.04),0.76 (0.05),0.77 (0.04),0.76 (0.07)
