In [1]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random

In [None]:
# Load the processed match export and apply the temporary clean-up in one chain.
# (Per the original note, the clean-up becomes useless once the new data is loaded.)
df = (
    pd.read_csv('combined_processed.csv')
    .dropna()  # discard rows with any missing value
    # keep only games in which neither player's colour is 'white'
    .loc[lambda games: (games['player_one_colour'] != 'white')
                       & (games['player_two_colour'] != 'white')]
    .reset_index(drop=True)
)

# Convert the score and the turn count to integers, value by value.
df = df.assign(
    score_player_one=df['score_player_one'].map(int),
    turns_played=df['turns_played'].map(int),
)

In [None]:
def get_df_for_pair(df_original: pd.DataFrame, town1: str, town2: str):
    '''Return the games of town1 (optionally only against town2), seen from town1's side.

    Plug in town2 as "all" to get statistics for town1 against all towns (except itself).

    Mirror games (town1 on both sides) are excluded. In every returned row town1 is
    player one: rows where it was originally player two are flipped, which means
      * town, hero and colour of the two players are exchanged (only for the
        column pairs that exist in the frame),
      * bidding_amount is negated,
      * score_player_one becomes 1 - score (if that column exists), so that a
        score of 1 always means "town1 won".

    The input frame is not modified; a new frame with a fresh 0..n-1 index is returned.
    '''

    df = df_original.copy()

    # Keep games where town1 appears on exactly one side (logical xor).
    town_on_left = df['player_one_town'] == town1
    town_on_right = df['player_two_town'] == town1
    df = df[town_on_left ^ town_on_right].reset_index(drop=True)

    # Rows in which town1 still sits on the right have to be flipped.
    keep_as_is = df['player_two_town'] != town1

    # Exchange the per-player attributes. Both new columns are computed before
    # either one is written back, so the swap does not read its own output.
    for attribute in ('town', 'hero', 'colour'):
        left, right = 'player_one_' + attribute, 'player_two_' + attribute
        if left in df.columns and right in df.columns:
            new_left = df[left].where(keep_as_is, df[right])
            new_right = df[right].where(keep_as_is, df[left])
            df[left], df[right] = new_left, new_right

    # The bid is expressed relative to player one, so it changes sign on a flip.
    df['bidding_amount'] = df['bidding_amount'].where(keep_as_is, -df['bidding_amount'])

    # The result must follow the flip too, otherwise a win of the opponent
    # would be counted as a win of town1.
    if 'score_player_one' in df.columns:
        df['score_player_one'] = df['score_player_one'].where(keep_as_is, 1 - df['score_player_one'])

    # Deals with town 2
    if town2 != 'all':
        df = df[df['player_two_town'] == town2]
        df = df.reset_index(drop=True)

    return df

In [None]:
# Every non-mirror castle game, oriented so that castle is always player one.
df2 = get_df_for_pair(df_original=df, town1='castle', town2='all')

In [None]:
def get_optimal_split(df):
    '''Return the bid halfway between the average winning and the average losing bid.

    Plug in a town vs. town dataframe: it needs the columns 'score_player_one'
    (1 = win, 0 = loss; other values are ignored) and 'bidding_amount'.

    The centre of a single cluster is simply the mean of its points, so the means
    are computed directly instead of fitting a one-cluster KMeans model.

    The two centres are printed (as 1x1 arrays, the layout of KMeans'
    cluster_centers_) and their midpoint is returned as an int, truncated
    towards zero.

    Raises ValueError if there are no won games, no lost games, or a missing bid
    among them.
    '''

    won_bids = df.loc[df['score_player_one'] == 1.0, 'bidding_amount'].to_numpy(dtype=float)
    lost_bids = df.loc[df['score_player_one'] == 0.0, 'bidding_amount'].to_numpy(dtype=float)

    if won_bids.size == 0 or lost_bids.size == 0:
        raise ValueError('get_optimal_split needs at least one won and one lost game')
    if np.isnan(won_bids).any() or np.isnan(lost_bids).any():
        raise ValueError('get_optimal_split cannot handle missing bidding amounts')

    centre_won = won_bids.mean()
    centre_lost = lost_bids.mean()

    print(np.array([[centre_won]]))
    print(np.array([[centre_lost]]))

    return int((centre_won + centre_lost) / 2)

In [None]:
# Midpoint between the average bid of won and of lost games in the castle frame.
optimal_trade_value = get_optimal_split(df=df2)

In [None]:
# Derive the two quarter columns from 'created_at' in one vectorised pass.
# (Filling int-initialised columns row by row with datetime objects is slow and
# relies on pandas silently changing the column dtype.)
BASE_YEAR = 2021  # quarters are counted from this year: Q1 of BASE_YEAR -> 1

# Strict parsing with the export's format.
# Alternative format seen in other exports: '%d/%m/%Y %H:%M'
created_at = pd.to_datetime(df['created_at'], format='%Y-%m-%d %H:%M:%S')
quarter_number = created_at.dt.quarter  # 1..4, identical to (month - 1) // 3 + 1

# 'year-quarter' keeps the original encoding: a timestamp whose year is the year
# and whose *month* is the quarter number (day fixed to 1).
df['year-quarter'] = pd.to_datetime(
    {'year': created_at.dt.year, 'month': quarter_number, 'day': 1}
)
df['quarters_passed'] = (quarter_number + (created_at.dt.year - BASE_YEAR) * 4).astype(int)

In [None]:
# Display the frame to check the 'year-quarter' and 'quarters_passed' columns.
df

In [None]:
def get_regression_predictions(df, quarters=5) -> pd.DataFrame:
    '''Extrapolate the per-quarter bid median and 20% / 80% quantiles with linear fits.

    "quarters" determines how many quarters into the future we would like to see.

    df needs the columns 'quarters_passed' and 'bidding_amount'. The observed
    statistics are computed per value of 'quarters_passed'; X and y both come from
    the same grouped result, so they stay aligned even when the rows of df are not
    in chronological order.

    Returns a DataFrame with the columns 'quarters_passed', 'median',
    'bottom_quantile' and 'top_quantile': one row per observed quarter (ascending)
    followed by one row per predicted quarter. Predicted values are rounded to the
    nearest integer. With quarters=0 only the observed rows are returned.
    '''

    bids_by_quarter = df.groupby('quarters_passed')['bidding_amount']

    # One row per observed quarter, sorted by quarter (groupby sorts its keys).
    observed = pd.DataFrame({'median': bids_by_quarter.median(),
                             'bottom_quantile': bids_by_quarter.quantile(0.2),
                             'top_quantile': bids_by_quarter.quantile(0.8)})

    # Independent variable: the quarter numbers, as a column vector for sklearn.
    observed_quarters = observed.index.to_numpy()
    X_observed = observed_quarters.reshape(-1, 1)

    # The quarters to predict: the ones directly after the last observed quarter.
    last_quarter = int(observed_quarters.max())
    future_quarters = [last_quarter + step + 1 for step in range(quarters)]
    X_future = [[quarter] for quarter in future_quarters]

    columns = {'quarters_passed': list(observed_quarters) + future_quarters}
    for statistic in observed.columns:
        y_observed = observed[statistic].to_numpy()
        if future_quarters:
            model = LinearRegression().fit(X=X_observed, y=y_observed)
            # Round (not truncate): a fit landing on 399.999... must give 400.
            predicted = [int(round(value)) for value in model.predict(X_future)]
        else:
            predicted = []
        columns[statistic] = list(y_observed) + predicted

    return pd.DataFrame(columns)

In [None]:
# Observed per-quarter bid statistics plus a five-quarter linear extrapolation.
df_predictions = get_regression_predictions(df=df, quarters=5)

In [None]:
# Display the observed and predicted quarterly bid statistics.
df_predictions

### Prepare the dataframe purely for the tree classifier:

In [None]:
def create_dataset_for_tree(df_original: pd.DataFrame, heroes=True, template_type='XL+U', test_size = 0.2, random_state=0):
    '''Build the one-hot encoded features and the target for the tree classifiers.

    Only games whose 'template_type' equals template_type are used.
    With heroes=False the two hero columns are left out.
    The target is 'score_player_one'; every other kept column is a feature.

    Returns (X_train, X_test, y_train, y_test). With test_size=0 nothing is split:
    the full data is returned as (X, None, y, None).'''

    # Columns in the order the encoder should see them; heroes are optional.
    hero_columns = ('player_one_hero', 'player_two_hero')
    all_columns = ['score_player_one', 'player_one_town', 'player_one_hero', 'player_one_colour',
                   'player_two_town', 'player_two_hero', 'bidding_amount', 'turns_played', 'template_type']
    wanted = [column for column in all_columns if heroes or column not in hero_columns]

    # Restrict to the requested template.
    selected = df_original[wanted]
    selected = selected[selected['template_type'] == template_type]

    # One-hot encode, drop incomplete rows, renumber.
    encoded = pd.get_dummies(selected, drop_first=False)
    encoded = encoded.dropna().reset_index(drop=True)

    # The template indicator is constant after the filter, so it is removed.
    encoded = encoded.drop(['template_type_'+template_type], axis=1)

    target = encoded['score_player_one']
    features = encoded.drop('score_player_one', axis=1)

    if test_size == 0:
        return features, None, target, None

    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=test_size,
                                                        random_state=random_state)
    return X_train, X_test, y_train, y_test


In [None]:
# Full 'XL+U' dataset with hero columns; test_size=0 means no hold-out split,
# so X_test and y_test are None.
X_train, X_test, y_train, y_test = create_dataset_for_tree(
    df_original=df,
    heroes=True,
    template_type='XL+U',
    test_size=0,
)

In [None]:
def grow_forest(X, y, n_trees=10, seed=0):
    '''Fit n_trees shallow decision trees on (X, y) and return them as a list.

    Each tree looks at a single random feature per split (max_features=1), gets its
    own random_state (its position in the list) and a depth / min_samples_split drawn
    from 3..6 (both were fixed to 5 originally). The draws come from a private,
    seeded generator, so the same seed always yields the same forest.
    '''
    rng = random.Random(seed)
    return [DecisionTreeClassifier(random_state=tree_index,
                                   splitter='best',
                                   max_depth=rng.randint(3, 6),
                                   min_samples_split=rng.randint(3, 6),
                                   max_features=1).fit(X, y)
            for tree_index in range(n_trees)]


#Creating a home-brewed random forest for trees NOT dealing with hero choice
X_train, X_test, y_train, y_test = create_dataset_for_tree(df, heroes=False, template_type='XL+U',
                                                           test_size=0, random_state=2)
forest_no_heroes = grow_forest(X_train, y_train, n_trees=10, seed=0)

#Creating a home-brewed random forest for trees dealing with hero choice
# (X_train / y_train are left holding this hero-aware dataset for the cells below)
X_train, X_test, y_train, y_test = create_dataset_for_tree(df, heroes=True, template_type='XL+U',
                                                           test_size=0, random_state=4)
forest_heroes = grow_forest(X_train, y_train, n_trees=10, seed=1)


In [None]:
def forest_prediction(i: int, X: pd.DataFrame, forests: tuple):
    '''Combine the class probabilities of two forests for one row of X.

    i = index label of the instance in the dataframe X (looked up with .loc)
    pass the X with heroes!
    forests = (forest with heroes, forest without heroes) - in THAT order;
    each forest is a list of fitted trees.

    The row is handed to the trees as a one-row DataFrame, so the column names
    travel with the values and scikit-learn can check them against the names
    seen during fit. The forest without heroes gets the same row with every
    column whose name contains 'hero' removed.

    Returns the element-wise SUM (not the average) of predict_proba over all
    trees of both forests.
    '''

    hero_forest, plain_forest = forests[0], forests[1]

    # One-row frame (double brackets) instead of a bare value array.
    row_with_heroes = X.loc[[i]]
    hero_votes = np.sum([tree.predict_proba(row_with_heroes)[0] for tree in hero_forest], axis=0)

    # Only this single row needs its hero columns removed, not the whole frame.
    hero_headers = [header for header in X.columns if 'hero' in header]
    row_without_heroes = row_with_heroes.drop(columns=hero_headers)
    plain_votes = np.sum([tree.predict_proba(row_without_heroes)[0] for tree in plain_forest], axis=0)

    return hero_votes + plain_votes

In [None]:
# Summed class probabilities of both forests for the row labelled 32.
forest_prediction(i=32, X=X_train, forests=(forest_heroes, forest_no_heroes))

In [None]:
# Class probabilities of every hero-aware tree for the row labelled 34,
# stacked into one array with one row per tree.
forest_heroes_prediction = np.array([
    tree.predict_proba(X_train.loc[34].to_numpy().reshape(1, -1))[0]
    for tree in forest_heroes
])

In [None]:
#Useful in checking one tree:

# Pick the tree explicitly so the cell also runs on a fresh kernel. X_train holds
# the hero-aware dataset at this point, so the tree must come from forest_heroes
# for the feature names to line up.
TREE_TO_PLOT = 0
clf = forest_heroes[TREE_TO_PLOT]

fig = plt.figure(figsize=(25, 20))
plot_tree(clf,
          feature_names=list(X_train.columns),  # plain list of names
          class_names=['loss', 'win'],
          impurity=False,
          proportion=True,
          filled=True);
# Uncomment to write the figure to disk:
#fig.savefig('test.png')

In [2]:
from functions import *

In [3]:
# Load the second dataset. This rebinds `df`, replacing the frame used by the cells above.
# NOTE(review): the 'Unnamed: 0' column in the output below suggests the CSV was written
# with its index; confirm whether it should be read with index_col=0.
df = pd.read_csv("set2.csv")
df

Unnamed: 0,result,town,hero,color,bidding,opponent_town,opponent_hero,turns,template,template_type
0,0.0,tower,solmyr,red,2200.0,dungeon,gunnar,18.0,Nostalgia/TP,XL+U
1,1.0,dungeon,gunnar,blue,-2200.0,tower,solmyr,18.0,Nostalgia/TP,XL+U
2,0.0,stronghold,gundula,red,900.0,stronghold,crag hack,11.0,h3dm1/3,Mirror
3,1.0,stronghold,crag hack,blue,-900.0,stronghold,gundula,11.0,h3dm1/3,Mirror
4,0.0,rampart,giselle,red,900.0,rampart,aeris,24.0,h3dm1/3,Mirror
...,...,...,...,...,...,...,...,...,...,...
8047,0.0,dungeon,shakti,blue,2000.0,castle,valeska,9.0,Jebus Cross,Jebus
8048,1.0,rampart,mephala,blue,-5500.0,rampart,jenova,5.0,mt_Firewalk,Mirror
8049,0.0,rampart,jenova,red,5500.0,rampart,mephala,5.0,mt_Firewalk,Mirror
8050,0.0,necropolis,thant,red,4600.0,fortress,alkin,4.0,Rally,XL+U


In [4]:
# Both helpers are not defined in this notebook; they arrive through the star import of `functions`.
# NOTE(review): presumably X / y exclude the hero features and X_hero / y_hero include
# them - confirm against functions.py.
X, y, X_hero, y_hero = prepare_df_for_model(df)
forest_no_hero, forest_hero = create_random_forest_model(X, y, X_hero, y_hero)

In [5]:
# Single example query (bid 1000, necropolis vs. stronghold), one-hot encoded
# the same way as the training data.
input_df = pd.DataFrame({
    "bidding": [1000],
    "town": ["necropolis"],
    "opponent_town": ["stronghold"],
})
input_df = (
    pd.get_dummies(input_df, drop_first=False)
    .dropna()
    .reset_index(drop=True)
)

In [6]:
# Query the hero-less forest for necropolis vs. stronghold with a bid of 1000.
# NOTE(review): the argument order is defined in functions.py - presumably
# (town, opponent_town, use_heroes, hero, opponent_hero, bidding, forest, X); confirm.
run_model("necropolis", "stronghold",  False, "", "", 1000, forest_no_hero, X)

0.49656466283985806