In [9]:
import sqlalchemy as sql
import psycopg2
import pandas as pd
import numpy as np
# Database credentials
db_string = open("DB.txt", "r").read()
import datetime as dt
from sklearn.preprocessing import MinMaxScaler, label_binarize
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [10]:
def read_from_sql(table):
    '''Function that takes all of the results from a sql table
    
    Input: table - table data is extracted from
    
    Output: A dataframe containing the results'''
    engine = sql.create_engine(db_string)
    query = 'SELECT * FROM {}'.format(table)
    with engine.connect() as conn:
        df = pd.read_sql_query(query, conn)
    engine.dispose()
    return df

In [11]:
def query_builder(df, get_all = False):
    '''Returns the necessary fighter data when given a dataframe of fighter data. Also has the option to 
    return all of the fighters with an optional argument.
    
    Input: df - a dataframe containing fighter data that has the same column names as the database. 
    get_all - an optional argument that will return dataframe.
    
    Output: a dataframe that has all of the necessary fight data from the database.
    '''
    #Option to return every value
    if get_all == True:
        query = "SELECT * FROM t_fighter_series"
    #Option to return only a subset
    else:
        #Building a list to use in the query to select specific fighters
        fighters = np.append(test["r_fighter"].unique().tolist(), test["b_fighter"].unique().tolist())
        string_list = ["("]
        for fighter in fighters:
            string_list.append("'"+fighter+"'")
            string_list.append(", ")
        string_list = string_list[:-1]
        string_list.append(")")
        string = "".join(string_list)
        #Query with list of fighters inclduded
        query = "SELECT * FROM t_fighter_series WHERE fighter IN {}".format(string)
    engine = sql.create_engine(db_string)
    with engine.connect() as conn:
        df = pd.read_sql_query(query, conn)
    engine.dispose()
    return df

In [17]:
def create_features(fight_data, fighter_data, fights_back = 3):
    '''Function that creates the features that we are going to use to model with.
    
    Input: fight_data - a df containing the fight data from t_fight
    fighter_data - a df containing the fighter data from t_fighter.
    fights_back - the number of previous fights to be made into features per fighter.
    
    Ouput: a dataframe containing the features and the labels.
    '''
    #Strip out the necessary information from the fights
    red = fight_data[['r_fighter','date','id']]
    red.columns = ['fighter','date_of_fight','fight_id']
    blue = fight_data[['b_fighter','date','id']]
    blue.columns = ['fighter','date_of_fight','fight_id']
    #Concatenate them together
    combined = pd.concat([red, blue])
    merged = fighter_data.merge(combined, how = 'inner', on = 'fighter')
    #Filter any fights that are after the fight date
    stripped = merged[merged['date']<merged['date_of_fight']]
    #Count the wins and losses
    count = stripped[['fight_id','fighter','winner_boolean','win_by_categorical']].groupby(['fight_id','fighter','winner_boolean']).count().unstack()
    count.columns = count.columns.droplevel(level = 0)
    count.reset_index(inplace = True)
    count.fillna(0, inplace = True)
    count.columns.name = None
    count.columns = ['fight_id','fighter','losses','wins']
    #Subset the data according to fighter and fightid so we can extract the specific history
    fighter_subset = stripped.groupby(['fighter','fight_id']).head(fights_back).reset_index(drop=True)
    fighter_subset.set_index(['fight_id','fighter'], inplace = True)
    #Empty arrays used to store the information extracted in the for loop
    values = np.empty((0,fights_back))
    days = np.empty((0,fights_back))
    fighter_index = []
    fight_index = []
    #loop to pull values and index values from previous fights
    for idx in fighter_subset.index.unique():
        day = fighter_subset.loc[idx,'date_of_fight'].values[0]
        subset = fighter_subset.loc[idx, :]
        #if fighter only has one fight it will throw an error. Try and except used to account for that
        try:
            if isinstance(subset, pd.Series):
                subset = subset.to_frame().transpose()
                values = np.vstack([values, subset['win_by_categorical'].values])
                days = np.vstack([days, ((day - subset['date'])/np.timedelta64(1,'D')).tolist()])
                fighter_index.append(idx[1])
                fight_index.append(idx[0])
            else:
                values = np.vstack([values, subset['win_by_categorical'].values])
                days = np.vstack([days, ((day - subset['date'])/np.timedelta64(1,'D')).tolist()])
                fighter_index.append(idx[1])
                fight_index.append(idx[0])
        except ValueError:
            continue
    column_names_fights = []
    column_names_days = []
    #creating column labels for features
    for fights in range(fights_back):
        column_names_fights.append("fight_{}".format(fights+1))
    for fights in range(fights_back):
        column_names_days.append("fight_{}_days".format(fights+1))
    #The results of the previous fights are going to be made into dummies
    prev_fight = pd.DataFrame(values, columns = column_names_fights, dtype = "object")
    prev_fight_dummies = pd.get_dummies(prev_fight)
    prev_fight_days = pd.DataFrame(days) 
    features = prev_fight_dummies.merge(prev_fight_days, left_index = True, right_index = True)
    features.columns = prev_fight_dummies.columns.tolist() + column_names_days 
    features['fight_id'], features['fighter'] = fight_index, fighter_index
    #combine the results from previous fights and the count of overall wins and losses
    features = count.merge(features, how = 'inner', on = ['fight_id','fighter'])
    fight_data.set_index('id', inplace = True)
    features.set_index(['fight_id','fighter'], inplace = True)
    values = np.empty((0,len(features.columns)*2))
    counter = 0
    fight_index = []
    #final that takes links the all of the features to the fight id 
    for idx in fight_data.index.unique():
            fighters = fight_data.loc[idx, ['r_fighter','b_fighter']].values.tolist()
            try:
                fighter_1 = features.loc[(idx, fighters[0]),:].to_frame().transpose()
                fighter_2 = features.loc[(idx, fighters[1]),:].to_frame().transpose()
                fighter_data = np.hstack([fighter_1.values, fighter_2.values])
                values = np.vstack([values, fighter_data])
                fight_index.append(idx)
            except KeyError:
                counter += 1
                continue
    print("{} fights had to be excluded due to insufficient data.".format(counter))
    columns = features.columns
    prefixes = ['r_','b_']
    column_names = []
    #creating all of the names for the full feature set
    for prefix in prefixes:
        for column in columns:
            column_names.append(prefix + column)
    #creating the final feature set and linking it to the result of the fight
    full_features = pd.DataFrame(values, index = fight_index, columns = column_names)
    fight_data = fight_data['result']
    output = full_features.merge(fight_data, how = 'inner', right_index = True, left_index = True)
    return output

In [19]:
test = read_from_sql("t_fight_series")
fighters_test = query_builder(test, get_all = True)
test.head()

Unnamed: 0,id,r_fighter,b_fighter,win_by,last_round,date,winner,winner_boolean,result
0,1,Daisuke Nakamura,A-Sol Kwon,Submission,3,2007-10-28,Daisuke Nakamura,0,7
1,2,Eiji Ishikawa,A-Sol Kwon,Split Decision,2,2010-07-03,Eiji Ishikawa,0,9
2,3,Ronald Jhun,A-Sol Kwon,Submission,2,2010-11-06,A-Sol Kwon,1,1
3,4,Takasuke Kume,A-Sol Kwon,Split Decision,3,2014-08-17,A-Sol Kwon,1,3
4,5,Mansour Barnaoui,A-Sol Kwon,Submission,1,2019-05-18,Mansour Barnaoui,0,7


In [13]:
fighters_test.head()

Unnamed: 0,id,fighter,win_by,last_round,date,winner_boolean,win_by_categorical
0,1,A-Sol Kwon,Split Decision,3,2019-11-09,False,3
1,2,A-Sol Kwon,Submission,1,2019-05-18,False,1
2,3,A-Sol Kwon,Split Decision,3,2014-08-17,True,9
3,4,A-Sol Kwon,Submission,2,2010-11-06,True,7
4,5,A-Sol Kwon,Split Decision,2,2010-07-03,False,3


In [20]:
results = create_features(test, fighters_test)
results.head()

36136 fights had to be excluded due to insufficient data.


Unnamed: 0,r_losses,r_wins,r_fight_1_1.0,r_fight_1_2.0,r_fight_1_3.0,r_fight_1_4.0,r_fight_1_5.0,r_fight_1_6.0,r_fight_1_7.0,r_fight_1_8.0,...,b_fight_3_7.0,b_fight_3_8.0,b_fight_3_9.0,b_fight_3_10.0,b_fight_3_11.0,b_fight_3_12.0,b_fight_1_days,b_fight_2_days,b_fight_3_days,result
4,3.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1380.0,1506.0,2485.0,3
5,4.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1735.0,3115.0,3241.0,7
6,6.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,175.0,1910.0,3290.0,9
13,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,93.0,478.0,1232.0,4
14,2.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,238.0,331.0,716.0,10


In [65]:
#Initial test to see get a sense of the performance of different alogrithms
X, y = results.iloc[:,:-1], results.iloc[:,-1]
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state = 0)
dec_tree = DecisionTreeClassifier().fit(X_train, y_train)
forest = RandomForestClassifier().fit(X_train, y_train)
dummy = DummyClassifier().fit(X_train, y_train)
grad_boost = GradientBoostingClassifier().fit(X_train, y_train)
print("Decision Tree: {:.2f}".format(dec_tree.score(X_test, y_test)))
print("Training Score: {:.2f}".format(forest.score(X_test, y_test)))
print("Test Score: {:.2f}".format(grad_boost.score(X_test, y_test)))
print("Dummy Majority: {:.2f}".format(dummy.score(X_test, y_test)))



Decision Tree: 0.19
Random Forest: 0.22
Gradient Boosted: 0.29
Dummy Majority: 0.14


In [117]:
results = create_features(test, fighters_test, fights_back = 2)
results.head()

31405 fights had to be excluded due to insufficient data.


Unnamed: 0,r_losses,r_wins,r_fight_1_1.0,r_fight_1_2.0,r_fight_1_3.0,r_fight_1_4.0,r_fight_1_5.0,r_fight_1_6.0,r_fight_1_7.0,r_fight_1_8.0,...,b_fight_2_6.0,b_fight_2_7.0,b_fight_2_8.0,b_fight_2_9.0,b_fight_2_10.0,b_fight_2_11.0,b_fight_2_12.0,b_fight_1_days,b_fight_2_days,result
1,0.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,28.0,351.0,10
2,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,148.0,267.0,7
3,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,76.0,265.0,9
4,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,322.0,637.0,7
6,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1546.0,1655.0,7


In [119]:
X, y = results.iloc[:,:-1], results.iloc[:,-1]
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state = 0)
dec_tree = DecisionTreeClassifier().fit(X_train, y_train)
forest = RandomForestClassifier().fit(X_train, y_train)
dummy = DummyClassifier().fit(X_train, y_train)
grad_boost = GradientBoostingClassifier().fit(X_train, y_train)
print("Decision Tree: {:.2f}".format(dec_tree.score(X_test, y_test)))
print("Random Forest: {:.2f}".format(forest.score(X_test, y_test)))
print("Gradient Boosted: {:.2f}".format(grad_boost.score(X_test, y_test)))
print("Dummy Majority: {:.2f}".format(dummy.score(X_test, y_test)))



Decision Tree: 0.20
Random Forest: 0.23
Gradient Boosted: 0.30
Dummy Majority: 0.16


In [121]:
print("Training Score: {:.2f}".format(grad_boost.score(X_train, y_train)))
print("Test Score: {:.2f}".format(grad_boost.score(X_test, y_test)))
print("n_estimators: {}".format(grad_boost.n_estimators_))
#print("estimators: {}".format(grad_boost.estimators_))
print("classes: {}".format(grad_boost.classes_))
print("n_features: {}".format(grad_boost.n_features_))

Training Score: 0.39
Test Score: 0.30
n_estimators: 100
classes: [ 1  2  3  4  5  6  7  8  9 10 11 12]
n_features: 56


In [122]:
#First attempt at some hyperparameter tuning
X, y = results.iloc[:,:-1], results.iloc[:,-1]
y = label_binarize(y, classes = [1,2,3,5,6,7,8,9,10,11,12])
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_train_val, X_test, y_train_val, y_test = train_test_split(X_scaled, y, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, random_state = 0)
param_grid = {'estimator__n_estimators':range(50,100,10)}
gb = OneVsRestClassifier(GradientBoostingClassifier())
search = GridSearchCV(gb,
                      param_grid,  
                      n_jobs = 4, 
                      iid = False, 
                      cv =5)
search.fit(X_train, y_train)
search.best_params_

{'estimator__n_estimators': 90}

In [123]:
param_grid = {'estimator__max_depth':range(2,5),
             'estimator__min_samples_split': range(2,10)}
gb = OneVsRestClassifier(GradientBoostingClassifier(n_estimators = 90))
search = GridSearchCV(gb,
                      param_grid,
                      n_jobs = 4, 
                      iid = False, 
                      cv =5)
search.fit(X_train, y_train)
search.best_params_

{'estimator__max_depth': 4, 'estimator__min_samples_split': 9}

In [124]:
param_grid = {'estimator__min_samples_leaf':range(1,4),
             'estimator__max_features': range(10,50,10)}
gb = OneVsRestClassifier(GradientBoostingClassifier(
                        n_estimators = 90,
                        max_depth = 4,
                        min_samples_split = 9))
search = GridSearchCV(gb,
                      param_grid,
                      n_jobs = 4, 
                      iid = False, 
                      cv =5)
search.fit(X_train, y_train)
search.best_params_

{'estimator__max_features': 40, 'estimator__min_samples_leaf': 1}

In [126]:
param_grid = {'estimator__subsample':[.6,.7,.75,.8,.85,.9, 1.0]}
gb = OneVsRestClassifier(GradientBoostingClassifier(
                        min_samples_leaf = 1,
                        max_features = 40,
                        n_estimators = 90,
                        max_depth = 4,
                        min_samples_split = 9))
search = GridSearchCV(gb,
                      param_grid,
                      n_jobs = 4, 
                      iid = False, 
                      cv =5)
search.fit(X_train, y_train)
search.best_params_

{'estimator__subsample': 0.8}

In [128]:
param_grid = {'estimator__learning_rate':[.1,.2, .3, .4, .5]}
gb = OneVsRestClassifier(GradientBoostingClassifier(
                        min_samples_leaf = 1,
                        max_features = 40,
                        max_depth = 4,
                        subsample = .8,
                        n_estimators = 90,
                        min_samples_split = 9,
                        ))
search = GridSearchCV(gb,
                      param_grid,
                      n_jobs = 4, 
                      iid = False, 
                      cv =5)
search.fit(X_train, y_train)
search.best_params_

{'estimator__learning_rate': 0.4}

In [129]:
gb = OneVsRestClassifier(GradientBoostingClassifier(learning_rate = .4,
                                                    min_samples_leaf = 1,
                                                    n_estimators = 90,
                                                    max_features = 40,
                                                    max_depth = 4,
                                                    subsample = .8,
                                                    min_samples_split = 9))
gb.fit(X_train, y_train)
print("Training Score: {:.2f}".format(gb.score(X_train, y_train)))
print("Test Score: {:.2f}".format(gb.score(X_test, y_test)))

Training Score: 0.36
Test Score: 0.17
