In [15]:
import sqlalchemy as sql
import psycopg2
import pandas as pd
import numpy as np
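# db_string = "<redacted>"  # SQLAlchemy connection URL; it was removed from this copy and must be defined (e.g. read from an environment variable) before any cell that queries the database is run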
import datetime as dt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [2]:
def read_from_sql(table):
    '''Function that takes all of the results from a sql table
    
    Input: table - name of the table the data is extracted from. The name is interpolated
    directly into the SQL text, so it must come from trusted code and never from user input.
    
    Output: A dataframe containing the results'''
    engine = sql.create_engine(db_string)
    query = 'SELECT * FROM {}'.format(table)
    try:
        with engine.connect() as conn:
            return pd.read_sql_query(query, conn)
    finally:
        #Dispose of the engine even when the query fails, so its connection pool is always released
        engine.dispose()

In [10]:
def query_builder(df, get_all = False):
    '''Returns the fighter history rows from t_fighter_series for the fighters found in a dataframe 
    of fights. Also has the option to return every fighter with an optional argument.
    
    Input: df - a dataframe of fights with r_fighter and b_fighter columns. It is only read when 
    get_all is False.
    get_all - an optional argument; when True every row of t_fighter_series is returned.
    
    Output: a dataframe with the matching rows of t_fighter_series. When df holds no fighter names 
    the result is an empty dataframe that still has the table's columns.
    '''
    params = None
    #Option to return every value
    if get_all:
        query = "SELECT * FROM t_fighter_series"
    #Option to return only a subset
    else:
        #Names come from the dataframe that was passed in (not from a notebook global)
        fighters = pd.concat([df["r_fighter"], df["b_fighter"]]).dropna().unique().tolist()
        if fighters:
            #The names are sent as one bound, expanding parameter, so a name containing an
            #apostrophe cannot break or alter the statement
            query = sql.text("SELECT * FROM t_fighter_series WHERE fighter IN :fighters")
            query = query.bindparams(sql.bindparam("fighters", expanding = True))
            params = {"fighters": fighters}
        else:
            #No names to look up: a predicate that is never true returns zero rows
            query = sql.text("SELECT * FROM t_fighter_series WHERE 1 = 0")
    engine = sql.create_engine(db_string)
    try:
        with engine.connect() as conn:
            return pd.read_sql_query(query, conn, params = params)
    finally:
        #Dispose of the engine even when the query fails
        engine.dispose()

In [4]:
def create_features(fight_data, fighter_data, fights_back = 3):
    '''Function that creates the features that we are going to use to model with.
    
    Input: fight_data - a dataframe of fights with the columns r_fighter, b_fighter, date, fight_id 
    and result. It is not modified, so the same dataframe can be passed in again.
    fighter_data - a dataframe of fighter history with the columns fighter, date, winner_boolean 
    and win_by_categorical. The first rows found for a fighter are the ones used as that fighter's 
    previous fights (presumably the table is ordered newest first - TODO confirm).
    fights_back - the number of previous fights per fighter that are turned into features.
    
    Output: a dataframe indexed by fight_id. For each corner (prefix r_ or b_) it holds losses, wins, 
    fight_1..fight_N (the win_by_categorical of the previous fights) and fight_1_days..fight_N_days 
    (days between the previous fight and this fight), followed by the result column. A fight is left 
    out when either fighter has fewer than fights_back previous fights; the number of fights left out 
    is printed.
    '''
    #Strip out the necessary information from the fights, one row per fighter per fight
    red = fight_data[['r_fighter','date','fight_id']].rename(
        columns = {'r_fighter': 'fighter', 'date': 'date_of_fight'})
    blue = fight_data[['b_fighter','date','fight_id']].rename(
        columns = {'b_fighter': 'fighter', 'date': 'date_of_fight'})
    #Concatenate them together and attach every history row of each fighter
    combined = pd.concat([red, blue])
    merged = fighter_data.merge(combined, how = 'inner', on = 'fighter')
    #Keep only the history that happened before the date of the fight
    stripped = merged[merged['date'] < merged['date_of_fight']]
    #Count the wins and losses
    count = stripped[['fight_id','fighter','winner_boolean','win_by_categorical']].groupby(['fight_id','fighter','winner_boolean']).count().unstack()
    count.columns = count.columns.droplevel(level = 0)
    count = count.reset_index().fillna(0)
    #After the unstack the two count columns are the False and True outcomes, in that order
    count.columns = ['fight_id','fighter','losses','wins']
    #Collect the previous fights for every (fight, fighter) pair; the rows are stacked once at the end
    win_rows = [np.empty((0,fights_back))]
    day_rows = [np.empty((0,fights_back))]
    fighter_index = []
    fight_index = []
    for (fight_id, fighter), history in stripped.groupby(['fight_id','fighter'], sort = False):
        #A fighter without enough previous fights gets no feature row
        if len(history) < fights_back:
            continue
        recent = history.head(fights_back)
        day = recent['date_of_fight'].iloc[0]
        win_rows.append(recent['win_by_categorical'].values)
        day_rows.append(((day - recent['date'])/np.timedelta64(1,'D')).tolist())
        fighter_index.append(fighter)
        fight_index.append(fight_id)
    feature_array = np.hstack([np.vstack(win_rows), np.vstack(day_rows)])
    column_names = ["fight_{}".format(fights+1) for fights in range(fights_back)]
    column_names += ["fight_{}_days".format(fights+1) for fights in range(fights_back)]
    feature_index = pd.MultiIndex.from_arrays([fight_index, fighter_index], names = ['fight_id','fighter'])
    features = pd.DataFrame(feature_array, index = feature_index, columns = column_names).reset_index()
    features = count.merge(features, how = 'inner', on = ['fight_id','fighter'])
    #Index a copy of the fights by fight_id; the caller's dataframe keeps its fight_id column
    fights = fight_data.set_index('fight_id')
    #Sorted so the tuple lookups below run on a lexsorted index
    features = features.set_index(['fight_id','fighter']).sort_index()
    fight_rows = [np.empty((0,len(features.columns)*2))]
    counter = 0
    fight_index = []
    for idx in fights.index.unique():
        red_name, blue_name = fights.loc[idx, ['r_fighter','b_fighter']].values.tolist()
        try:
            red_row = features.loc[(idx, red_name),:].values
            blue_row = features.loc[(idx, blue_name),:].values
        except KeyError:
            #At least one of the two fighters has no feature row for this fight
            counter += 1
            continue
        fight_rows.append(np.hstack([red_row, blue_row]))
        fight_index.append(idx)
    values = np.vstack(fight_rows)
    print("{} fights had to be excluded due to insufficient data.".format(counter))
    #Red corner columns first, then the same columns for the blue corner
    column_names = [prefix + column for prefix in ['r_','b_'] for column in features.columns]
    full_features = pd.DataFrame(values, index = fight_index, columns = column_names)
    output = full_features.merge(fights['result'], how = 'inner', right_index = True, left_index = True)
    return output

In [18]:
#Load every fight and the history of every fighter, then number the fights from 1 so each has an id
test = read_from_sql("t_fight_series")
fighters_test = query_builder(test, get_all = True)
test = test.assign(fight_id = range(1, len(test) + 1))
test.head()

Unnamed: 0,r_fighter,b_fighter,win_by,last_round,date,winner,winner_boolean,result,fight_id
0,Yaroslav Amosov,Mark Lemminger,TKO,1,2020-08-21,Yaroslav Amosov,True,10,1
1,Yaroslav Amosov,David Rickels,Submission,2,2019-08-24,Yaroslav Amosov,True,7,2
2,Yaroslav Amosov,Gerald Harris,Split Decision,3,2018-07-13,Yaroslav Amosov,True,9,3
3,Yaroslav Amosov,Diogo Cavalcanti,Submission,1,2017-03-18,Yaroslav Amosov,True,7,4
4,Yaroslav Amosov,Khasanbek Abdulaev,TKO,2,2016-03-03,Yaroslav Amosov,True,10,5


In [13]:
#Build the modelling table with the default number of previous fights per fighter
results = create_features(fight_data = test, fighter_data = fighters_test)
results.head()

  return self._getitem_tuple(key)


36139 fights had to be excluded due to insufficient data.


Unnamed: 0,r_losses,r_wins,r_fight_1,r_fight_2,r_fight_3,r_fight_1_days,r_fight_2_days,r_fight_3_days,b_losses,b_wins,b_fight_1,b_fight_2,b_fight_3,b_fight_1_days,b_fight_2_days,b_fight_3_days,result
1,0.0,23.0,9.0,7.0,9.0,182.0,363.0,552.0,1.0,16.0,10.0,9.0,7.0,28.0,351.0,420.0,10
2,0.0,21.0,9.0,9.0,7.0,189.0,407.0,736.0,7.0,23.0,10.0,9.0,4.0,148.0,267.0,456.0,7
3,0.0,19.0,7.0,7.0,9.0,329.0,482.0,755.0,6.0,23.0,1.0,10.0,9.0,76.0,265.0,322.0,9
4,0.0,17.0,9.0,10.0,7.0,273.0,380.0,400.0,3.0,8.0,4.0,3.0,10.0,322.0,637.0,924.0,7
7,0.0,11.0,7.0,9.0,10.0,29.0,112.0,132.0,3.0,0.0,4.0,1.0,1.0,307.0,512.0,734.0,7


In [16]:
#Every column but the last is a feature; the last column is the label
X, y = results.iloc[:,:-1], results.iloc[:,-1]
#Split first and fit the scaler on the training rows only, so nothing about the test rows leaks in
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, random_state = 0)
scaler = MinMaxScaler().fit(X_train_raw)
X_train, X_test = scaler.transform(X_train_raw), scaler.transform(X_test_raw)
#Still defined in case another cell reads the fully scaled matrix
X_scaled = scaler.transform(X)
#Fixed seeds make the scores repeatable from run to run
dec_tree = DecisionTreeClassifier(random_state = 0).fit(X_train, y_train)
forest = RandomForestClassifier(random_state = 0).fit(X_train, y_train)
#Explicit strategy so the baseline really is the majority class its label claims
dummy = DummyClassifier(strategy = "most_frequent").fit(X_train, y_train)
grad_boost = GradientBoostingClassifier(random_state = 0).fit(X_train, y_train)
print("Decision Tree: {:.2f}".format(dec_tree.score(X_test, y_test)))
print("Random Forest: {:.2f}".format(forest.score(X_test, y_test)))
print("Gradient Boosted: {:.2f}".format(grad_boost.score(X_test, y_test)))
print("Dummy Majority: {:.2f}".format(dummy.score(X_test, y_test)))



Decision Tree: 0.19
Random Forest: 0.22
Gradient Boosted: 0.29
Dummy Majority: 0.16


In [19]:
#Rebuild the modelling table using only the two previous fights of each fighter
results = create_features(fight_data = test, fighter_data = fighters_test, fights_back = 2)
results.head()

  return self._getitem_tuple(key)


31405 fights had to be excluded due to insufficient data.


Unnamed: 0,r_losses,r_wins,r_fight_1,r_fight_2,r_fight_1_days,r_fight_2_days,b_losses,b_wins,b_fight_1,b_fight_2,b_fight_1_days,b_fight_2_days,result
1,0.0,23.0,9.0,7.0,182.0,363.0,1.0,16.0,10.0,9.0,28.0,351.0,10
2,0.0,21.0,9.0,9.0,189.0,407.0,7.0,23.0,10.0,9.0,148.0,267.0,7
3,0.0,19.0,7.0,7.0,329.0,482.0,6.0,23.0,1.0,10.0,76.0,265.0,9
4,0.0,17.0,9.0,10.0,273.0,380.0,3.0,8.0,4.0,3.0,322.0,637.0,7
6,0.0,13.0,10.0,7.0,77.0,189.0,2.0,0.0,1.0,1.0,1546.0,1655.0,7


In [20]:
#Every column but the last is a feature; the last column is the label
X, y = results.iloc[:,:-1], results.iloc[:,-1]
#Split first and fit the scaler on the training rows only, so nothing about the test rows leaks in
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, random_state = 0)
scaler = MinMaxScaler().fit(X_train_raw)
X_train, X_test = scaler.transform(X_train_raw), scaler.transform(X_test_raw)
#Still defined in case another cell reads the fully scaled matrix
X_scaled = scaler.transform(X)
#Fixed seeds make the scores repeatable from run to run
dec_tree = DecisionTreeClassifier(random_state = 0).fit(X_train, y_train)
forest = RandomForestClassifier(random_state = 0).fit(X_train, y_train)
#Explicit strategy so the baseline really is the majority class its label claims
dummy = DummyClassifier(strategy = "most_frequent").fit(X_train, y_train)
grad_boost = GradientBoostingClassifier(random_state = 0).fit(X_train, y_train)
print("Decision Tree: {:.2f}".format(dec_tree.score(X_test, y_test)))
print("Random Forest: {:.2f}".format(forest.score(X_test, y_test)))
print("Gradient Boosted: {:.2f}".format(grad_boost.score(X_test, y_test)))
print("Dummy Majority: {:.2f}".format(dummy.score(X_test, y_test)))



Decision Tree: 0.19
Random Forest: 0.22
Gradient Boosted: 0.30
Dummy Majority: 0.16
