In [15]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_selection import RFECV
import gzip
import dill
import math

In [16]:
def serialize_model(file_name):
    """Serialize the trained machine learning model.

    Parameters
    ----------
    file_name : str (default='____.dill')
        File name to use when persisting trained model.
    """""

    model = train_model()
    with gzip.open(file_name, 'wb') as f:
        dill.dump(model, f)

In [17]:
def binarize(result):
    if result == 'W':
        return 1
    elif result == 'L':
        return 0

In [18]:
def dateify(date):
    return datetime.strptime(date, '%Y-%m-%d')

In [26]:
dateify('1990-12-20')

datetime.datetime(1990, 12, 20, 0, 0)

In [19]:
def train_model():
    """Train a machine learning model to predict UFC fight winner.

    Returns
    -------
    best_model : scikit-learn trained classifier
        Returns the best model found through tuning the hyperparameters.
        """
    cwd = os.getcwd()
    fightdata = pd.read_csv(cwd + '/UFCstats.csv', engine='python')
    fightdata = fightdata[:int(len(fightdata)/2)]
    fightdata.dropna(subset=['Date'], inplace=True)
    fightdata.dropna(subset=['F1_DOB'], inplace=True)
    fightdata.dropna(subset=['F2_DOB'], inplace=True)
    fightdata['DateTime'] = fightdata.apply(lambda row: dateify(row.Date), axis=1)
    fightdata['F1_dob_datetime'] = fightdata.apply(lambda row: dateify(row.F1_DOB), axis=1)
    fightdata['F2_dob_datetime'] = fightdata.apply(lambda row: dateify(row.F2_DOB), axis=1)

    # drop if date before -> UFC 21?
    dt = datetime.strptime('July 16, 1999', '%B %d, %Y')
    fightdata = fightdata[fightdata.DateTime >= dt]

    # drop location, F2_result, Date
    fights = fightdata.drop(['F1_DOB', 'F2_DOB', 'F1_profile_url', 'F2_profile_url',
                             'F2_result', 'Date', 'Location', 'DateTime', 
                             'Attendance'], axis=1).dropna()

    # change F1_result to binary_result column
    fights['BinaryResult'] = fights.apply(lambda row: binarize(row.F1_result), axis=1)
    fights.dropna(subset=['BinaryResult'], inplace=True)
    # will drop binary result inplace later

    RESULTS = fights['BinaryResult'].values.tolist() # predictor variable y
    fights = fights.drop(columns=['BinaryResult']) # now drop

    # split into two dfs
    f1_df = fights.filter(regex='^F1')
    f2_df = fights.filter(regex='^F2')

    # get averages from matches NOT the current put into two lists
    f1_predata = []
    for i, row in f1_df.iterrows():
        f1_predata.append(f1_df.drop(i).mean().tolist())
    print('F1_df done')
    f2_predata = []
    for i, row in f2_df.iterrows():
        f2_predata.append(f2_df.drop(i).mean().tolist())
    print('F2_df done')
    
    # subtract lists to get diff between F1 averages and F2 averages before match
    DATA = np.subtract(f1_predata, f2_predata)

    X_train, X_test, y_train, y_test = train_test_split(DATA, RESULTS, test_size=0.20, random_state=42)

    logistic_classifier = LogisticRegressionCV(cv=5, max_iter=1000)
    logistic_classifier.fit(X_train, y_train)
    print(logistic_classifier.score(X_test, y_test))
   
    return logistic_classifier

In [20]:
def deploy_model(file_name='UFC_logistic_W-L_model.dill.gz'):
    """Return the loaded trained model.

    Parameters
    ----------
    file_name : str (default='UFC_logistic_WL_model.dill.gz')
        File name to use when persisting trained model.

    Returns
    -------
    model : scikit-learn trained classifier
        Returns the serialized trained model.
    """

    # if the model has not been persisted, create it
    try:
        with gzip.open(file_name, 'rb') as f:
            model = dill.load(f)
    except FileNotFoundError:
        print("Trained model not found, creating the file.")
        serialize_model(file_name)
        return deploy_model(file_name=file_name)
    
    return model

In [21]:
def focus_only_stats(df):
    return df[['F1_Height', 'F1_Weight', 'F1_Reach',  'F1_DOB', 'F1_KD', 'F1_SS_hit',
       'F1_SS_att', 'F1_totalStrikes_hit', 'F1_totalStrikes_att',
       'F1_TD_conv', 'F1_TD_att', 'F1_Sub', 'F1_pass', 'F1_rev',
       'F1_head_hit', 'F1_head_att', 'F1_body_conv', 'F1_body_att',
       'F1_leg_conv', 'F1_leg_att', 'F1_distance_conv', 'F1_distance_att', 
       'F1_clinch_conv', 'F1_clinch_att', 'F1_ground_conv', 'F1_ground_att']]

In [47]:
def get_preds():
    f1_name = input('Fighter 1: ')
    f2_name = input('Fighter 2: ')
    cwd = os.getcwd()
    df = pd.read_csv(cwd + '/UFCstats.csv', engine='python')
    f1_df = df[df['Fighter1'] == f1_name]
    f2_df = df[df['Fighter1'] == f2_name]
    f1_less = focus_only_stats(f1_df)
    f2_less = focus_only_stats(f2_df)
    f1_ewm = f1_less.ewm(alpha=0.5).mean().iloc[[-1]].values
    f2_ewm = f2_less.ewm(alpha=0.5).mean().iloc[[-1]].values
    f1_ewm[0][3] = dateify(f1_ewm[0][3])
    f2_ewm[0][3] = dateify(f2_ewm[0][3])
    loaded_model = deploy_model()
    subbed = np.subtract(f1_ewm, f2_ewm)
    print(loaded_model.predict(subbed))
#     if loaded_model.predict(subbed)[0] == 1.0:
#         print('Predicted winner: ', f1_name)
#     elif loaded_model.predict(subbed)[0] == 0.0:
#         print('Predicted winner: ', f2_name)
#     #print('Predicted result: ', loaded_model.predict(subbed)[0])
#     #print('Chance of ', f1_name, ' winning: ', 
#          # str(loaded_model.predict_proba(subbed)[0,1]*100) + '%')

In [48]:
get_preds()

Fighter 1: Robert Whittaker
Fighter 2: Israel Adesanya


TypeError: float() argument must be a string or a number, not 'datetime.timedelta'

In [49]:
'2018-08-10' - '2018-04-12'

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [16]:
df = pd.read_csv('UFCstats.csv')

In [19]:
df[df.Fighter1.startswith('Khalil')]

AttributeError: 'Series' object has no attribute 'startswith'

In [25]:
df['F1_Reach'].isna()

0        False
1        False
2        False
3        False
4        False
         ...  
10469     True
10470     True
10471     True
10472     True
10473     True
Name: F1_Reach, Length: 10474, dtype: bool

In [30]:
count = 0
for i, row in df.iterrows():
    if math.isnan(row['F1_Reach']):
        df.loc[i, 'F1_Reach'] = df.loc[i, 'F1_Height']
        count += 1
        
print(count)

924


In [37]:
len(df[df['F1_Reach'].isna()])

9

In [34]:
df[df['F1_Reach'].isna()]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Fighter1,F1_Height,F1_Weight,F1_Reach,F1_DOB,F1_profile_url,Fighter2,F2_Height,...,F1_clinch_att,F2_clinch_conv,F2_clinch_att,F1_ground_conv,F1_ground_att,F2_ground_conv,F2_ground_att,Date,Location,Attendance
3730,3730,3730,Tom Blackledge,,205.0,,,http://www.ufcstats.com/fighter-details/2adb11...,Anthony Perosh,75.0,...,0,0,0,4,4,4,7,2011-02-26,"Sydney, New South Wales, Australia",18186.0
5122,5122,5122,Jack Nilson,,,,,http://www.ufcstats.com/fighter-details/53e533...,Saeed Hosseini,,...,0,0,2,11,11,0,0,1997-05-30,"Augusta, Georgia, USA",5100.0
5218,5218,5218,Frank Hamaker,,,,,http://www.ufcstats.com/fighter-details/c3c23c...,Thaddeus Luster,75.0,...,0,0,0,1,2,0,0,1994-03-11,"Denver, Colorado, USA",2000.0
5427,5427,5427,Mike Davis,,,,,http://www.ufcstats.com/fighter-details/c8661e...,Gilbert Burns,70.0,...,2,4,5,0,0,17,20,2019-04-27,"Sunrise, Florida, USA",12754.0
10322,10322,10322,Noe Hernandez,,,,,http://www.ufcstats.com/fighter-details/df2cf6...,Chuck Liddell,74.0,...,12,21,36,1,1,5,10,1998-05-15,"Mobile, Alabama, USA",4200.0
10350,10350,10350,Sam Fulton,,,,,http://www.ufcstats.com/fighter-details/1f5f75...,Alex Hunter,69.0,...,0,0,0,0,0,5,9,1997-07-27,"Birmingham, Alabama, USA",4800.0
10359,10359,10359,Saeed Hosseini,,,,,http://www.ufcstats.com/fighter-details/21f297...,Jack Nilson,,...,2,0,0,0,0,11,11,1997-05-30,"Augusta, Georgia, USA",5100.0
10439,10439,10439,Felix Lee Mitchell,,,,,http://www.ufcstats.com/fighter-details/6cbb76...,Ken Shamrock,73.0,...,3,3,3,0,0,1,1,1994-09-09,"Charlotte, North Carolina, USA",
10457,10457,10457,Ray Wizard,,,,,http://www.ufcstats.com/fighter-details/ea0ad1...,Patrick Smith,74.0,...,0,1,1,0,0,0,0,1994-03-11,"Denver, Colorado, USA",2000.0


In [35]:
df.to_csv('UFCstats.csv', index=False)