In [52]:
#import re
import datetime

import pandas as pd
import pandas.io.sql as sqlio
import numpy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.preprocessing import OneHotEncoder
from sqlalchemy import create_engine

import psycopg2
from sqlalchemy.engine.url import URL

#config method from config.py file.  For database access credentials
from config import config


In [2]:
#define Read, write, & query POSTGRESQL functions

def read_df_from_azure(query):
    
    params = config("database.ini")
    engine = create_engine(URL("postgresql", params["user"], params["password"], params["host"], 5432, params["dbname"]))

    # Context manager makes sure the `Connection` is closed safely and implicitly
    with engine.connect() as conn:
        
        df = pd.read_sql_query(query, conn)
        
        # print(conn.in_transaction()) # False
        # do_something_with(conn)
        
        #trans = conn.begin()
        #print(conn.in_transaction()) # True        
        # do_whatever_with(trans)
        
        #print(conn.closed) # False
    #print('Is Connection with-OUT closed?', conn.closed) # True
    engine.dispose()
    return df
    
    

    
    
def write_df_to_azure(df, table_title = "dummy_table"):
   
    params = config("database.ini")
    engine = create_engine(URL("postgresql", params["user"], params["password"], params["host"], 5432, params["dbname"]))

    with engine.connect() as conn:
        
        df.to_sql(table_title, con = engine, if_exists = "replace", method = "multi")

        conn.closed
    engine.dispose()
    
    
def execute_query_on_azure(query):
    params = config("database.ini")
    engine = create_engine(URL("postgresql", params["user"], params["password"], params["host"], 5432, params["dbname"]))

    
    with engine.connect() as connection:
        result = connection.execute(query)

        
    engine.dispose()
    return result


In [3]:
t_fighter_series = read_df_from_azure("SELECT * FROM t_fighter_series;")

print("Fighters Total Size: {}".format(t_fighter_series.shape))
t_fighter_series.tail()

Fighters Total Size: (109232, 6)


Unnamed: 0,fighter,win_by,last_round,date,winner_boolean,win_by_categorical
109227,Zvonimir Brala,KO,1,2009-10-31,False,4
109228,Zvonimir Brala,Split Decision,1,2006-09-01,True,9
109229,Zvonimir Brala,Submission,1,2005-07-17,True,7
109230,Zvonimir Kralj,Submission,1,2017-11-11,True,7
109231,Zymantas Maumevicius,TKO,1,2012-12-08,False,4


In [4]:
t_fight_series = read_df_from_azure("SELECT * FROM t_fight_series;")

print("Fights Total size: {}".format(t_fight_series.shape))
t_fight_series.head()


Fights Total size: (54616, 8)


Unnamed: 0,r_fighter,b_fighter,win_by,last_round,date,winner,winner_boolean,result
0,Yaroslav Amosov,Mark Lemminger,TKO,1,2020-08-21,Yaroslav Amosov,True,10
1,Yaroslav Amosov,David Rickels,Submission,2,2019-08-24,Yaroslav Amosov,True,7
2,Yaroslav Amosov,Gerald Harris,Split Decision,3,2018-07-13,Yaroslav Amosov,True,9
3,Yaroslav Amosov,Diogo Cavalcanti,Submission,1,2017-03-18,Yaroslav Amosov,True,7
4,Yaroslav Amosov,Khasanbek Abdulaev,TKO,2,2016-03-03,Yaroslav Amosov,True,10


In [5]:
def retrieve_fights_before_date(fighter, date):
    global t_fighter_series
    global t_fight_series
    
    all_fighter_fights = t_fighter_series.loc[t_fighter_series['fighter'] == fighter]
    date_sorted_fights = all_fighter_fights.sort_values(by = ["date"], ascending = False) #,  inplace = True
    

#     print("date 1 - date 2: ", (date_sorted_fights.iloc[0]["date"] - date_sorted_fights.iloc[1]["date"]).days)
    
    return(date_sorted_fights[date_sorted_fights["date"] < date])
    


def create_skeleton_df(num_prev_fights = 1):
    
    return(pd.DataFrame(columns = ["R_Fighter", "R_Total_Fights", "R_Win_Percentage",
                                   "R_Method_1_prev", "R_Rounds_1_prev", "R_Winner_1_prev", "R_Prev_Fight_days",
                                   "B_Fighter", "B_Total_Fights", "B_Win_Percentage",
                                   "B_Method_1_prev", "B_Rounds_1_prev", "B_Winner_1_prev", "B_Prev_Fight_days"]))
    #Rounds of previous fight?

    


def prepare_x_input_line(index):
    global t_fighter_series
    global t_fight_series 
    
    printing = False
    
    fight_row = t_fight_series.iloc[index]

    
    #get the list of most recent fights for each fighter on the fight row before the active date
    b_prev_fights = retrieve_fights_before_date(fight_row["b_fighter"], fight_row["date"]) 
    r_prev_fights = retrieve_fights_before_date(fight_row["r_fighter"], fight_row["date"]) 
    
    

    #if Empty, return an empty dict. Can improve this to enter values for empty values for win %, prev resutls and other
    if r_prev_fights.empty or b_prev_fights.empty:
        return {}
    
    #reset the indecies to be able to index fights in order
    r_prev_fights.reset_index(drop = True, inplace = True)
    b_prev_fights.reset_index(drop = True, inplace = True)

    #get the total number of fights
    r_total_fights = r_prev_fights.shape[0]
    b_total_fights = b_prev_fights.shape[0]
    
    
    if printing:
        print("Red Fighter:")
        print(r_prev_fights.head())
    
        print()
        print("Blue Fighter:")
        print(b_prev_fights.head())
    

#     print("date: {}".format(fight_row["date"]))
#     print("{} (R) Previous fights: \n{}".format(fight_row["r_fighter"], r_prev_fights))
#     print("{} (B) Previous fights: \n{}".format(fight_row["b_fighter"], b_prev_fights))

    
    #todo here:
    # 0. Think about randomly switching red / blue, or maybe about how we can utilize favorite by adding that data 
    #    from scraper
    # 1. gather each input line using one fight line using the function
    #    retrieve_fights_before_date 
    # 2. put inputs in properly formatted row
    # 3. Make sure every data has an input.  Can for now can drop lines, but see how many
    #    end up being dropped
    # 4. append row to larger X Dataset
    

    return {"R_Fighter" : fight_row["r_fighter"],
            "R_Total_Fights" : r_total_fights,
            "R_Win_Percentage" : sum(r_prev_fights["winner_boolean"] == True) / r_total_fights,
            "R_Method_1_prev" : r_prev_fights["win_by"][0],
            "R_Rounds_1_prev" : r_prev_fights["last_round"][0],
            "R_Winner_1_prev" : r_prev_fights["winner_boolean"][0],
            "R_Prev_Fight_days" : (fight_row["date"] - r_prev_fights["date"][0]).days,
            "B_Fighter" : fight_row["b_fighter"],
            "B_Total_Fights" : b_total_fights,
            "B_Win_Percentage" : sum(b_prev_fights["winner_boolean"] == True) / b_total_fights,
            "B_Method_1_prev" : b_prev_fights["win_by"][0],
            "B_Rounds_1_prev" : b_prev_fights["last_round"][0],
            "B_Winner_1_prev" : b_prev_fights["winner_boolean"][0],
            "B_Prev_Fight_days" : (fight_row["date"] - b_prev_fights["date"][0]).days,
            "Winner": fight_row["winner_boolean"],
            "Win_Method": fight_row["win_by"]}
        

# prepare_x_input_line(1)
# print(retrieve_fights_before_date("Aaron Riley", dates[7]))


In [8]:
t_fight_series.shape[0]

54616

In [9]:
#eventually make num_last fights a variable that we can tune / test like a hyperparameter
input_X = create_skeleton_df()

fight_iterations = t_fight_series.shape[0]

empty_fighters = 0
# for fight_number in range(t_fighter_series.shape[0]):
for fight_number in range(fight_iterations):
    if fight_number % 1000 == 0:
        print("Fight {}".format(fight_number + 1))
    x_row = prepare_x_input_line(fight_number)
#     print("Current x_row: ".format(x_row))
    
    if bool(x_row):
#         print("Adding Row: ", x_row)
        input_X = input_X.append(x_row, ignore_index = True)
    else:
#         print("empty Row")
        empty_fighters += 1
          
#     print()
#     print("----------------")
#     print()
    

print("Empty fights: {}".format((empty_fighters)/fight_iterations))
print()
input_X.tail()

Fight 1
Fight 1001
Fight 2001
Fight 3001
Fight 4001
Fight 5001
Fight 6001
Fight 7001
Fight 8001
Fight 9001
Fight 10001
Fight 11001
Fight 12001
Fight 13001
Fight 14001
Fight 15001
Fight 16001
Fight 17001
Fight 18001
Fight 19001
Fight 20001
Fight 21001
Fight 22001
Fight 23001
Fight 24001
Fight 25001
Fight 26001
Fight 27001
Fight 28001
Fight 29001
Fight 30001
Fight 31001
Fight 32001
Fight 33001
Fight 34001
Fight 35001
Fight 36001
Fight 37001
Fight 38001
Fight 39001
Fight 40001
Fight 41001
Fight 42001
Fight 43001
Fight 44001
Fight 45001
Fight 46001
Fight 47001
Fight 48001
Fight 49001
Fight 50001
Fight 51001
Fight 52001
Fight 53001
Fight 54001
Empty fights: 0.42482056540207996



Unnamed: 0,R_Fighter,R_Total_Fights,R_Win_Percentage,R_Method_1_prev,R_Rounds_1_prev,R_Winner_1_prev,R_Prev_Fight_days,B_Fighter,B_Total_Fights,B_Win_Percentage,B_Method_1_prev,B_Rounds_1_prev,B_Winner_1_prev,B_Prev_Fight_days,Win_Method,Winner
31409,Oleg Taktarov,2,0.5,Submission,1,True,98,Anthony Macias,1,0.0,Submission,1,False,210,Submission,1.0
31410,Oleg Taktarov,2,0.5,Submission,1,True,98,Dave Beneteau,3,0.666667,KO/TKO,1,True,98,Submission,1.0
31411,Royce Gracie,8,1.0,Submission,1,True,98,Keith Hackney,1,1.0,KO/TKO,1,True,98,Submission,1.0
31412,Royce Gracie,3,1.0,Submission,1,True,119,Patrick Smith,1,0.0,Submission,1,False,119,KO/TKO,1.0
31413,Royce Gracie,3,1.0,Submission,1,True,119,Jason DeLucia,1,1.0,Submission,1,True,119,Submission,1.0


In [19]:
input_X.to_csv("preprocessed_data.csv", index = False)

In [45]:
#Can below this runs from CSV


input_data = pd.read_csv("preprocessed_data.csv")
input_data = input_data.sample(frac = 1, random_state = 1)

input_data.head()


Unnamed: 0,R_Fighter,R_Total_Fights,R_Win_Percentage,R_Method_1_prev,R_Rounds_1_prev,R_Winner_1_prev,R_Prev_Fight_days,B_Fighter,B_Total_Fights,B_Win_Percentage,B_Method_1_prev,B_Rounds_1_prev,B_Winner_1_prev,B_Prev_Fight_days,Win_Method,Winner
14207,Angelo Popofski,13,0.538462,Submission,1,False,351,Joseph Baize,1,1.0,Submission,1,True,602,Split Decision,1.0
9387,Siyar Bahadurzada,30,0.766667,KO,2,True,252,Curtis Millender,17,0.823529,KO,2,True,351,Split Decision,0.0
2106,Joe Riggs,55,0.690909,TKO,1,True,20,Jerome Jones,22,0.545455,Submission,1,False,168,TKO,1.0
23604,Adriano Capitulino,8,0.875,Submission,3,True,259,Valdir Araujo,13,0.538462,Submission,2,True,209,Split Decision,0.0
22907,Henrique Mello,14,0.857143,Submission,2,True,447,Julio Cesar de Almeida,18,0.777778,Submission,2,True,120,Split Decision,0.0


In [46]:
input_y = input_data[["Win_Method","Winner"]]
# input_data = input_data.drop(columns = ["Win_Method", "Winner", "R_Fighter", "B_Fighter"])

winner_and_method = pd.DataFrame()
winner_and_method["Winner_and_Method"] = input_y["Win_Method"] + input_y["Winner"].astype(str)
input_y = pd.concat([input_y, winner_and_method], axis=1, join='inner')

input_y.head()


Unnamed: 0,Win_Method,Winner,Winner_and_Method
14207,Split Decision,1.0,Split Decision1.0
9387,Split Decision,0.0,Split Decision0.0
2106,TKO,1.0,TKO1.0
23604,Split Decision,0.0,Split Decision0.0
22907,Split Decision,0.0,Split Decision0.0


In [48]:
def data_clean_up(df):
#     df = df.drop(columns = ["Win_Method", "Winner", "R_Fighter", "B_Fighter"])
    df_numeric = df[["R_Total_Fights", "R_Win_Percentage", "R_Rounds_1_prev", "R_Prev_Fight_days", 
                    "B_Total_Fights", "B_Win_Percentage", "B_Rounds_1_prev", "B_Prev_Fight_days"]]
    
    scaler = MinMaxScaler()
    df_numeric_scaled = pd.DataFrame(scaler.fit_transform(df_numeric), columns=df_numeric.columns)

    
    return pd.concat([df_numeric_scaled, df[["R_Method_1_prev", "R_Winner_1_prev", "B_Method_1_prev", "B_Winner_1_prev"]]], axis=1, join='inner')


# input_data = input_data.drop(columns = ["Win_Method", "Winner", "R_Fighter", "B_Fighter"])
input_data = data_clean_up(input_data)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(input_data, input_y, test_size=0.33, random_state=1)

In [53]:
clf = RandomForestClassifier(max_depth=10, random_state=0)
clf.fit(X_train, y_train)




ValueError: could not convert string to float: 'Split Decision'

In [50]:
print("\n\n\n\n\n\n\n\n\n\n\n\n\n")















