In [1]:
import pandas as pd
import pandas.io.sql as sqlio
import psycopg2
import re
import datetime
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL

#config method from config.py file
from config import config


In [2]:
#define Read, write, & query POSTGRESQL functions

def read_df_from_azure(query):
    
    params = config("database.ini")
    engine = create_engine(URL("postgresql", params["user"], params["password"], params["host"], 5432, params["dbname"]))

    # Context manager makes sure the `Connection` is closed safely and implicitly
    with engine.connect() as conn:
        
        df = pd.read_sql_query(query, conn)
        
        # print(conn.in_transaction()) # False
        # do_something_with(conn)
        
        #trans = conn.begin()
        #print(conn.in_transaction()) # True        
        # do_whatever_with(trans)
        
        #print(conn.closed) # False
    #print('Is Connection with-OUT closed?', conn.closed) # True
    engine.dispose()
    return df
    
    

    
    
def write_df_to_azure(df, table_title = "dummy_table"):
   
    params = config("database.ini")
    engine = create_engine(URL("postgresql", params["user"], params["password"], params["host"], 5432, params["dbname"]))

    with engine.connect() as conn:
        
        df.to_sql(table_title, con = engine, if_exists = "replace", method = "multi")

        conn.closed
    engine.dispose()
    
    
def execute_query_on_azure(query):
    params = config("database.ini")
    engine = create_engine(URL("postgresql", params["user"], params["password"], params["host"], 5432, params["dbname"]))

    
    with engine.connect() as connection:
        result = connection.execute(query)

        
    engine.dispose()
    return result


In [3]:
t_fighter_series = read_df_from_azure("SELECT * FROM t_fighter_series;")

print("Fighters Total Size: {}".format(t_fighter_series.shape))
t_fighter_series.head()

Fighters Total Size: (95470, 6)


Unnamed: 0,fighter,win_by,last_round,date,winner_boolean,win_by_categorical
0,Aaron Brink,TKO,1,2019-06-09,True,10
1,Aaron Brink,TKO,1,2019-06-09,True,10
2,Aaron Brink,Submission,1,2018-03-31,True,7
3,Aaron Brink,Submission,1,2018-03-31,True,7
4,Aaron Brink,Submission,1,2018-03-04,True,7


In [4]:
t_fight_series = read_df_from_azure("SELECT * FROM t_fight_series;")

print("Fights Total size: {}".format(t_fight_series.shape))
t_fight_series.head()


Fights Total size: (47735, 8)


Unnamed: 0,r_fighter,b_fighter,win_by,last_round,date,winner,winner_boolean,result
0,Andrei Arlovski,Andrei Arlovski,Submission,1,2000-11-17,Andrei Arlovski,False,1
1,Sam Sicilia,Sam Sicilia,Decision - Unanimous,3,2014-05-24,Sam Sicilia,False,2
2,Shane Nelson,Shane Nelson,KO/TKO,1,2009-03-07,Shane Nelson,False,4
3,Tony Ferguson,Tony Ferguson,TKO - Doctor's Stoppage,1,2011-09-24,Tony Ferguson,False,4
4,Joey Beltran,Joey Beltran,KO/TKO,3,2011-06-11,Joey Beltran,False,4


In [8]:
def retrieve_fights_before_date(fighter, date):
    global t_fighter_series
    global t_fight_series
    
    all_fighter_fights = t_fighter_series.loc[t_fighter_series['fighter'] == fighter]
    all_fighter_fights.sort_values(by = ["date"], ascending = False, inplace = True)
    
#     print(all_fighter_fights)
    
    return(all_fighter_fights[all_fighter_fights["date"] < date])
    


def create_skeleton_df(num_prev_fights = 1):
    
    return(pd.DataFrame(columns = ["R_Fighter", "R_Total_Fights", "R_win_percentage",
                                 "R_Method_1_prev", "R_Rounds_1_prev", "R_Winner_1_prev",
                                 "B_Figher", "B_Total_Fights", "B_win_percentage"
                                 "B_Method_1_prev", "B_Rounds_1_prev", "B_Winner_1_prev"]))
    

    


def prepare_x_input_line(index):
    global t_fighter_series
    global t_fight_series 
    
    printing = True
    
    fight_row = t_fight_series.iloc[index]
    
    
    #get the list of fights for each fighter on the fight row
    b_prev_fights = retrieve_fights_before_date(fight_row["b_fighter"], fight_row["date"]) 
    r_prev_fights = retrieve_fights_before_date(fight_row["r_fighter"], fight_row["date"]) 
    
    

#     print("date: {}".format(fight_row["date"]))
#     print("{} (R) Previous fights: \n{}".format(fight_row["r_fighter"], r_prev_fights))
#     print("{} (B) Previous fights: \n{}".format(fight_row["b_fighter"], b_prev_fights))

#     print("date: {}".format(fight_row["date"]))
#     print("{} (R) Previous fights: \n{}".format(fight_row["r_fighter"], r_prev_fights))
#     print("{} (B) Previous fights: \n{}".format(fight_row["b_fighter"], b_prev_fights))

    #if Empty, return an empty dict. Can improve this to enter values for empty values for win %, prev resutls and other
    if r_prev_fights.empty or b_prev_fights.empty:
        return {}
    
    #reset the indecies to be able to index fights in order
    r_prev_fights.reset_index(drop = True, inplace = True)
    b_prev_fights.reset_index(drop = True, inplace = True)

    #get the total number of fights
    r_total_fights = r_prev_fights.shape[0]
    b_total_fights = b_prev_fights.shape[0]
    
    
    if printing:
        print("Red Fighter:")
        print(r_prev_fights.head())
    
        print()
        print("Blue Fighter:")
        print(b_prev_fights.head())
    

#     print("date: {}".format(fight_row["date"]))
#     print("{} (R) Previous fights: \n{}".format(fight_row["r_fighter"], r_prev_fights))
#     print("{} (B) Previous fights: \n{}".format(fight_row["b_fighter"], b_prev_fights))

    
    #todo here:
    # 0. Think about randomly switching red / blue, or maybe about how we can utilize favorite by adding that data 
    #    from scraper
    # 1. gather each input line using one fight line using the function
    #    retrieve_fights_before_date 
    # 2. put inputs in properly formatted row
    # 3. Make sure every data has an input.  Can for now can drop lines, but see how many
    #    end up being dropped
    # 4. append row to larger X Dataset
    

    return {"R_Fighter" : fight_row["r_fighter"],
            "R_Total_Fights" : r_total_fights,
            "R_win_percentage" : sum(r_prev_fights["winner_boolean"] == True) / r_total_fights,
            "R_Method_1_prev" : r_prev_fights["win_by"][0],
            "R_Rounds_1_prev" : r_prev_fights["last_round"][0],
            "R_Winner_1_prev" : r_prev_fights["winner_boolean"][0],
            "B_Figher" : fight_row["b_fighter"],
            "B_Total_Fights" : b_total_fights,
            "B_win_percentage" : sum(b_prev_fights["winner_boolean"] == True) / b_total_fights,
            "B_Method_1_prev" : b_prev_fights["win_by"][0],
            "B_Rounds_1_prev" : b_prev_fights["last_round"][0],
            "B_Winner_1_prev" : b_prev_fights["winner_boolean"][0]}
        

# prepare_x_input_line(1)
# print(retrieve_fights_before_date("Aaron Riley", dates[7]))


In [12]:
#eventually make num_last fights a variable that we can tune / test like a hyperparameter
input_X = create_skeleton_df()

fight_iterations = 3

empty_fighters = 0
# for fight_number in range(t_fighter_series.shape[0]):
for fight_number in range(fight_iterations):
    x_row = prepare_x_input_line(fight_number)
    print("Current x_row: ".format(x_row))
    
    if not bool(x_row):
        input_X = input_X.append(x_row, ignore_index = True)
    else:
        empty_fighters += 1
          
    print()
    print("----------------")
    print()
    

print("Empty fights %: {}".format((fight_iterations - empty_fighters)/fight_iterations))
print()
input_X.head()

Current x_row: 

----------------

Red Fighter:
       fighter                win_by  last_round        date  winner_boolean  \
0  Sam Sicilia  Decision - Unanimous           3  2013-04-13           False   
1  Sam Sicilia  Decision - Unanimous           3  2013-04-13           False   

   win_by_categorical  
0                   2  
1                   2  

Blue Fighter:
       fighter                win_by  last_round        date  winner_boolean  \
0  Sam Sicilia  Decision - Unanimous           3  2013-04-13           False   
1  Sam Sicilia  Decision - Unanimous           3  2013-04-13           False   

   win_by_categorical  
0                   2  
1                   2  
Current x_row: 

----------------

Red Fighter:
        fighter            win_by  last_round        date  winner_boolean  \
0  Shane Nelson  Decision - Split           3  2008-12-13            True   
1  Shane Nelson  Decision - Split           3  2008-12-13            True   

   win_by_categorical  
0      

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,R_Fighter,R_Total_Fights,R_win_percentage,R_Method_1_prev,R_Rounds_1_prev,R_Winner_1_prev,B_Figher,B_Total_Fights,B_win_percentageB_Method_1_prev,B_Rounds_1_prev,B_Winner_1_prev
0,,,,,,,,,,,


In [53]:
#many of the fighters have not been parsed and just have the one fight listed.  Run a quick check
#to count what percentage are missing
fight_row = t_fight_series.iloc[400]

print(fight_row["date"])
    
    #get the list of fights for each fighter on the fight row
#     b_prev_fights = retrieve_fights_before_date(fight_row["b_fighter"], fight_row["date"]) 
print(retrieve_fights_before_date("Ariane Lipski", fight_row["date"]))



2019-01-19
           fighter                win_by  last_round        date  \
745  Ariane Lipski  Decision - Unanimous           3  2019-01-19   

     winner_boolean  win_by_categorical  
745           False                   2  
Empty DataFrame
Columns: [fighter, win_by, last_round, date, winner_boolean, win_by_categorical]
Index: []


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [30]:
df_test = retrieve_last_X_fights("Aaron Riley")

dates = df_test["date"]

print(dates)


if dates[4] > dates[5]:
    print("Dates 4: {}".format(dates[4]))
    print("Dates 5: {}".format(dates[5]))


NameError: name 'retrieve_last_X_fights' is not defined

In [25]:
for date in dates:
    if t_fighter_series.iloc[4]["date"] < date:
        print(t_fighter_series.iloc[4]["date"])
        break

(t_fighter_series.iloc[4]["date"])

2011-09-24


datetime.date(2011, 9, 24)

In [11]:
t_fighter_series.iloc[123]["date"]

datetime.date(2014, 8, 16)

In [13]:
retrieve_fights_before_date("Sam Hoger", t_fighter_series.iloc[123]["date"])

Unnamed: 0,fighter,win_by,last_round,date,winner_boolean,win_by_categorical
8706,Sam Hoger,Decision - Unanimous,3,2007-02-03,False,2
8707,Sam Hoger,Decision - Split,3,2006-04-06,False,3
8708,Sam Hoger,Submission,2,2005-11-19,True,7
8709,Sam Hoger,Decision - Unanimous,3,2005-08-06,False,2
8710,Sam Hoger,Decision - Unanimous,3,2005-04-09,True,8
86062,Sam Hoger,TKO,1,2010-04-09,True,10
86063,Sam Hoger,Split Decision,3,2007-02-03,False,3
86064,Sam Hoger,Split Decision,3,2005-04-09,True,9


In [26]:
df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))

df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
df.append(df2)

df.append([10, 11], ignore_index = True)

Unnamed: 0,A,B,0
0,1.0,2.0,
1,3.0,4.0,
2,,,10.0
3,,,11.0
