In [2]:
from UFCStatsGUI.database.DBInterface import DBInterface
from UFCStatsGUI.database.util import functions as f
import configs as cfg

import hashlib

In [3]:
def _replace_na(df, col, val=None):

    '''
    Replace NA values in a column with a value
        df: dataframe
        col: column name
        val: value to replace NA with
    Returns dataframe
    '''
    import numpy as np
    df[col] = df[col].fillna(val)
    df[col] = df[col].replace(np.nan, val)
    df[col] = df[col].replace('NULL', val)
    df[col] = df[col].replace('---', val)
    return df


def parse_total_time(df, tf_col='TimeFormat'):
    '''
    Parse the total time of a fight
        df: dataframe
        tf_col: column name of the time format
    Returns dataframe
    '''
    import numpy as np
    df['PossibleRounds'] = np.where(df[tf_col] == '5 Rnd (5-5-5-5-5)', 5, 3)
    df['PossibleTime'] = np.where(df[tf_col] == '5 Rnd (5-5-5-5-5)', 25 * 60, 15 * 60)
    df = df.drop([tf_col], axis=1)
    return df


def split_col(df, old_col, sep=' ', new_col=None, suffix=['LND', 'ATMPT'],  drop=True, func=None):
    '''
    Split a column into two columns based on a separator
        df: dataframe
        old_col: column name to split
        sep: separator
        new_col: new column name
        suffix: suffix for new column names
        drop: drop old column
        func: function to apply to new columns
    Returns dataframe
    '''
    split_col = df[old_col].str.split(sep, )
    new_cols = None
    if new_col is None:
        new_col = old_col
    if suffix is None:
        suffix = [i for i in range(len(split_col[0]))]
    new_cols = [f"{new_col}_{s}" for s in suffix]
    for i in range(len(new_cols)):
        df[new_cols[i]] = split_col.str[i]
    for col in df[new_cols]:
        if func is not None:
            df[col] = df[col].apply(func)
    if drop:
        df = df.drop([old_col], axis=1)
    return df


def convert_percent_columns(df):
    '''
    Convert columns with 'pct' in col name to float
        df: dataframe
    Returns dataframe
    '''
    percent_cols = [col for col in df.columns if 'pct' in col.lower()]
    for col in percent_cols:
        try:
            df[col] = df[col].str.replace('%', '').astype(float) / 100
        except AttributeError:
            pass
    return df


def convert_to_secs(time):
    '''
    Convert time of the format 'MM:SS' to seconds
        time: time string
    Returns seconds
    '''
    if time is None:
        return 0
    try:
        time = time.split(':')
        return int(time[0]) * 60 + int(time[1])
    except ValueError:
        return 0

    
def total_fight_time(row):
    '''
    Calculate the total fight time based on the round and time
        row: row of dataframe
    Returns total fight time in seconds
    '''
    if row['Time'] == '5:00' and int(row['Round'])==row['PossibleRounds']:
        return row['PossibleTime']
    elif row['Round'] == '1':
        return convert_to_secs(row['Time'])
    else:
        rnds_done = int(row['Round']) - 1
        return (rnds_done * 5 * 60) + convert_to_secs(row['Time'])



def convert_to_int(df, cols):
    '''
    Convert columns to int
        df: dataframe
        cols: list of columns to convert
    Returns dataframe
    '''
    for col in cols:
        df[col] = df[col].astype(int)
    return df


def replace_na(df):
    '''
    replace NA values in dataframe with 0 for numeric columns and '' for string columns
        df: dataframe
    Returns dataframe
    '''
    dtypes = ['int32', 'float64', 'int64']
    for col in df.columns:
        if df[col].dtype in dtypes:
            df[col] = df[col].fillna(0)
        else:
            df[col] = df[col].fillna('')
    return df


def rename_cols(df, old_cols, new_cols):
    '''
    Rename columns in dataframe
        df: dataframe
        old_cols: list of old column names or regex
        new_cols: list of new column names
    Returns dataframe
    '''
    import re
    df.columns = [re.sub(old_cols, new_cols, col) for col in df.columns]
    return df


def convert_to_float(df, cols):
    '''
    Convert columns to float
        df: dataframe
        cols: list of columns to convert
    Returns dataframe
    '''
    for col in cols:
        df[col] = df[col].astype(float)
    return df


def convert_to_inches(value):
    '''
    Convert height to inches
        value: height string
    Returns height in inches
    '''
    import re
    if value is None:
        return 0
    try:
        value = re.findall(r'\d+', value)
        return int(value[0]) * 12 + int(value[1])
    except ValueError:
        return 0


def prioritize_columns(df, col_list):
    '''
    Move columns to the front of the dataframe
        df: dataframe
        col_list: list of columns to move to the front
    Returns dataframe
    '''
    other_cols = [col for col in df.columns if col not in col_list]
    return df[col_list + other_cols]

def reorder_list(lst, prio_cols, deprio=False):
    '''
    Similar to prioritize_columns, but for lists
        lst: list
        prio_cols: list of columns to move to the front
        deprio: reverse the list
    Returns list
    '''
    new_col_list = [col for col in lst if col in prio_cols] + [col for col in lst if col not in prio_cols]
    print(new_col_list)
    if deprio:
        return new_col_list[::-1]
    return new_col_list


def convert_to_bool(df, cols):
    '''
    Convert columns to boolean type
        df, dataframe
        cols: list of columns to convert
    Returns dataframe
    '''
    for col in cols:
        df[col] = df[col].astype(bool)
    return df

def hashify(df, cols, new_col_name = 'hashed'):
    '''
    Hashify columns in dataframe and create new column with hashed value
        df: dataframe
        cols: list of columns to hash
        new_col_name: name of new column
    Returns dataframe
    '''
    df[new_col_name] = df[cols].apply(lambda x: ''.join(x.dropna().astype(str)), axis=1)
    df[new_col_name] = df[new_col_name].apply(lambda x: hashlib.sha256(str(x).encode('utf-8')).hexdigest())
    return df


def separate_fighters(df):
    '''
    Separate fighters into two separate dataframes, one for each fighter, then merge them back together with the same columns
        df: dataframe
    Returns dataframe
    '''
    import pandas as pd
    f1_cols = [col for col in df.columns if 'f2' not in col.lower()] + ['F2', 'F2_NICKNAME']
    f1_df = df[f1_cols].rename({'F2': 'OPP_NAME', 'F2_NICKNAME':'OPP_NICKNAME'}, axis=1)
    f1_df.columns = [col.replace('F1','F') if 'F1' in col else col for col in f1_df.columns]

    f2_cols = [col for col in df.columns if 'f1' not in col.lower()] + ['F1', 'F1_NICKNAME']
    f2_df = df[f2_cols].rename({'F1': 'OPP_NAME', 'F1_NICKNAME':'OPP_NICKNAME'}, axis=1)
    f2_df.columns = [col.replace('F2','F') if 'F2' in col else col for col in f2_df.columns]

    return pd.concat([f1_df, f2_df], axis=0)

def fbv(df, col, val):
    '''Find by value'''
    return df[df[col] == val]

def rms(x):
    '''Root mean square'''
    import numpy as np
    rms = np.sqrt(np.mean(x**2))
    return rms

def first_last_split(df, col, sep=' '):
    '''
    Split first and last name into two columns
        df: dataframe
        col: column to split
        sep: separator
    Returns dataframe
    '''
    drop_words = ["junior","de","dos","da","jr.","jr","júnior","júnior",None,"del","van","von",]
    df['FIRST_NAME'] = df[col].str.lower().apply(lambda x: [word for word in x.split(sep) if word not in drop_words][0])
    df['LAST_NAME'] = df[col].str.lower().apply(lambda x: [word for word in x.split(sep) if word not in drop_words][-1])
    return df
# drop_words = ["junior","de","dos","da","jr.","jr","júnior","júnior",None,"del","van","von",]
# first_last_split(combatants_df, 'F_NAME')

In [53]:
db = DBInterface(cfg.DB_PATH / 'ufc_raw')
bouts_df = db.Pdf('fights')
db.close()

bouts_df = hashify(bouts_df, ['Fighter1', 'Fighter2', 'EventName'], 'FightHash')

bouts_df = bouts_df.loc[bouts_df['TimeFormat'].isin(['3 Rnd (5-5-5)', '5 Rnd (5-5-5-5-5)'])].reset_index(drop=True)

bouts_df = bouts_df.replace('-{2,}', None, regex=True)
bouts_df = bouts_df.replace(r'"', '', regex=True)

bouts_df['Judge1_Score'] = bouts_df['Judge1_Score'].fillna(0)
bouts_df['Judge2_Score'] = bouts_df['Judge2_Score'].fillna(0)
bouts_df['Judge3_Score'] = bouts_df['Judge3_Score'].fillna(0)

bouts_df = parse_total_time(bouts_df)
bouts_df = split_col(bouts_df, 'Judge1_Score', ' - ', 'Judge1', ['F1Score', 'F2Score'], func=float)
bouts_df = split_col(bouts_df, 'Judge2_Score', ' - ', 'Judge2', ['F1Score', 'F2Score'], func=float)
bouts_df = split_col(bouts_df, 'Judge3_Score', ' - ', 'Judge3', ['F1Score', 'F2Score'], func=float)

of_columns = ['SIG_STR', 'TOTAL_STR', 'TD', 'HEAD', 'BODY', 'LEG', 'DISTANCE', 'CLINCH', 'GROUND']
for col in of_columns:
    f1, f2 = f'Fighter1_{col}', f'Fighter2_{col}'
    bouts_df = split_col(bouts_df, f1, ' of ', f'F1_{col}', func=int)
    bouts_df = split_col(bouts_df, f2, ' of ', f'F2_{col}', func=int)

bouts_df = convert_percent_columns(bouts_df)

bouts_df['Fighter1_CTRL'] = bouts_df['Fighter1_CTRL'].apply(convert_to_secs)
bouts_df['Fighter2_CTRL'] = bouts_df['Fighter2_CTRL'].apply(convert_to_secs)

bouts_df['Duration'] = bouts_df.apply(total_fight_time, axis=1)

bouts_df = convert_to_int(bouts_df, ['Fighter1_KD', 'Fighter2_KD', 'Fighter1_SUB_ATT', 'Fighter2_SUB_ATT', 'Fighter1_REV', 'Fighter2_REV'])

bouts_df = replace_na(bouts_df)

bouts_df.columns = [col.upper() for col in bouts_df.columns]
bouts_df = rename_cols(bouts_df, 'FIGHTER', 'F')

bouts_df = separate_fighters(bouts_df).reset_index(drop=True)

prio_cols = ['F', 'OPP_NAME']
bouts_df = prioritize_columns(bouts_df, prio_cols)

bouts_df = convert_to_bool(bouts_df, ['PERF_BONUS', 'FIGHT_BONUS', 'SUB_BONUS', 'KO_BONUS'])



bouts_df['WINNER'] = bouts_df['WINNER'] == bouts_df['F']
bouts_df['WINNER'] 

bouts_df = prioritize_columns(bouts_df, ['F', 'WINNER', 'OPP_NAME', 'WEIGHT_CLASS', 'METHOD', 'ROUND', 'DURATION', ])

bouts_df['SUBMISSION'] = (bouts_df['METHOD'] == 'Submission').astype(int)

bouts_df

UFCStatsGUI.database.DBInterface


Unnamed: 0,F,WINNER,OPP_NAME,WEIGHT_CLASS,METHOD,ROUND,DURATION,EVENTNAME,TIME,REFEREE,...,F_LEG_LND,F_LEG_ATMPT,F_DISTANCE_LND,F_DISTANCE_ATMPT,F_CLINCH_LND,F_CLINCH_ATMPT,F_GROUND_LND,F_GROUND_ATMPT,OPP_NICKNAME,SUBMISSION
0,Sergey Morozov,True,Journey Newson,Bantamweight Bout,Decision - Unanimous,3,900,UFC Fight Night: Cannonier vs. Strickland,5:00,Chris Tognoni,...,15,17,16,88,9,11,5,6,,0
1,David Dvorak,False,Manel Kape,Flyweight Bout,Decision - Unanimous,3,900,UFC Fight Night: Cannonier vs. Strickland,5:00,Keith Peterson,...,10,14,31,91,1,2,1,1,StarBoy,0
2,Said Nurmagomedov,True,Saidyokub Kakhramonov,Bantamweight Bout,Submission,2,530,UFC Fight Night: Cannonier vs. Strickland,3:50,Keith Peterson,...,1,1,3,12,4,6,0,0,,1
3,Rafa Garcia,True,Maheshate,Lightweight Bout,Decision - Unanimous,3,900,UFC Fight Night: Cannonier vs. Strickland,5:00,Mark Smith,...,9,11,28,64,11,13,10,21,,0
4,Bryan Battle,False,Rinat Fakhretdinov,Welterweight Bout,Decision - Unanimous,3,900,UFC Fight Night: Cannonier vs. Strickland,5:00,Herb Dean,...,1,1,3,16,0,0,0,0,Gladiator,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13387,Tom Breese,True,Daniel Kelly,Middleweight Bout,KO/TKO,1,213,UFC Fight Night: Thompson vs. Till,3:33,Daniel Movahedi,...,0,0,22,42,0,0,1,2,,0
13388,Makwan Amirkhani,True,Jason Knight,Featherweight Bout,Decision - Split,3,900,UFC Fight Night: Thompson vs. Till,5:00,Marc Goddard,...,1,1,18,48,0,0,9,14,The Kid,0
13389,Mads Burnell,False,Arnold Allen,Featherweight Bout,Submission,3,761,UFC Fight Night: Thompson vs. Till,2:41,Neil Hall,...,4,6,16,48,1,1,3,7,Almighty,1
13390,Darren Till,True,Stephen Thompson,Welterweight Bout,Decision - Unanimous,5,1500,UFC Fight Night: Thompson vs. Till,5:00,Marc Goddard,...,15,27,34,121,4,5,0,0,Wonderboy,0


In [62]:
import re

db = DBInterface(cfg.DB_PATH / 'ufc_raw')
combatants_df = db.Pdf("fighters")
db.close()

combatants_df = combatants_df.replace("-{2,}", None, regex=True)

combatants_df = first_last_split(combatants_df, "Fighter_Name")


parsed_record = (
    combatants_df["Record"].str.findall(r"(\d+)-(\d+)-(\d+)").apply(lambda x: x[0])
)
combatants_df["WINS"] = parsed_record.apply(lambda x: x[0]).astype(int)
combatants_df["LOSSES"] = parsed_record.apply(lambda x: x[1]).astype(int)
combatants_df["DRAWS"] = parsed_record.apply(lambda x: x[2]).astype(int)
combatants_df.drop("Record", axis=1, inplace=True)

combatants_df["HEIGHT"] = combatants_df["Height"].apply(convert_to_inches)
combatants_df.drop("Height", axis=1, inplace=True)
combatants_df
combatants_df["Listed_Weight"] = (
    combatants_df["Listed_Weight"].str.extract(r"(\d+)").astype(float)
)
combatants_df["Reach"] = combatants_df["Reach"].str.extract(r"(\d+)").astype(float)

percent_cols = ["SSAcc", "SSDef", "TDAcc", "TDDef"]
for col in percent_cols:
    combatants_df[col] = combatants_df[col].str.extract(r"(\d+)").astype(float) / 100

combatants_df = rename_cols(combatants_df, 'Fighter', 'F')

float_cols = ["SSLpM", "SSApM", "TDavg", "SubAvg"]
combatants_df = convert_to_float(combatants_df, float_cols)

combatants_df.columns = [col.upper() for col in combatants_df.columns]

combatants_df = replace_na(combatants_df)
# combatants_df.replace('', None, inplace=True)


combatants_df = prioritize_columns(
    combatants_df,
    [
        "F_NAME",
        "NICKNAME",
        "DOB",
        "HEIGHT",
        "WINS",
        "LOSSES",
        "DRAWS",
        "STANCE",
        "LISTED_WEIGHT",
        "REACH",
    ],
)
combatants_df.replace(['', ' '], None, inplace=True)
combatants_df.isna().sum(axis=1).value_counts()
test = combatants_df[combatants_df[['SSLPM', 'SSACC', 'SSAPM', 'SSDEF', 'TDAVG', 'TDACC', 'TDDEF', 'SUBAVG']].sum(axis=1) == 0]

cols = ['F_SIG_STR_LND', 'F_SIG_STR_ATMPT', 'F_TOTAL_STR_LND',
        'F_TOTAL_STR_ATMPT', 'F_TD_LND', 'F_TD_ATMPT', 
        'F_HEAD_LND', 'F_HEAD_ATMPT', 'F_BODY_LND',
        'F_BODY_ATMPT', 'F_LEG_LND', 'F_LEG_ATMPT', 
        'F_DISTANCE_LND', 'F_DISTANCE_ATMPT', 'F_CLINCH_LND',
        'F_CLINCH_ATMPT', 'F_GROUND_LND', 'F_GROUND_ATMPT']

# bouts_df[bouts_df.F.isin(test.F_NAME)].sort_values('F')
l = bouts_df.groupby('F')[cols].sum()
l['total'] = l.sum(axis=1)

UFCStatsGUI.database.DBInterface


In [52]:
import numpy as np
import pandas as pd

db = DBInterface(cfg.DB_PATH / 'ufc_raw')
ufc_fighters = db.Pdf("officialufcfighters")
db.close()

ufc_fighters = ufc_fighters.replace("-{2,}", None, regex=True)
ufc_fighters = ufc_fighters.replace(r'"', '', regex=True)
ufc_fighters['Standing_Strikes'] = ufc_fighters['Standing_Strikes'].str.findall(r'(\d+)(?!\d* ?%)').apply(lambda x: int(x[0]) if x is not None else None)
ufc_fighters['Clinch_Strikes'] = ufc_fighters['Clinch_Strikes'].str.findall(r'(\d+)(?!\d* ?%)').apply(lambda x: int(x[0]) if x is not None else None)
ufc_fighters['Ground_Strikes'] = ufc_fighters['Ground_Strikes'].str.findall(r'(\d+)(?!\d* ?%)').apply(lambda x: int(x[0]) if x is not None else None)
ufc_fighters['Gender'] = np.where(ufc_fighters['Division'].str.lower().str.contains('women'), 'F', 'M')
ufc_fighters['Division_Rank'] = ufc_fighters['Division'].str.extract('(\d+)')
ufc_fighters['Division'] = ufc_fighters['Division'].str.findall(r"((?:Women's )?\w+)(?= Division)").apply(lambda x: x[0] if x is not None else None)
ufc_fighters['PFP_Rank'] = ufc_fighters['PFP_Rank'].str.extract('(\d+)')
ufc_fighters.columns = ufc_fighters.columns.str.upper()
ufc_fighters = replace_na(ufc_fighters)
ufc_fighters = first_last_split(ufc_fighters, 'FIGHTER_NAME')
w_l_d = ufc_fighters['RECORD'].str.extract(r'(\d+)-(\d+)-(\d+)')
w_l_d.columns = ['WINS', 'LOSSES', 'DRAWS']
ufc_fighters = pd.concat([ufc_fighters, w_l_d], axis=1)
ufc_fighters
ufc_fighters[ufc_fighters.duplicated(['FIGHTER_NAME', 'NICKNAME'], False)].sort_values('FIGHTER_NAME');

UFCStatsGUI.database.DBInterface


In [155]:
import pandas as pd
x = bouts_df[['F', 'F_NICKNAME']].sort_values('F').drop_duplicates()
y = combatants_df[['F_NAME', 'NICKNAME']].sort_values('F_NAME').drop_duplicates()
z = ufc_fighters[['FIGHTER_NAME', 'NICKNAME']].sort_values('FIGHTER_NAME').drop_duplicates()
max_df = max([x, y, z], key=lambda x: x.shape[0])
max_df;

In [156]:
bouts_df = hashify(bouts_df, ['F', 'F_NICKNAME'], 'FIGHTER_ID')
bouts_df = hashify(bouts_df, ['OPP_NAME', 'OPP_NICKNAME'], 'OPP_ID')
combatants_df = hashify(combatants_df, ['F_NAME', 'NICKNAME'], 'FIGHTER_ID')
ufcfighters = hashify(ufc_fighters, ['FIGHTER_NAME', 'NICKNAME'], 'FIGHTER_ID')

# bouts_df = hashify(bouts_df, ['F'], 'FIGHTER_ID')
# bouts_df = hashify(bouts_df, ['OPP_NAME'], 'OPP_ID')
# combatants_df = hashify(combatants_df, ['F_NAME'], 'FIGHTER_ID')
# ufc_fighters = hashify(ufc_fighters, ['FIGHTER_NAME'], 'FIGHTER_ID')

# ufc_fighters.drop(['NICKNAME'], axis=1, inplace=True)

ufc_combatants = replace_na(ufc_fighters.merge(combatants_df, how='outer', on='FIGHTER_ID', suffixes=['L','R'])).replace('', None)
ufc_combatants_check = replace_na(ufc_fighters.merge(combatants_df, how='outer', left_on=['FIGHTER_NAME', 'NICKNAME'], right_on=['F_NAME', 'NICKNAME'], suffixes=['L','R'])).replace('', None)

# ufc_combatants = prioritize_columns(combatants_df, list(combatants_df.select_dtypes('object').columns))

In [157]:
ufc_combatants['FIGHTER_ID'].value_counts().sort_values(ascending=False)
# cols = combatants_df.columns.drop(['FIGHTER_ID'])
# ufc_combatants_check[ufc_combatants_check['FIGHTER_NAME'].isnull()][cols]

7f459acc7f6a0d33b67e1ec82fff517e829605d9545761c03b3e5a2810501734    2
1e4872b949b81108e5b5173422b8b073cd7d7d061d9aef6780dec86e3a5f2bcd    2
0500c2bb664835b4cd9b7c27e9d1742e2aec1500d5215622580a1be4da4597fe    1
18457b3667e74a4316944207bbceee49f20c7491752380f0bbce76cac4c8c91d    1
bd423cc9f06d41495df04c890bbf658e30f373f1080b7c77af127034f468f794    1
                                                                   ..
46add4552f5446e208896bf35ae472525c88f59edae5d6e62144f4e3af51c0dc    1
32e3a06185dc4470d48b82f543eaa9280198a9c8650669e4f1898d854905eee8    1
c92c955bb9af7af916667c9f2dd4591d0bbc6241d48d9165384ad4989b78f109    1
7cc1ba0ebcef769370c2dcd363bae64d9dc5079b9a833d2c6eec746dea2471ac    1
b03e3c304b39d86a5c11cc662532a568bf91743bca2960300909da6132d9829e    1
Name: FIGHTER_ID, Length: 4612, dtype: int64

In [158]:
_ = bouts_df.groupby('FIGHTER_ID')[['SUBMISSION', 'F_SUB_ATT']].sum()

_.loc[(_['SUBMISSION'] == 0), 'SUBMISSION'] = .1
_.loc[_['F_SUB_ATT'] < _['SUBMISSION'], 'F_SUB_ATT'] = _['SUBMISSION']
_

_['SUBMISSION_ACC'] = (_['SUBMISSION'] / _['F_SUB_ATT']).fillna(0).replace(0, .0001)

_
_.drop(['SUBMISSION', 'F_SUB_ATT'], axis=1, inplace=True)
map_dict = _.to_dict('series')['SUBMISSION_ACC']
map_dict


FIGHTER_ID
000233c69e6446e10b70c4b3f566e744deca19a6481da0e9942a1340cb5718b1    1.000000
0020b82b95202864a9e6599833657c80d50a9b6d03f92c5671af4dc255e48f72    0.016667
002d925580faf2ace1ef04803b3ea66c8ee74571af4a1ae56c80766590a0ee0e    1.000000
0032e42f1794805e8994a08c3ed5788ce38204f03119a6b39ec6de435cf4d7da    0.238095
00561cd6a1ea5db17556fe9289728aa7eea1b7bc8bed584fb1328eac47af538f    1.000000
                                                                      ...   
ff7eb443c28b589e823493b625bc3f062b7f49e7c84d1c52e6624dd8acab451e    1.000000
ff9da183c35c151d39b46a0de3d69351665d002830f939dc3db0d9bcea5f7aac    0.050000
ffbb5af26932742f86577bc0449bd2f4a87576fb6244110d530d7f5013f90d99    0.500000
ffbd5f271276db6ce7fd7863046e72f0735f0edad655f22e1c9aef4d44c3272d    1.000000
ffcc65cc9c9b0ef8c76f03aeaa1f657865127600a9044b13ded7e623dc1d70b5    1.000000
Name: SUBMISSION_ACC, Length: 2161, dtype: float64

In [159]:
import numpy as np
opp_sub_defense = bouts_df.loc[:,['OPP_ID', 'SUBMISSION', 'F_SUB_ATT', 'WINNER']]
opp_sub_defense['OPP_SUB_DEFENSE'] = (1 - opp_sub_defense['SUBMISSION'] / opp_sub_defense['F_SUB_ATT'])
opp_sub_defense.loc[opp_sub_defense['OPP_SUB_DEFENSE'] == -np.inf, 'OPP_SUB_DEFENSE'] = 0
opp_sub_defense = opp_sub_defense.dropna().groupby('OPP_ID', as_index=True)['OPP_SUB_DEFENSE'].mean()
ufc_combatants['SUBMISSION_DEFENSE'] = ufc_combatants['FIGHTER_ID'].map(opp_sub_defense)

In [160]:
bouts_df['CTRL_RATIO'] = bouts_df['F_CTRL'] / bouts_df['DURATION']
map_dict = bouts_df.groupby('FIGHTER_ID')['CTRL_RATIO'].mean()
ufc_combatants['CTRL_RATIO'] = ufc_combatants['FIGHTER_ID'].map(map_dict).fillna(0)

In [161]:
# db = DBInterface(cfg.DB_PATH / 'ufc_raw')
# df = db.Pdf('fighters')
# db.close()
# df['Fighter_Name']

# dff = df['Fighter_Name'].str.split(' ', expand=True)

# dff[dff[4].notnull()]

In [162]:
# db = DBInterface(cfg.DB_PATH / 'ufc_silver')
# df = db.Pdf('combatants')
# db.close()

# df = df.loc[~df['DIVISION_RANK'].isin([None])]
# df.sort_values(['GENDER', 'DIVISION', 'DIVISION_RANK'])[['FIGHTER_NAME', 'DIVISION', 'DIVISION_RANK', 'GENDER']]

In [163]:
drop_words = [
        "junior",
        "de",
        "dos",
        "da",
        "jr.",
        "jr",
        "júnior",
        "júnior",
        None,
        "del",
        "van",
        "von",
]

cleaned = ufc_combatants.loc[:,'F_NAME'].str.split().apply(lambda x: [x for x in x if x.lower() not in drop_words])

last_names = cleaned.apply(lambda x: x[-1]).str.lower()

ufc_combatants.loc[:,'LAST_NAME'] = last_names
ufc_combatants
# cleaned = (df["F_NAME"].str.split().apply(lambda x: [x for x in x if x.lower() not in drop_words]))
# last_names = cleaned.apply(lambda x: x[-1]).str.lower()

TypeError: 'NoneType' object is not iterable

In [None]:

db = DBInterface(cfg.DB_PATH / 'ufc_silver')
db.to_table(ufc_combatants, 'combatants')
db.to_table(bouts_df, 'bouts_df')
db.close()