In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go

# Load
df = pd.read_csv('/content/drive/MyDrive/data/mma/masterdataframe.csv')
df['date'] = pd.to_datetime(df['date'])
df.head(3)

Unnamed: 0,date,fight_url,event_url,result,fighter,opponent,division,stance,dob,method,...,recent_avg_clinch_strikes_attempts_per_min,precomp_recent_avg_clinch_strikes_attempts_per_min,avg_ground_strikes_landed_per_min,precomp_avg_ground_strikes_landed_per_min,recent_avg_ground_strikes_landed_per_min,precomp_recent_avg_ground_strikes_landed_per_min,avg_ground_strikes_attempts_per_min,precomp_avg_ground_strikes_attempts_per_min,recent_avg_ground_strikes_attempts_per_min,precomp_recent_avg_ground_strikes_attempts_per_min
0,1994-03-11,http://ufcstats.com/fight-details/4acab67848e7...,http://ufcstats.com/event-details/a6a9ab5a824e...,0,Sean Daugherty,Scott Morris,Open Weight,,1975-12-04,SUB,...,,,0.0,,,,0.0,,,
1,1994-03-11,http://ufcstats.com/fight-details/4acab67848e7...,http://ufcstats.com/event-details/a6a9ab5a824e...,1,Scott Morris,Sean Daugherty,Open Weight,Orthodox,,SUB,...,,,0.0,,,,0.0,,,
2,1994-03-11,http://ufcstats.com/fight-details/4b9ae533ccb3...,http://ufcstats.com/event-details/a6a9ab5a824e...,0,Ray Wizard,Patrick Smith,Open Weight,,,SUB,...,,,0.0,,,,0.0,,,


## Fighting style meta over the years?

In [5]:
# Resample table by yearly mean, and only keep starting at 2000.
# Copy the dataframe
df_yearly = df.copy()

df_yearly['sub_win'] = df_yearly['method'].apply(lambda x: 1 if x == 'SUB' else 0)

# One-hot encode the 'method' column
df_yearly['method'] = df_yearly['method'].apply(lambda x: 'DEC' if x.endswith('DEC') else x)
df_yearly = pd.concat([df_yearly, pd.get_dummies(df_yearly['method'], prefix='method')], axis=1)
df_yearly = df_yearly.drop(columns=['method'])

df_yearly = df_yearly.resample('Y', on='date').mean()

df_yearly = df_yearly.iloc[6:] # only keep starting at 2000.
df_yearly.reset_index(inplace=True)


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [41]:
# Resample by year and count unique 'fight_url'
df_fights_per_year = df.resample('Y', on='date')['fight_url'].nunique()

# Reset index
df_fights_per_year = df_fights_per_year[:-1].reset_index()

# Plotting
fig = go.Figure(data=[
    go.Bar(
        x=df_fights_per_year['date'],
        y=df_fights_per_year['fight_url'],
        hovertemplate = 'Year: %{x|%Y} <br>Fights: %{y}'
    )
])

fig.update_layout(
    title="Number of Fights Per Year",
    xaxis=dict(
        type='date',
        title='Year'
    ),
    yaxis=dict(
        type='linear',
        title='Number of Fights'
    ),
    autosize=False,
    width=900,
    height=500,
    plot_bgcolor='white'
)

fig.show()
pio.write_html(fig, 'fights_per_year.html', full_html=False)

###Overall match similarity:
PCA analysis of matches over time to see if they've changed?



In [None]:
# Grouping and filling NaN values
df_grouped = df.dropna(subset=['date'])
df_grouped = df_grouped[df_grouped['date'].dt.year >= 2000].groupby(df_grouped['date'].dt.year)
start_date = pd.to_datetime("01/01/2000", format='%d/%m/%Y')

df_pca = df_grouped.transform(lambda x: x.fillna(x.median()))

if df_pca.isnull().values.any():
    df_pca = df_pca.fillna(df_pca.median())

df_pca['days_from_start'] = (df_pca['date'] - start_date).dt.days
days_from_start = df_pca['days_from_start']
data = df_pca.select_dtypes(include="number").drop(columns=['days_from_start'])

# Preprocess data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Perform PCA
pca = PCA(n_components=3)
pca_result = pca.fit_transform(data_scaled)

# PCA results to DataFrame
df_pca_results = pd.DataFrame(pca_result, columns=['PC1', 'PC2', 'PC3'])
df_pca_results['days_from_start'] = days_from_start  # add the 'DaysFromStart' column to the dataframe
df_pca_results = df_pca_results.dropna(subset=['days_from_start'])


Dropping invalid columns in DataFrameGroupBy.transform is deprecated. In a future version, a TypeError will be raised. Before calling .transform, select only columns which should be valid for the function.


Dropping invalid columns in DataFrameGroupBy.transform is deprecated. In a future version, a TypeError will be raised. Before calling .transform, select only columns which should be valid for the function.


Dropping invalid columns in DataFrameGroupBy.transform is deprecated. In a future version, a TypeError will be raised. Before calling .transform, select only columns which should be valid for the function.


Mean of empty slice


Mean of empty slice


Mean of empty slice


DataFrame.mean and DataFrame.median with numeric_only=None will include datetime64 and datetime64tz columns in a future version.


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.conca

In [None]:
fig = px.scatter_3d(df_pca_results, x='PC1', y='PC2', z='PC3', color='days_from_start', color_continuous_scale='Viridis', title="3D PCA of MMA Match Data, over Time")
fig.update_layout(scene_camera=dict(eye=dict(x=-0.5, y=-1.25, z=0.5)))
fig.show()
pio.write_html(fig, 'pca_scatter_3d.html', full_html=False)

In [None]:
fig = px.scatter(df_pca_results, x='PC1', y='PC2', color='days_from_start', color_continuous_scale='Viridis', title="2D PCA of MMA Match Data, over Time")
fig.show()
pio.write_html(fig, 'pca_scatter_2d.html', full_html=False)

In [None]:
fig = px.scatter(df_pca_results, x='PC1', y='days_from_start', color='days_from_start', color_continuous_scale='Viridis', title="1D PCA of MMA Match Data, over Time")
fig.show()
pio.write_html(fig, 'pca_scatter_1d.html', full_html=False)

### Standing game:
High volume of strikes. Much lowered accuracy but still leads to total increase of landing strikes.

In [None]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=df_yearly['total_strikes_attempts_per_min'],
    mode='lines+markers',
    name='Attempts',
    line=dict(shape='spline', color='red'),
    hovertemplate = 'Year %{x|%Y}: %{y}'
))

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=df_yearly['total_strikes_landed_per_min']/df_yearly['total_strikes_attempts_per_min']*100,
    mode='lines+markers',
    name='Accuracy',
    line=dict(shape='spline', color='green', width=0.9, dash="dash"),
    hovertemplate = 'Year %{x|%Y}: %{y}',
    yaxis='y2'
))

fig.update_layout(
    yaxis2=dict(
        title='Accuracy (%)',
        overlaying='y',
        side='right'
    ),
    title="Strikes per Minute, over Time",
    xaxis_title="Year",
    yaxis_title="Strikes per Minute",
    autosize=False,
    width=900,
    height=500,
    plot_bgcolor='white'
)

fig.show()

In [None]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=df_yearly['takedowns_attempts_per_min'],
    mode='lines+markers',
    name='Attempts',
    line=dict(shape='spline', color='blue'),
    hovertemplate = 'Year %{x|%Y}: %{y}'
))

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=df_yearly['takedowns_landed_per_min']/df_yearly['takedowns_attempts_per_min']*100,
    mode='lines+markers',
    name='Accuracy',
    line=dict(shape='spline', color='green', width=0.9, dash="dash"),
    hovertemplate = 'Year %{x|%Y}: %{y}',
    yaxis='y2'
))

fig.update_layout(
    yaxis2=dict(
        title='Accuracy (%)',
        overlaying='y',
        side='right'
    ),
    title="Takedowns per Minute, over Time",
    xaxis_title="Year",
    yaxis_title="Takedowns per Minute",
    autosize=False,
    width=900,
    height=500,
    plot_bgcolor='white'
)

fig.show()
pio.write_html(fig, 'tkds_per_min_over_time_scatter.html', full_html=False)

#### Striking nuances

What moves have fighters started to favor?

##### Distance

In [None]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=df_yearly['distance_strikes_attempts'],
    hoverinfo='x+y',
    mode='none',
    stackgroup='one',
    groupnorm='percent',  # sets the normalization for the sum of the stackgroup
    fillcolor="#a7d5ed",
    name='Distance Strikes',
    hovertemplate = 'Year %{x|%Y}: %{y}'
))
fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=df_yearly['clinch_strikes_attempts'],
    hoverinfo='x+y',
    mode='none',
    stackgroup='one',
    fillcolor="#de6e56",
    name='Clinch Strikes',
    hovertemplate = 'Year %{x|%Y}: %{y}'
))


fig.update_layout(
    title="Proportions of Attempted Strikes Distances, over Time",
    showlegend=True,
    xaxis=dict(
        type='date',
        title='Year'
    ),
    yaxis=dict(
        type='linear',
        range=[1, 100],
        ticksuffix='%'
    ),
    autosize=False,
    width=900,
    height=500,
    plot_bgcolor='white'
)

fig.show()
pio.write_html(fig, 'proportions_strike_distance_over_time.html', full_html=False)

In [None]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=(df_yearly['clinch_strikes_landed']/df_yearly['clinch_strikes_attempts']*100),
    mode='lines+markers',
    name='Clinch Strike',
    line=dict(shape='spline', color="#de6e56", dash="dash"),
    hovertemplate = 'Year %{x|%Y}: %{y}'
))

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=(df_yearly['distance_strikes_landed']/df_yearly['distance_strikes_attempts']*100),
    mode='lines+markers',
    name='Distance Strike',
    line=dict(shape='spline', color="#a7d5ed", dash="dash"),
    hovertemplate = 'Year %{x|%Y}: %{y}'
))

fig.update_layout(
    title="Strike Accuracy by Distance, over Time",
    xaxis_title="Year",
    yaxis_title="Accuracy (%)",
    autosize=False,
    width=900,
    height=500,
    plot_bgcolor='white'
)

fig.show()
pio.write_html(fig, 'strike_acc_by_distance_over_time.html', full_html=False)

##### Target

In [None]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=df_yearly['leg_strikes_attempts'],
    hoverinfo='x+y',
    mode='none',
    stackgroup='one',
    fillcolor="#466964",
    groupnorm='percent',  # sets the normalization for the sum of the stackgroup
    name='Leg Strikes',
    hovertemplate = 'Year %{x|%Y}: %{y}'
))
fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=df_yearly['body_strikes_attempts'],
    hoverinfo='x+y',
    mode='none',
    stackgroup='one',
    fillcolor="#76c68f",
    name='Body Strikes',
    hovertemplate = 'Year %{x|%Y}: %{y}'
))

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=df_yearly['head_strikes_attempts'],
    hoverinfo='x+y',
    mode='none',
    stackgroup='one',
    fillcolor="#c9e52f",
    name='Head Strikes',
    hovertemplate = 'Year %{x|%Y}: %{y}'
))

fig.update_layout(
    title="Proportions of Attempted Strikes Targets, over Time",
    showlegend=True,
    xaxis=dict(
        type='date',
        title='Year'
    ),
    yaxis=dict(
        type='linear',
        range=[1, 100],
        ticksuffix='%'
    ),
    autosize=False,
    width=900,
    height=500,
    plot_bgcolor='white'
)

fig.show()
pio.write_html(fig, 'proportions_strike_targets_over_time.html', full_html=False)

In [None]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=(df_yearly['head_strikes_landed']/df_yearly['head_strikes_attempts']*100),
    mode='lines+markers',
    name='Head Strikes',
    line=dict(shape='spline', color="#c9e52f", dash="dash"),
    hovertemplate = 'Year %{x|%Y}: %{y}'
))

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=(df_yearly['body_strikes_landed']/df_yearly['body_strikes_attempts']*100),
    mode='lines+markers',
    name='Body Strikes',
    line=dict(shape='spline', color="#76c68f", dash="dash"),
    hovertemplate = 'Year %{x|%Y}: %{y}'
))

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=(df_yearly['leg_strikes_landed']/df_yearly['leg_strikes_attempts']*100),
    mode='lines+markers',
    name='Leg Strikes',
    line=dict(shape='spline', color="#466964", dash="dash"),
    hovertemplate = 'Year %{x|%Y}: %{y}'
))

fig.update_layout(
    title="Strike Accuracy by Target, over Time",
    xaxis_title="Year",
    yaxis_title="Accuracy (%)",
    autosize=False,
    width=900,
    height=500,
    plot_bgcolor='white'
)

fig.show()
pio.write_html(fig, 'strike_acc_by_target_over_time.html', full_html=False)

##### Significance
Strikes are also classified based on whether they are intended to be significant strikes, as opposed to low commital strikes.

In [None]:
df_yearly['minimal_strikes_attempts'] = df_yearly['total_strikes_attempts'] - df_yearly['sig_strikes_attempts']
df_yearly['minimal_strikes_missed'] = df_yearly['minimal_strikes_attempts'] - (df_yearly['total_strikes_landed'] - df_yearly['sig_strikes_landed'])
df_yearly['minimal_strikes_landed'] = df_yearly['minimal_strikes_attempts'] - df_yearly['minimal_strikes_missed']
df_yearly['minimal_strikes_accuracy'] = df_yearly['minimal_strikes_landed']/ df_yearly['minimal_strikes_attempts']*100

df_yearly['sig_strikes_accuracy'] = df_yearly['sig_strikes_landed']/ df_yearly['sig_strikes_attempts']*100

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=df_yearly['sig_strikes_attempts'],
    hoverinfo='x+y',
    mode='none',
    stackgroup='one',
    fillcolor="#ff6361",
    groupnorm='percent',  # sets the normalization for the sum of the stackgroup
    name='Significant Strikes',
    hovertemplate = 'Year %{x|%Y}: %{y}'
))
fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=df_yearly['minimal_strikes_attempts'],
    hoverinfo='x+y',
    mode='none',
    stackgroup='one',
    fillcolor="#ffa600",
    name='Minimal Strikes',
    hovertemplate = 'Year %{x|%Y}: %{y}'
))

fig.update_layout(
    title="Proportions of Attempted Strikes Significance, over Time",
    showlegend=True,
    xaxis=dict(
        type='date',
        title='Year'
    ),
    yaxis=dict(
        type='linear',
        range=[1, 100],
        ticksuffix='%'
    ),
    autosize=False,
    width=900,
    height=500,
    plot_bgcolor='white'
)

fig.show()
pio.write_html(fig, 'proportions_strike_sig_over_time.html', full_html=False)

In [None]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=(df_yearly["minimal_strikes_accuracy"]),
    mode='lines+markers',
    name='Minimal Strikes',
    line=dict(shape='spline', color="#ffa600", dash="dash"),
    hovertemplate = 'Year %{x|%Y}: %{y}'
))

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=(df_yearly['sig_strikes_accuracy']),
    mode='lines+markers',
    name='Significant Strikes',
    line=dict(shape='spline', color="#ff6361", dash="dash"),
    hovertemplate = 'Year %{x|%Y}: %{y}'
))

fig.update_layout(
    title="Strike Accuracy by Significance, over Time",
    xaxis_title="Year",
    yaxis_title="Accuracy (%)",
    autosize=False,
    width=900,
    height=500,
    plot_bgcolor='white'
)

fig.show()
pio.write_html(fig, 'strike_acc_by_sig_over_time.html', full_html=False)

### Ground game:

In [36]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=df_yearly['ground_strikes_attempts']/df_yearly['control']*60,
    mode='lines+markers',
    name='Attempts',
    line=dict(shape='spline', color='red'),
    hovertemplate = 'Year %{x|%Y}: %{y}'
))

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=df_yearly['ground_strikes_landed']/df_yearly['ground_strikes_attempts']*100,
    mode='lines+markers',
    name='Accuracy',
    line=dict(shape='spline', color='green', width=0.7, dash="dash"),
    hovertemplate = 'Year %{x|%Y}: %{y}',
    yaxis='y2'
))

fig.update_layout(
    yaxis2=dict(
        title='Accuracy (%)',
        overlaying='y',
        side='right'
    ),
    title="Ground Strike Frequency on the Ground, over Time",
    xaxis_title="Year",
    yaxis_title="Ground Strike Frequency (per min)",
    autosize=False,
    width=900,
    height=500,
    plot_bgcolor='white'
)

fig.show()
pio.write_html(fig, 'ground_strikes_per_down_over_time.html', full_html=False)

In [30]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=df_yearly['sub_attempts']/df_yearly['control']*60,
    mode='lines+markers',
    name='Attempt Frequency',
    line=dict(shape='spline', color='blue'),
    hovertemplate = 'Year %{x|%Y}: %{y}'
))

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=df_yearly['sub_win']/(df_yearly['sub_attempts'])*100,
    mode='lines+markers',
    name='Accuracy',
    line=dict(shape='spline', color='green', width=0.7, dash="dash"),
    hovertemplate = 'Year %{x|%Y}: %{y}',
    yaxis='y2'
))

fig.update_layout(
    yaxis2=dict(
        title='Accuracy (%)',
        overlaying='y',
        side='right',
        range=[20, 62]
    ),
    title="Submission Frequency on the Ground, over Time",

    xaxis_title="Year",
    yaxis_title="Submission Attempt Frequency (per min)",
    width=900,
    height=500,
    plot_bgcolor='white'
)

fig.show()
pio.write_html(fig, 'sub_per_down_over_time.html', full_html=False)

In [37]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=df_yearly['control']/(df_yearly['takedowns_landed']+df_yearly['knockdowns']),
    mode='lines+markers',
    name='Time (sec)',
    line=dict(shape='spline', color='darkblue'),
    hovertemplate = 'Year %{x|%Y}: %{y}'
))

fig.update_layout(
    title="Ground Control Duration per Down, over Time",
    xaxis_title="Year",
    yaxis_title="Ground Control Duration per Down (sec/down)",
    autosize=False,
    width=900,
    height=500,
    plot_bgcolor='white'
)

fig.show()
pio.write_html(fig, 'ground_control_dur_per_down_over_time.html', full_html=False)

In [20]:
fig = go.Figure()


fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=df_yearly['ground_strikes_attempts']/df_yearly['control'],
    mode='lines+markers',
    name='Ground Strike Attempts',
    line=dict(shape='spline', color='red'),
    hovertemplate = 'Year %{x|%Y}: %{y}'
))

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=df_yearly['sub_attempts']/df_yearly['control'],
    mode='lines+markers',
    name='Submission Attempts',
    line=dict(shape='spline', color='blue'),
    hovertemplate = 'Year %{x|%Y}: %{y}'
))

fig.update_layout(
    title="Log Frequency of Attacks while on Ground, over Time",
    xaxis_title="Year",
    yaxis_title="Attacks per Ground Control Duration (/sec)",
    autosize=False,
    yaxis_type="log",
    width=900,
    height=500,
    plot_bgcolor='white'
)

fig.show()
pio.write_html(fig, 'log_freq_attacks_while_on_ground_over_time.html', full_html=False)

## How did this affect match characteristics?

Longer fights and more even. More decisions

In [35]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=df_yearly['total_comp_time'],
    mode='lines+markers',
    name='Competition Time (sec)',
    line=dict(shape='spline', color="darkblue"),
    hovertemplate = 'Year %{x|%Y}: %{y}'
))

fig.add_trace(go.Scatter(
    x=df_yearly['date'],
    y=df_yearly['total_strikes_landed_differential'],
    mode='lines+markers',
    name='Strikes Landed Differential',
    line=dict(shape='spline'),
    hovertemplate = 'Year %{x|%Y}: %{y}',
    yaxis='y2'
))


fig.update_layout(
    yaxis2=dict(
        title='Strikes Landed Differential',
        overlaying='y',
        side='right'
    ),
    title="Fight Duration and Strikes Landed Differentials, over Time",
    xaxis_title="Year",
    yaxis_title="Average Total Competition Time (sec)",
    autosize=False,
    width=900,
    height=500,
    plot_bgcolor='white'
)

fig.show()
pio.write_html(fig, 'fight_dur_and_strikes_landed_differentials_over_time.html', full_html=False)

In [None]:
from pandas import get_dummies

fig = go.Figure()
colors = ["#ff6361", "#ffa600", "#bc5090", "#58508d", "#003f5c", "#33cc33", "#003399"]  # 7 colors for 7 methods

method_columns = [col for col in df_yearly.columns if col.startswith('method_')]

for i, method in enumerate(method_columns):
    fig.add_trace(go.Scatter(
        x=df_yearly['date'],
        y=df_yearly[method]*100,
        mode='none',
        stackgroup='one',
        fillcolor=colors[i],
        name=method.replace('method_', ''),  # Remove the 'method_' prefix in the legend
        hoverinfo='name+y',  # Display the method name and y value
        hovertemplate = '%{y:.2f}%',  # Display the y value as a percentage with 2 decimal places

    ))

fig.update_layout(
    title="Victory Methods Over Time",
    showlegend=True,
    xaxis=dict(
        type='date',
        title='Year'
    ),
    yaxis=dict(
        type='linear',
        range=[1, 100],
        ticksuffix='%'
    ),
    autosize=False,
    width=900,
    height=500,
    plot_bgcolor='white'
)

fig.show()
pio.write_html(fig, 'victory_methods_over_time.html', full_html=False)

In [None]:
TIME

## MMA Predictive Model

### Preparing dataset

In [None]:
import pandas as pd

# Features to aggregate over all fighter matches.
numeric_features = ["knockdowns", "sub_attempts", "reversals", "control", "takedowns_landed",
"takedowns_attempts", "sig_strikes_landed", "sig_strikes_attempts", "total_strikes_landed",
"total_strikes_attempts", "head_strikes_landed", "head_strikes_attempts", "body_strikes_landed",
"body_strikes_attempts", "leg_strikes_landed", "leg_strikes_attempts", "distance_strikes_landed",
"distance_strikes_attempts", "clinch_strikes_landed", "clinch_strikes_attempts",
"ground_strikes_landed", "ground_strikes_attempts", "takedowns_accuracy", "sig_strikes_accuracy",
"total_strikes_accuracy", "head_strikes_accuracy", "body_strikes_accuracy", "leg_strikes_accuracy",
"distance_strikes_accuracy", "clinch_strikes_accuracy", "ground_strikes_accuracy", "takedowns_def",
"sig_strikes_def", "total_strikes_def", "head_strikes_def", "body_strikes_def", "leg_strikes_def",
"distance_strikes_def", "clinch_strikes_def", "ground_strikes_def", "reach_differential",
"height_differential", "age_differential", "knockdowns_differential", "sub_attempts_differential",
"reversals_differential", "control_differential", "takedowns_landed_differential",
"takedowns_attempts_differential", "sig_strikes_landed_differential",
"sig_strikes_attempts_differential", "total_strikes_landed_differential",
"total_strikes_attempts_differential", "head_strikes_landed_differential",
"head_strikes_attempts_differential", "body_strikes_landed_differential",
"body_strikes_attempts_differential", "leg_strikes_landed_differential",
"leg_strikes_attempts_differential", "distance_strikes_landed_differential",
"distance_strikes_attempts_differential", "clinch_strikes_landed_differential",
"clinch_strikes_attempts_differential", "ground_strikes_landed_differential",
"ground_strikes_attempts_differential", "takedowns_accuracy_differential",
"sig_strikes_accuracy_differential", "total_strikes_accuracy_differential",
"head_strikes_accuracy_differential", "body_strikes_accuracy_differential",
"leg_strikes_accuracy_differential", "distance_strikes_accuracy_differential",
"clinch_strikes_accuracy_differential", "ground_strikes_accuracy_differential",
"takedowns_def_differential", "sig_strikes_def_differential",
"total_strikes_def_differential", "head_strikes_def_differential",
"body_strikes_def_differential", "leg_strikes_def_differential",
"distance_strikes_def_differential", "clinch_strikes_def_differential",
"ground_strikes_def_differential"]

columns = ['fight_url', 'fighter_url', 'fighter_name', 'fight_count', 'win_ratio', 'reach', 'height', 'age', 'win_streak', 'max_win_streak'] +\
          [f'mean_{feature}' for feature in numeric_features]
fighter_df = pd.DataFrame(columns=columns)
fighter_dict = {}

for idx, row in df.iterrows():
    fighter_url = row['fighter_url']

    if fighter_url not in fighter_dict:
        # Initialize stats for new fighter
        fighter_dict[fighter_url] = {'win_count': 0, 'fight_count': 0, 'win_streak': 0, 'max_win_streak': 0}
        for feature in numeric_features:
            fighter_dict[fighter_url][f'sum_{feature}'] = 0

    # Update sum of numeric features
    for feature in numeric_features:
        fighter_dict[fighter_url][f'sum_{feature}'] += row[feature]

    # Calculate win_ratio and mean of numeric features
    if fighter_dict[fighter_url]['fight_count'] > 0:
        win_ratio = fighter_dict[fighter_url]['win_count'] / fighter_dict[fighter_url]['fight_count']
    else:
        win_ratio = 0
    mean_features = {f'mean_{feature}': fighter_dict[fighter_url][f'sum_{feature}'] / max(fighter_dict[fighter_url]['fight_count'], 1) for feature in numeric_features}

    # Append row to fighter_df
    new_row = pd.Series({
        'date': row['date'],
        'fight_url': row['fight_url'],
        'fighter_url': fighter_url,
        'fighter_name': row['fighter'],
        'fight_count': fighter_dict[fighter_url]['fight_count'],
        'win_ratio': win_ratio,
        'reach': row['reach'],
        'height': row['height'],
        'age': row['age'],
        'win_streak': fighter_dict[fighter_url]['win_streak'],
        'max_win_streak': fighter_dict[fighter_url]['max_win_streak'],
        **mean_features
    })

    fighter_df = pd.concat([fighter_df, new_row.to_frame().T], ignore_index=True)

    # Update fighter dict
    fighter_dict[fighter_url]['fight_count'] += 1
    if row['result'] == 1:
        fighter_dict[fighter_url]['win_count'] += 1
        fighter_dict[fighter_url]['win_streak'] += 1
        fighter_dict[fighter_url]['max_win_streak'] = max(fighter_dict[fighter_url]['max_win_streak'], fighter_dict[fighter_url]['win_streak'])
    else:
        fighter_dict[fighter_url]['win_streak'] = 0

In [None]:
predict_data = []

for _, row in df.iterrows():
    fighter_data = fighter_df[(fighter_df['fighter_url'] == row['fighter_url']) & (fighter_df['fight_url'] == row['fight_url'])]
    opponent_data = fighter_df[(fighter_df['fighter_url'] == row['opponent_url']) & (fighter_df['fight_url'] == row['fight_url'])]

    if not fighter_data.empty and not opponent_data.empty:

      fighter_data = fighter_data.drop(columns=['fighter_url', 'fighter_name'])
      opponent_data = opponent_data.drop(columns=['fighter_url', 'fighter_name'])

     # Calculate age and reach differentials
      fighter_data['age_differential'] = fighter_data['age'] - opponent_data['age'].values[0]
      fighter_data['reach_differential'] = fighter_data['reach'] - opponent_data['reach'].values[0]

      # Append prefix to fighter and opponent column names
      fighter_data.columns = ['fighter_' + col for col in fighter_data.columns]
      opponent_data.columns = ['opponent_' + col for col in opponent_data.columns]

      new_row = pd.concat([pd.Series(row['result'], index=['result']), fighter_data.iloc[0], opponent_data.iloc[0]])

      predict_data.append(new_row)

predict_df = pd.DataFrame(predict_data).reset_index(drop=True)

In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

TARGET_FEATURE = "result"
from sklearn.base import TransformerMixin

class MedianHeightImputer(TransformerMixin):
    def fit(self, X, y=None):
        self.median_height_fighter = X["fighter_height"].median()
        self.median_height_opponent = X["opponent_height"].median()
        return self

    def transform(self, X):
        X["fighter_height"] = X["fighter_height"].fillna(self.median_height_fighter)
        X["opponent_height"] = X["opponent_height"].fillna(self.median_height_opponent)
        return X

class ReachImputer(TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X["fighter_reach"] = X["fighter_reach"].fillna(X["fighter_height"])
        X["opponent_reach"] = X["opponent_reach"].fillna(X["opponent_height"])
        return X

class MedianImputer(TransformerMixin):
    def fit(self, X, y=None):
        self.median_values = X.median()
        return self

    def transform(self, X):
        return X.fillna(self.median_values)

np.random.seed(0)  # Set seed for reproducibility

sorted_urls = predict_df.sort_values('fighter_date')['fighter_fight_url']

split_idx1 = int(len(sorted_urls) * 0.9)  # Data for train
split_idx2 = split_idx1 + int(len(sorted_urls) * 0.05)  # Data for val, rest for test

train_urls = sorted_urls[:split_idx1]
val_urls = sorted_urls[split_idx1:split_idx2]
test_urls = sorted_urls[split_idx2:]

print('Total samples:', len(sorted_urls))
print('Training set size:', len(train_urls))
print('Validation set size:', len(val_urls))
print('Test set size:', len(test_urls))

train_data = predict_df[predict_df['fighter_fight_url'].isin(train_urls)]
val_data = predict_df[predict_df['fighter_fight_url'].isin(val_urls)]
test_data = predict_df[predict_df['fighter_fight_url'].isin(test_urls)]

train_data = train_data.drop(columns=['fighter_fight_url', 'opponent_fight_url', 'fighter_date', 'fighter_date', 'opponent_date'])
val_data = val_data.drop(columns=['fighter_fight_url', 'opponent_fight_url', 'fighter_date', 'opponent_date'])
test_data = test_data.drop(columns=['fighter_fight_url', 'opponent_fight_url', 'fighter_date', 'opponent_date'])

X_train = train_data.drop(TARGET_FEATURE, axis=1)
y_train = train_data[TARGET_FEATURE]

X_val = val_data.drop(TARGET_FEATURE, axis=1)
y_val = val_data[TARGET_FEATURE]

X_test = test_data.drop(TARGET_FEATURE, axis=1)
y_test = test_data[TARGET_FEATURE]

pipeline = Pipeline([
    ('median_height_imputer', MedianHeightImputer()),
    ('reach_imputer', ReachImputer()),
    ('remaining_imputer', MedianImputer()),
    ("scaler", StandardScaler())
])

X_train_transformed = pipeline.fit_transform(X_train)
X_val_transformed = pipeline.transform(X_val)
X_test_transformed = pipeline.transform(X_test)

Total samples: 13322
Training set size: 11989
Validation set size: 666
Test set size: 667


###Train

In [None]:
#hyperparameter tuning
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

param_grid = {
    'max_depth': [4, 6, 8],
    'n_estimators': [100, 200, 300,],
    'learning_rate': [0.01, 0.05, 0.1]
}

params_fixed = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'seed': 42
}

# Train the XGBoost model with GridSearchCV
xgb_model = xgb.XGBClassifier(**params_fixed, early_stopping_rounds=10, tree_method='gpu_hist')
grid_search = GridSearchCV(xgb_model, param_grid, scoring='f1', n_jobs=-1)
grid_search.fit(X_train_transformed, y_train, eval_set=[(X_val_transformed, y_val)], verbose=0)

# Get the best model from GridSearchCV
best_model = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters: ", grid_search.best_params_)

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test_transformed)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

print("Accuracy: %.2f%%, Precision: %.2f%%, Recall: %.2f%%, F1: %.2f%%" % (\
      accuracy*100.0, precision*100.0, recall*100.0, f1*100.0))

best_params_ = params_fixed | grid_search.best_params_


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



Best Hyperparameters:  {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100}
Accuracy: 72.84%, Precision: 73.11%, Recall: 72.24%, F1: 72.67%


In [44]:
# Calculate cumulative sum of fights
df_fights_per_year['cumulative_fights'] = df_fights_per_year['fight_url'].cumsum()

# Total fights
total_fights = df_fights_per_year['fight_url'].sum()

# Calculate the year where first 90% of matches happen
cutoff_year = df_fights_per_year[df_fights_per_year['cumulative_fights'] <= total_fights * 0.9]['date'].max()

# Plotting
fig = go.Figure()

# Add bar chart for number of fights
fig.add_trace(go.Bar(
    x=df_fights_per_year['date'],
    y=df_fights_per_year['fight_url'],
    name='Number of Fights',
    hovertemplate = 'Year: %{x|%Y} <br>Fights: %{y}'
))

# Add line for 90% cutoff
fig.add_trace(go.Scatter(
    x=[cutoff_year, cutoff_year],
    y=[0, df_fights_per_year['fight_url'].max()],
    mode='lines',
    name='90% Cutoff',
    line=dict(
        color='red',
        dash='dash'
    )
))

fig.update_layout(
    title="Training Set Cutoff",
    xaxis=dict(
        type='date',
        title='Year'
    ),
    yaxis=dict(
        type='linear',
        title='Number of Fights'
    ),
    autosize=False,
    width=900,
    height=500,
    plot_bgcolor='white'
)

fig.show()
pio.write_html(fig, 'training_set_cutoff.html', full_html=False)

###Feature selection

In [None]:

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
import plotly.graph_objects as go

fs = SelectKBest(score_func=f_classif, k='all')  # use 'all' to fit on all features
fs.fit(X_train_transformed, y_train)

feature_scores = fs.scores_

feature_scores_df = pd.DataFrame({'Feature': X_train.columns, 'Score': feature_scores})

feature_scores_df = feature_scores_df.sort_values('Score', ascending=False)


fig = go.Figure(data=[
    go.Bar(
        x=feature_scores_df['Feature'],
        y=feature_scores_df['Score'],
        marker_color='rgb(55, 83, 109)'
    )
])

fig.update_layout(
    title="Top Features Based on ANOVA F-Score",
    xaxis=dict(
        title='Feature',
        showticklabels=False,
    ),
    yaxis=dict(
        title='F-Score',
        type='linear'
    ),
    autosize=False,
    width=900,
    height=500,
    plot_bgcolor='white'
)

fig.show()
pio.write_html(fig, 'top_features_ANOVA_f.html', full_html=False)

In [None]:
# use feature importance for feature selection. We'll select model with maximum precision for context of betting.

from numpy import sort

# Fit best model using each importance as a threshold, and keep best precision model.
thresholds = sort(best_model.feature_importances_)
df_thresholds = pd.DataFrame(columns=['n', 'threshold', 'accuracy', 'precision', 'recall', 'f1'])
best_precision_model = best_model
best_precision = precision
best_precision_model_selection = None
for thresh in thresholds:
  # select features using threshold
  selection = SelectFromModel(best_model, threshold=thresh, prefit=True)
  select_X_train = selection.transform(X_train_transformed)
  select_X_val = selection.transform(X_val_transformed)

  # train
  selection_model = xgb.XGBClassifier(**best_params_, early_stopping_rounds=10, tree_method='gpu_hist')
  selection_model.fit(select_X_train, y_train, eval_set=[(select_X_val, y_val)], verbose=0)

  # eval
  select_X_test = selection.transform(X_test_transformed)
  y_pred = selection_model.predict(select_X_test)

  #fill table
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred, average='binary')
  recall = recall_score(y_test, y_pred, average='binary')
  f1 = f1_score(y_test, y_pred, average='binary')

  n = select_X_train.shape[1]
  new_row = pd.Series({"n": int(n),	"threshold": thresh,	"accuracy": accuracy,	"precision": precision,	"recall": recall,	"f1": f1,})
  df_thresholds = pd.concat([df_thresholds, new_row.to_frame().T], ignore_index=True)

  #keep best model (precision)
  if precision>best_precision:
    best_precision_model = selection_model
    best_precision = precision
    best_precision_model_selection = selection

df_thresholds['n'] = df_thresholds['n'].astype(int)

In [None]:
##Selected Model's Feature threshold

row = df_thresholds.loc[df_thresholds['precision'].idxmax()]
selected_n = row['n']
print(row)

feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': best_model.feature_importances_,
}).sort_values('Importance', ascending=False)
max_y = feature_importance_df['Importance'].max()

# Best feature selection threshold?
fig = go.Figure(data=[
    go.Bar(
        x=feature_importance_df['Feature'],
        y=feature_importance_df['Importance'],
        marker_color='rgb(55, 83, 109)',
        name="Feature Importance"
    )
])

line_settings = dict(color="Red", width=1, dash="dot")
layout_settings = dict(
    title="Feature Importances, and Threshold for Precision Model",
    xaxis=dict(showticklabels=False),
    yaxis=dict(title='Importance', type='linear'),
    autosize=False,
    width=900,
    height=500,
    plot_bgcolor='white',
    shapes=[dict(type="line", x0=selected_n, y0=0, x1=selected_n, y1=max_y, line=line_settings)]
)

fig.update_layout(**layout_settings)
fig.add_trace(go.Scatter(x=[selected_n], y=[max_y], mode="lines", name="Precision Model Feature Threshold", line=dict(color="Red", width=1, dash="dot"), showlegend=True))

# Display the plot
fig.show()
pio.write_html(fig, 'feature_importance_and_threshold.html', full_html=False)

n            55.000000
threshold     0.005190
accuracy      0.749254
precision     0.760125
recall        0.728358
f1            0.743902
Name: 127, dtype: float64


In [None]:
# Feature importance for best precision model

feature_importance_df = pd.DataFrame({
    'Feature': best_precision_model_selection.get_feature_names_out(X_train.columns),
    'Importance': best_precision_model.feature_importances_,
}).sort_values('Importance', ascending=False)
max_y = feature_importance_df['Importance'].max()

fig = go.Figure(data=[
    go.Bar(
        x=feature_importance_df['Feature'],
        y=feature_importance_df['Importance'],
        marker_color='rgb(55, 83, 109)',
        name="Feature Importance"
    )
])

line_settings = dict(color="Red", width=1, dash="dot")
layout_settings = dict(
    title="Feature Importances of Precision Model",
    xaxis=dict(showticklabels=False),
    yaxis=dict(title='Importance', type='linear'),
    autosize=False,
    width=900,
    height=500,
    plot_bgcolor='white',
)

fig.update_layout(**layout_settings)

# Display the plot
fig.show()
pio.write_html(fig, 'feature_importance_precision_model.html', full_html=False)

In [None]:
# Naive betting with the model.

def calculate_profit_trace(y_pred_proba, y_pred, y_test):
  # Calculate profits/losses for each match
  profit_trace = [0]
  for i, (prob, pred, true) in enumerate(zip(np.max(y_pred_proba, axis=1), y_pred, y_test)):
      # Only bet if certainty above p_thresh
      p_thresh = 0
      if prob >= p_thresh:
          bet = prob * 100  # Bet amount proportional to the model's certainty
          if pred == true:
              # If prediction is correct, add the bet amount to the last profit
              profit_trace.append(profit_trace[-1] + bet)
          else:
              # If prediction is incorrect, subtract the bet amount from the last profit
              profit_trace.append(profit_trace[-1] - bet)
      else:
          # If the model's certainty is below 0.75, we don't bet and the profit remains the same
          profit_trace.append(profit_trace[-1])
  return profit_trace

#Graph
select_X_test = best_precision_model_selection.transform(X_test_transformed)
y_pred_proba = best_precision_model.predict_proba(select_X_test)
y_pred = best_precision_model.predict(select_X_test)
profit_trace_series_precision = pd.Series(calculate_profit_trace(y_pred_proba, y_pred, y_test))

y_pred_proba = best_model.predict_proba(X_test_transformed)
y_pred = best_model.predict(X_test_transformed)
profit_trace_series = pd.Series(calculate_profit_trace(y_pred_proba, y_pred, y_test))

y_pred_proba_random = np.random.uniform(size=(len(y_test), len(np.unique(y_test))))
y_pred_random = np.random.choice(np.unique(y_test), size=len(y_test))
profit_trace_series_random = pd.Series(calculate_profit_trace(y_pred_proba_random, y_pred_random, y_test))

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=profit_trace_series_precision.index,
    y=profit_trace_series_precision.values,
    mode='lines',
    name='Step 2: Precision Feature Selection'
))

fig.add_trace(go.Scatter(
    x=profit_trace_series.index,
    y=profit_trace_series.values,
    mode='lines',
    name='Step 1: Hyperparameter Tuning'
))

fig.add_trace(go.Scatter(
    x=profit_trace_series_random.index,
    y=profit_trace_series_random.values,
    mode='lines',
    name='Random Model'
))

fig.update_layout(
    title="Improved Earnings Over Eval Bets",
    xaxis_title="Bet Count",
    yaxis_title="Cumulative Earnings",
    autosize=False,
    width=900,
    height=500,
    plot_bgcolor='white'
)

fig.show()
pio.write_html(fig, 'cumul_earnings_eval_bets.html', full_html=False)