In [4]:
import pandas as pd
import datetime as dt
import os

import statsmodels.formula.api as sm
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score, roc_curve, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [5]:
def apply_models(branch='men', lookback_years=3, desired_features=-1, force=False):
    """
    Work out which seasons should feed the models for the current calendar year.

    NOTE(review): this function is still a stub. It only derives the list of
    seasons to extract and returns None; ``branch``, ``desired_features`` and
    ``force`` are accepted but not used yet.

    :param branch: string, default 'men' (currently unused)
    :param lookback_years: integer number of seasons, ending with the current
        one, to include in the extraction window
    :param desired_features: currently unused
    :param force: currently unused
    :raises KeyError: if the current year is not a ``season_end_year`` in
        ../data/season_dates.csv
    returns None
    """
    this_year = dt.datetime.now().year
    # `infer_datetime_format` is deprecated since pandas 2.0 (format inference
    # is now the default), so it is no longer passed.
    season_dates = pd.read_csv('../data/season_dates.csv',
                               index_col='season_end_year',
                               parse_dates=['season_start_date','tournament_start_date','season_end_date'])

    if this_year not in season_dates.index:
        raise KeyError(
            f'Season dates for season ending in {this_year} are not included in ../data/season_dates.csv')

    # Position just past this year's row, so the slice below includes it
    year_ix = season_dates.index.get_loc(this_year) + 1
    # Clamp the start at 0: a negative start would wrap around to the end of
    # the index and yield a wrong (typically empty) window when fewer than
    # `lookback_years` seasons are on file.
    first_ix = max(year_ix - lookback_years, 0)
    # Get list of years to consider in model
    extraction_years = season_dates.index[first_ix:year_ix].tolist()
    
    

In [6]:
# Determine lookback period
branch='women'
lookback_years = 3
this_year = dt.datetime.now().year

# `infer_datetime_format` is deprecated since pandas 2.0 (format inference is
# now the default), so it is no longer passed.
season_dates = pd.read_csv('../data/season_dates.csv',
                           index_col='season_end_year',
                           parse_dates=['season_start_date','tournament_start_date','season_end_date'])

# Validate BEFORE calling get_loc: get_loc raises a bare KeyError for a missing
# year, which would pre-empt this more helpful message.
if this_year not in season_dates.index:
    raise Exception(f'Season dates for season ending in {this_year} are not included in ../data/season_dates.csv')

# Position just past this year's row so the slice below includes it
year_ix = season_dates.index.get_loc(this_year) + 1
# Clamp the start at 0 so a short history cannot produce a negative
# (wrap-around) slice start.
extraction_years = season_dates.index[max(year_ix-lookback_years, 0):year_ix].tolist()

sos_scaled = False

# Columns handed to the models. 'won' is listed first because create_models
# treats the first column as the response variable.
desired_features = ['won','fan_favorite','de','oe','te','pace','physicality_score','luck','srs_norm',
                  'fan_favorite_opp','de_opp','oe_opp','te_opp','pace_opp','physicality_score_opp','luck_opp','srs_norm_opp',
                  'choke_rate','upset_rate','choke_rate_opp','upset_rate_opp','win_streak','win_streak_opp','matchup_win_rate']

In [8]:
def create_models(features,extraction_years,season_dates,this_year,
                  lookback_years,branch,force=False):
    """
    Create and pickle models to be used in final match forecast
    :param features: list of model features to be consider
    :param extraction_years: list of integer years to consider in model
    :param season_dates: list of dates to consider
    :param this_year: integer indicating current year
    :param lookback_years: integer indicating of years the model should lookback
    :param branch: string indicating sex of athletes - should be 'men' or 'women'
    :param force: boolean indicating whether should be forced to extract if it already is cached
    returns None
    """
    tranformed_filepath = f'../data/transformed_data_{branch}{str(this_year)[-2:]}.csv'
    if os.path.exists(tranformed_filepath) & (not force):
        data = pd.read_csv(transformed_filepath)
    else:
        %run ../extract/transform_data.ipynb
        data = transform_data(
            extraction_years,season_dates,this_year,lookback_years,branch,force=force)
        
    # Only use desired columns
    df = data[features]
    df.fillna(0,inplace=True)
    
    # Assign response and prediction variables
    resp_var = df.columns[0]
    pred_vars = df.columns[1:].tolist()

    train, test = train_test_split(df,test_size=0.1)

    logit_phrase = resp_var + ' ~ ' + ' + '.join(pred_vars)

    log_reg = sm.logit(logit_phrase, data=train).fit(method='bfgs')
    
    

In [9]:
def create_model(df, response_var, pred_vars, save_name, eda=False):
    """
    Fit a logistic regression on a random 90% of ``df`` and pickle the result.

    :param df: frame holding the response and predictor columns
    :param response_var: name of the response column
    :param pred_vars: list of predictor column names
    :param save_name: stem of the output file; the fit is saved to
        ``{save_name}.pickle``
    :param eda: when True, also print/save odds ratios and report
        held-out performance on the remaining 10%
    returns the fitted model results
    """
    # "<response> ~ <pred 1> + <pred 2> + ..."
    formula = ' ~ '.join([response_var, ' + '.join(pred_vars)])

    train_rows, holdout_rows = train_test_split(df,test_size=0.1)
    fitted = sm.logit(formula, data=train_rows).fit(method='bfgs')

    if eda:
        print(save_name)
        get_odds_ratios(fitted, save_name)
        get_confusion_matrix(fitted, holdout_rows, response_var)

    fitted.save(f"{save_name}.pickle")
    return fitted

In [10]:
# Gather data for models
%run ../extract/transform_data.ipynb
data = transform_data(extraction_years,season_dates,this_year,lookback_years,branch,force=False)
# data.rename(columns = {'match_win_rate':'matchup_win_rate'},inplace=True)
data.head()

data = data[desired_features]
data.fillna(0,inplace=True)

In [11]:
# Display the modelling frame (desired feature columns only, NaNs filled with 0)
data

Unnamed: 0,won,fan_favorite,de,oe,te,pace,physicality_score,luck,srs_norm,fan_favorite_opp,...,physicality_score_opp,luck_opp,srs_norm_opp,choke_rate,upset_rate,choke_rate_opp,upset_rate_opp,win_streak,win_streak_opp,matchup_win_rate
0,1,0,86.218005,107.948222,1.252038,63.0,59.173545,0.500000,37.482236,1,...,54.533951,0.600000,45.416864,0.0,0.0,0.000000,0.0,0.000000,0.354424,0.000000
1,0,0,86.218005,107.948222,1.252038,63.0,59.173545,0.500000,37.482236,1,...,58.705865,0.875000,78.019896,0.0,0.0,0.107143,0.0,0.315176,3.661989,0.000000
2,0,0,86.218005,107.948222,1.252038,63.0,59.173545,0.500000,37.482236,1,...,43.780332,0.166667,43.581241,0.0,0.0,0.000000,0.0,0.000000,0.801142,0.000000
3,1,0,86.218005,107.948222,1.252038,63.0,59.173545,0.500000,37.482236,1,...,49.070678,0.444444,35.859782,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000
4,1,0,86.218005,107.948222,1.252038,63.0,59.173545,0.500000,37.482236,1,...,49.601359,0.000000,23.342018,0.0,0.0,0.000000,0.0,0.315176,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26169,0,1,87.136929,98.413473,1.129412,68.0,31.236491,0.571429,46.127047,0,...,32.717652,0.333333,42.085704,0.0,0.0,0.000000,0.0,0.559903,0.000000,0.500000
26170,0,1,87.136929,98.413473,1.129412,68.0,31.236491,0.571429,46.127047,0,...,27.811140,0.750000,58.773662,0.0,0.0,0.000000,0.0,0.000000,2.632696,0.333333
26171,0,0,87.136929,98.413473,1.129412,68.0,31.236491,0.571429,46.127047,1,...,27.704786,0.600000,44.408315,0.0,0.0,0.000000,0.0,0.000000,0.378829,0.000000
26172,1,0,87.136929,98.413473,1.129412,68.0,31.236491,0.571429,46.127047,1,...,37.306855,1.000000,34.699803,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.833333


In [26]:
# Every desired feature except the response column 'won'
[feature for feature in desired_features if feature != 'won']

['fan_favorite',
 'de',
 'oe',
 'te',
 'pace',
 'physicality_score',
 'luck',
 'srs_norm',
 'fan_favorite_opp',
 'de_opp',
 'oe_opp',
 'te_opp',
 'pace_opp',
 'physicality_score_opp',
 'luck_opp',
 'srs_norm_opp',
 'choke_rate',
 'upset_rate',
 'choke_rate_opp',
 'upset_rate_opp',
 'win_streak',
 'win_streak_opp',
 'matchup_win_rate']

In [22]:
# Accuracy (second element of the returned tuple) for a fit on all desired features.
# NOTE: select_features is defined in a later cell of this notebook; run that cell first.
select_features(desired_features,data,'won')[1]

         Current function value: 0.389617
         Iterations: 35
         Function evaluations: 41
         Gradient evaluations: 41




0.8206303724928367

In [21]:
def select_features(cols,data,response_var):
    """
    Fit a logistic regression on ``cols`` and score it on a 20% hold-out.

    :param cols: list of column names to use, including ``response_var``
    :param data: frame containing at least the columns in ``cols``
    :param response_var: name of the response column; it may sit anywhere in
        ``cols`` (it is no longer assumed to be the first entry)
    :raises KeyError: if ``response_var`` is not one of ``cols``
    returns tuple ``(cols, accuracy)`` where accuracy is the share of held-out
        rows whose rounded predicted probability matches the response
    """
    if response_var not in cols:
        raise KeyError(f'{response_var} is not one of the supplied columns')

    # Predictors are every requested column except the response, in the
    # order given.
    pred_vars = [col for col in cols if col != response_var]

    # fillna returns a new frame, so the column slice is never mutated in place
    df = data[cols].fillna(0)

    train, test = train_test_split(df,test_size=0.2)

    # Build the formula from response_var itself so the fitted response always
    # matches the column the accuracy is scored against.
    logit_phrase = response_var + ' ~ ' + ' + '.join(pred_vars)

    log_reg = sm.logit(logit_phrase, data=train).fit(method='bfgs')

    # Predicted probabilities -> 0/1 labels
    y_pred = log_reg.predict(test)
    prediction = list(map(round, y_pred))
    accuracy = accuracy_score(test[response_var], prediction)
    return cols,accuracy

In [47]:
def get_odds_ratios(model, save_name):
    """
    Print and save a fitted model's odds ratios and its prediction table.

    :param model: fitted model results exposing ``params``, ``conf_int()``
        and ``pred_table()``
    :param save_name: prefix for the files written under ../eda/
    returns None
    """
    # Coefficients and their interval bounds, exponentiated to the odds scale
    # and ordered from largest to smallest odds ratio.
    bounds = model.conf_int()
    ranked = np.exp(
        pd.DataFrame({"OR": model.params, "Lower CI": bounds[0], "Upper CI": bounds[1]})
    ).sort_values('OR',ascending=False)

    print('\nOdds Ratios:')
    print(ranked)
    ranked.to_csv(f'../eda/{save_name}_odds_ratios.csv')

    # Prediction table as reported by the model itself
    table = model.pred_table()
    print('\nPrediction table:')
    print(table)
    np.savetxt(f'../eda/{save_name}_prediction_table.out',table,
               header=f'{save_name} prediction table')

    return None

In [50]:
def get_confusion_matrix(model, validation, response_var):
    """
    Print hold-out diagnostics for a fitted model.

    :param model: fitted model whose ``predict`` returns one probability per row
    :param validation: held-out frame containing the predictors and the response
    :param response_var: name of the response column in ``validation``
    returns None
    """
    observed = validation[response_var]
    # Round each predicted probability to a 0/1 label
    predicted = [round(probability) for probability in model.predict(validation)]

    # Confusion matrix of observed vs. predicted labels
    cm = confusion_matrix(observed, predicted)
    print ("\nConfusion Matrix : \n", cm)

    # Overall accuracy: a flawed, but helpful metric
    print('\nTest accuracy = ', accuracy_score(observed, predicted))

    # Per-class precision / recall summary
    print('\nClassification Report: \n' , classification_report(observed,predicted))

    return None