In [720]:
import pandas as pd
import numpy as np

In [721]:
# Load the raw game log. `X_data` is a copy so that the in-place cleaning done
# in later cells cannot silently modify `init_data` as well.
init_data = pd.read_csv('test.csv')
X_data = init_data.copy()
# Target: True when the home team outscored the visitor, stored as an
# (n_games, 1) boolean column vector.
y_data = (X_data['PTS Home'] > X_data['PTS Visitor']).to_numpy().reshape(-1, 1)

In [722]:
# Print the column summary (dtypes, non-null counts), then show the first rows.
X_data.info()
X_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1230 entries, 0 to 1229
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Date         1230 non-null   object
 1   Start(ET)    1230 non-null   object
 2   Visitor      1230 non-null   object
 3   PTS Visitor  1230 non-null   int64 
 4   Home         1230 non-null   object
 5   PTS Home     1230 non-null   int64 
 6   OT           76 non-null     object
 7   Attending    1230 non-null   object
dtypes: int64(2), object(6)
memory usage: 77.0+ KB


Unnamed: 0,Date,Start(ET),Visitor,PTS Visitor,Home,PTS Home,OT,Attending
0,"Tue, Oct 28, 2014",8:00p,Orlando Magic,84,New Orleans Pelicans,101,,17097
1,"Tue, Oct 28, 2014",8:00p,Dallas Mavericks,100,San Antonio Spurs,101,,19615
2,"Tue, Oct 28, 2014",10:30p,Houston Rockets,108,Los Angeles Lakers,90,,18997
3,"Wed, Oct 29, 2014",7:00p,Milwaukee Bucks,106,Charlotte Hornets,108,OT,19439
4,"Wed, Oct 29, 2014",7:00p,Philadelphia 76ers,91,Indiana Pacers,103,,18165


## Cleaning Data

In [723]:
# Remove the columns that are not used as features. The scores are dropped
# because the target (`y_data`) was derived from them.
X_data = X_data.drop(columns=['Date', 'Start(ET)', 'PTS Home', 'PTS Visitor'])

# Overtime flag: 1 when the OT column holds any marker ('OT', '2OT', '3OT', ...),
# 0 when it is missing. Assigning the result back (instead of calling
# fillna/replace with inplace=True on a column selection) keeps this working
# under pandas copy-on-write, and does not depend on a fixed list of markers.
X_data['OT'] = (X_data['OT'].notna() & X_data['OT'].ne(0)).astype(int)

# Attendance is read as text with thousands separators ("17,097"); strip the
# commas and convert to integers.
X_data['Attending'] = (
    X_data['Attending'].astype(str).str.replace(',', '', regex=False).astype(int)
)

## Label Encode Teams

In [724]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Fit ONE encoder on the union of both team columns so that a given team gets
# the same integer code whether it appears as visitor or as home team. Fitting
# each column separately only agrees when both columns contain exactly the same
# set of teams.
labelencoder = LabelEncoder()
labelencoder.fit(pd.concat([X_data['Visitor'], X_data['Home']]).values)
X_data['Visitor'] = labelencoder.transform(X_data['Visitor'].values)
X_data['Home'] = labelencoder.transform(X_data['Home'].values)
# (n_games, 2) integer array: column 0 = visitor code, column 1 = home code.
X_teams = X_data[['Visitor', 'Home']].to_numpy()


## Add Features

### Won Last Game?

In [725]:
# if won last game
def set_win_last(X_data, y_data):
    """Flag, for every game, whether each side won its previous game.

    Games are processed in row order. The flag for a game is recorded before
    that game's own result is stored, so it only reflects earlier rows.

    Parameters
    ----------
    X_data : pandas.DataFrame
        Must contain the columns 'Home' and 'Visitor' holding hashable team
        identifiers. The index is not used, so any index is accepted.
    y_data : array-like
        One truthy/falsy entry per row of ``X_data`` (any shape that flattens
        to that length); truthy means the home team won that game.

    Returns
    -------
    (list, list)
        ``home_last_won`` and ``visitor_last_won``, one boolean per game.
        A team with no earlier game gets ``False``.

    Raises
    ------
    IndexError
        If ``y_data`` has fewer entries than ``X_data`` has rows.
    """
    # Flatten once up front; flattening inside the loop copies the whole
    # array for every game.
    outcomes = np.asarray(y_data).ravel()

    last_won = {}
    home_last_won = []
    visitor_last_won = []

    # Walk the rows by position so a non-default DataFrame index cannot
    # misalign games and outcomes.
    for position, (home, visitor) in enumerate(zip(X_data['Home'], X_data['Visitor'])):
        home_won = bool(outcomes[position])

        home_last_won.append(last_won.get(home, False))
        last_won[home] = home_won

        visitor_last_won.append(last_won.get(visitor, False))
        last_won[visitor] = not home_won

    return home_last_won, visitor_last_won

In [726]:
# Attach the "won previous game" flags for the home and the visiting team.
(
    X_data['Home Prev'],
    X_data['Visitor Prev'],
) = set_win_last(X_data, y_data)

### Win Streak

In [727]:
def set_win_streak(X_data, y_data):
    """Compute each side's current winning streak going into every game.

    Games are processed in row order. The streak recorded for a game is the
    value before that game's result is applied. The winner's streak is then
    incremented and the loser's streak reset to zero.

    Parameters
    ----------
    X_data : pandas.DataFrame
        Must contain the columns 'Home' and 'Visitor' holding hashable team
        identifiers (any values, not limited to 0..29). The index is not used.
    y_data : array-like
        One truthy/falsy entry per row of ``X_data`` (any shape that flattens
        to that length); truthy means the home team won that game.

    Returns
    -------
    (list, list)
        ``home_win_streaks`` and ``visitor_win_streaks``, one int per game.

    Raises
    ------
    IndexError
        If ``y_data`` has fewer entries than ``X_data`` has rows.
    """
    # Flatten once instead of once per row.
    outcomes = np.asarray(y_data).ravel()

    # Unknown teams implicitly start at 0, so no fixed team count is assumed.
    win_streaks = {}
    home_win_streaks = []
    visitor_win_streaks = []

    for position, (home, visitor) in enumerate(zip(X_data['Home'], X_data['Visitor'])):
        home_win_streaks.append(win_streaks.get(home, 0))
        visitor_win_streaks.append(win_streaks.get(visitor, 0))

        if outcomes[position]:
            winner, loser = home, visitor
        else:
            winner, loser = visitor, home
        win_streaks[winner] = win_streaks.get(winner, 0) + 1
        win_streaks[loser] = 0

    return home_win_streaks, visitor_win_streaks

In [728]:
# Attach the pre-game winning streak of the home and the visiting team.
(
    X_data['Home Win Streak'],
    X_data['Visitor Win Streak'],
) = set_win_streak(X_data, y_data)

### Win Ratio (wins / games played so far)

In [729]:
def set_tot_wins(X_data, y_data):
    """Compute each side's win ratio (wins / games played) before every game.

    Games are processed in row order. The ratio recorded for a game only
    counts earlier rows; the game's own result is added afterwards.

    Parameters
    ----------
    X_data : pandas.DataFrame
        Must contain the columns 'Home' and 'Visitor' holding hashable team
        identifiers (any values, not limited to 0..29). The index is not used.
    y_data : array-like
        One truthy/falsy entry per row of ``X_data`` (any shape that flattens
        to that length); truthy means the home team won that game.

    Returns
    -------
    (list, list)
        ``home_ratios`` and ``visitor_ratios``, one number per game. A team
        that has not played yet gets 0.

    Raises
    ------
    IndexError
        If ``y_data`` has fewer entries than ``X_data`` has rows.
    """
    # Flatten once instead of once per row.
    outcomes = np.asarray(y_data).ravel()

    # Missing keys count as zero, so no fixed team count is assumed.
    tot_wins = {}
    tot_games = {}
    home_ratios = []
    visitor_ratios = []

    def _ratio(team):
        # Win ratio so far; 0 when the team has no recorded game yet.
        played = tot_games.get(team, 0)
        return tot_wins.get(team, 0) / played if played > 0 else 0.0

    for position, (home, visitor) in enumerate(zip(X_data['Home'], X_data['Visitor'])):
        home_ratios.append(_ratio(home))
        visitor_ratios.append(_ratio(visitor))

        winner = home if outcomes[position] else visitor
        tot_wins[winner] = tot_wins.get(winner, 0) + 1

        tot_games[home] = tot_games.get(home, 0) + 1
        tot_games[visitor] = tot_games.get(visitor, 0) + 1

    return home_ratios, visitor_ratios

In [730]:
# Attach the pre-game win ratio of the home and the visiting team.
(
    X_data['Home Wins'],
    X_data['Visitor Wins'],
) = set_tot_wins(X_data, y_data)

## More Cleaning

In [731]:
# Attendance is not used as a model feature; remove it from the frame.
X_data.drop(columns=['Attending'], inplace=True)

In [732]:
# Show the feature frame: encoded teams, OT flag and the engineered columns.
X_data

Unnamed: 0,Visitor,Home,OT,Home Prev,Visitor Prev,Home Win Streak,Visitor Win Streak,Home Wins,Visitor Wins
0,21,18,0,False,False,0,0,0.000000,0.000000
1,6,26,0,False,False,0,0,0.000000,0.000000
2,10,13,0,False,False,0,0,0.000000,0.000000
3,16,3,1,False,False,0,0,0.000000,0.000000
4,22,11,0,False,False,0,0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
1225,8,19,0,True,False,2,0,0.209877,0.382716
1226,15,22,0,False,True,0,1,0.222222,0.444444
1227,11,14,0,False,True,0,6,0.666667,0.469136
1228,7,9,0,True,False,3,0,0.814815,0.370370


## One Hot Encode Teams

In [733]:
# One-hot encode the integer team codes. The input is read from X_data rather
# than from a previously assigned X_teams, so re-running this cell does not
# encode an already-encoded matrix. The encoder's default sparse result is
# densified with .toarray(), which avoids the `sparse=` constructor argument
# (renamed to `sparse_output=` in newer scikit-learn releases).
ohe = OneHotEncoder()
X_teams = ohe.fit_transform(X_data[['Visitor', 'Home']].to_numpy()).toarray()

# String column names: the first group of columns belongs to the visitor codes,
# the second to the home codes. Mixing integer and string column names is
# rejected by recent scikit-learn estimators.
team_columns = (
    [f'Visitor_{code}' for code in ohe.categories_[0]]
    + [f'Home_{code}' for code in ohe.categories_[1]]
)

new_X_data = X_data.drop(['Home', 'Visitor'], axis=1)

# Reuse the feature frame's index so concat pairs rows by position.
new_X_data = pd.concat(
    [new_X_data, pd.DataFrame(X_teams, columns=team_columns, index=new_X_data.index)],
    axis=1,
)

## Split data

In [734]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the games for the final evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    new_X_data,
    y_data,
    test_size=0.2,
    random_state=12,
)

## Model fitting/testing

In [735]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [736]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: default-parameter decision tree, scored with 10-fold cross-validation
# on the training split. The printed figure is the mean fold accuracy.
dt_model = DecisionTreeClassifier(random_state=0)
dt_scores = cross_val_score(
    dt_model,
    X_train,
    y_train.ravel(),
    cv=10,
)
print('Training accuracy', dt_scores.mean())

Training accuracy 0.5722634508348794


In [737]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter grid explored for the random forest.
param_rf = {
    'n_estimators': list(range(40, 140, 20)),
    'max_depth': [5, 10, 15, 20, 25],
}

# Forest configured with max_depth=10 / n_estimators=40, scored with 10-fold
# cross-validation on the training split.
rf_model = RandomForestClassifier(random_state=0, max_depth=10, n_estimators=40)
rf_scores = cross_val_score(rf_model, X_train, y_train.ravel(), cv=10)

# Run the grid search so that `gs_rf` exists on a fresh kernel; a later cell
# reads `gs_rf.best_params_`. GridSearchCV clones the estimator, so `rf_model`
# itself stays unfitted here.
gs_rf = GridSearchCV(rf_model, param_grid=param_rf, cv=10)
gs_rf.fit(X_train, y_train.ravel())

print('Cross Val Score:', rf_scores.mean())

Cross Val Score: 0.666707895279324


In [738]:
# Best hyper-parameters found by the random-forest grid search. `gs_rf` must be
# a fitted GridSearchCV object; if it was never created this raises NameError.
gs_rf.best_params_

{'max_depth': 10, 'n_estimators': 40}

In [739]:
from sklearn.linear_model import LogisticRegression

# Logistic regression (iteration cap raised to 500), scored with 10-fold
# cross-validation on the training split. The printed figure is the mean fold
# accuracy.
lr_model = LogisticRegression(max_iter=500, random_state=0)
lr_scores = cross_val_score(
    lr_model,
    X_train,
    y_train.ravel(),
    cv=10,
)
print('Training accuracy', lr_scores.mean())

Training accuracy 0.6636878994021851


In [611]:
# Sanity check: shape of the flattened training labels handed to the estimators.
y_train.ravel().shape

(984,)

In [742]:
# Fit the random forest on the whole training split (fit returns the estimator
# itself), then display its accuracy on the held-out test split.
rf_model.fit(X_train, y_train.ravel()).score(X_test, y_test)

0.6544715447154471