In [139]:
from bs4 import BeautifulSoup
import requests
from IPython.core.display import display, HTML
import numpy as np
import re
import pickle
import pandas as pd
import re
import matplotlib.pyplot as plt

# Data Cleaning and Feature Engineering

In [15]:
# Team lookup table. Later cells index it with the gamelog's `Team` value and take
# element 0 of the entry as the team name.
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
with open('teams_lookup.pickle', "rb") as fh:  # context manager closes the handle
    team_lookup = pickle.load(fh)

In [6]:
# Raw basic gamelog; the next cell turns each of its keys into one DataFrame row.
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
with open('gamelog.pickle', "rb") as fh:  # context manager closes the handle
    gamelog = pickle.load(fh)

In [7]:
# Each key of the pickled gamelog becomes one row of the frame.
games = pd.DataFrame.from_dict(gamelog, orient="index")

# Positional column names for the basic gamelog; the `_O` suffix marks the
# opponent's side of the box score.
columns = [
    'url', 'Team', 'Date', 'Court', 'Opponent', 'Result', 'Tm', 'Opp',
    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
    'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
    'blank',
    'FG_O', 'FGA_O', 'FG%_O', '3P_O', '3PA_O', '3P%_O', 'FT_O', 'FTA_O', 'FT%_O',
    'RB_O', 'TRB_O', 'AST_O', 'STL_O', 'BLK_O', 'TOV_O', 'PF_O',
]
games.columns = columns

# NOTE(review): DataFrame.drop returns a new frame and the result is discarded here,
# so `games` still contains the 'blank' column after this cell runs.
games.drop(columns='blank');

In [8]:
#formatting data frame
games['Date'] = pd.to_datetime(games['Date'])

# Box-score statistics to coerce to numeric; this list is also what gets passed to
# data_manip() as the set of columns to average.
# 'blank' is intentionally NOT listed: an earlier cell attempts to drop it as a
# spacer, and the hand-written feature list used for the first build of `df` omits
# it too, so it should not be coerced or averaged as if it were a statistic.
num_cols_basic = [
    'Tm', 'Opp',
    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
    'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
    'FG_O', 'FGA_O', 'FG%_O', '3P_O', '3PA_O', '3P%_O', 'FT_O', 'FTA_O', 'FT%_O',
    'RB_O', 'TRB_O', 'AST_O', 'STL_O', 'BLK_O', 'TOV_O', 'PF_O',
]
for stat_col in num_cols_basic:
    games[stat_col] = pd.to_numeric(games[stat_col])

#create dictionary for results, which will be used for further calculations
# One row per distinct `Result` string (the separator inside the OT labels is a
# non-breaking space, \xa0). Minutes are 40 plus 5 for every overtime period.
_result_rows = [
    # (Result text,    win, overtimes, minutes played)
    ('L',               0,   0,         40),
    ('W',               1,   0,         40),
    ('L\xa0(1 OT)',     0,   1,         45),
    ('L\xa0(2 OT)',     0,   2,         50),
    ('W\xa0(2 OT)',     1,   2,         50),
    ('W\xa0(1 OT)',     1,   1,         45),
    ('W\xa0(3 OT)',     1,   3,         55),
    ('L\xa0(3 OT)',     0,   3,         55),
]

# Column-wise views of the table, kept under their original names.
r_keys, r_wins, r_OT, r_MP = (list(column) for column in zip(*_result_rows))

# Result text -> (win flag, overtimes played, minutes played)
results_dict = {
    label: (won, overtimes, minutes)
    for label, won, overtimes, minutes in _result_rows
}




In [128]:
#create lookup for D1 feature gen
# Tabulate the team lookup, then flag every name found in its first column with 1.
# data_manip() treats any opponent missing from this dict as 0.
team_df = pd.DataFrame.from_dict(team_lookup, orient='index')
d1_names = team_df.loc[:, 0]
D1 = dict(zip(d1_names, np.repeat(1, len(d1_names))))

In [119]:
#adding columns to help with further data manipulation
# results_dict maps each Result string to (win flag, overtimes, minutes played);
# unpack that tuple into one column per position. An unknown Result raises KeyError.
for position, derived_col in enumerate(('Wins', 'OT', 'MP')):
    games[derived_col] = [results_dict[outcome][position] for outcome in games['Result']]

# Point differential from the team's perspective (own score minus opponent score).
games['PD'] = games['Tm'] - games['Opp']

## Advanced_games

In [9]:
# Raw advanced gamelog; the next cell turns each of its keys into one DataFrame row.
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
with open('advanced_gamelog.pickle', "rb") as fh:  # context manager closes the handle
    advanced_gamelog = pickle.load(fh)

In [10]:
# Each key of the pickled advanced gamelog becomes one row of the frame.
advanced_games = pd.DataFrame.from_dict(data=advanced_gamelog, orient="index")

In [11]:
# Positional column names for the advanced gamelog (this rebinds `columns`).
columns = [
    'url', 'Team', 'Date', 'Court', 'Opponent', 'Result', 'Tm', 'Opp',
    'ORtg', 'DRtg', 'Pace', 'FTr', '3PAr', 'TS%', 'TRB%', 'AST%', 'STL%', 'BLK%',
    'blank1',
    'OeFG%', 'OTOV%', 'ORB%', 'OFT/FGA',
    'blank2',
    'DeFG%', 'DTOV%', 'DRB%', 'DFT/FGA',
]
advanced_games.columns = columns

# NOTE(review): DataFrame.drop returns a new frame and the result is discarded here,
# so `advanced_games` still contains 'blank1' and 'blank2' after this cell runs.
advanced_games.drop(columns=['blank1', 'blank2']);

In [12]:
advanced_games['Date'] = pd.to_datetime(advanced_games['Date'])

# Advanced statistics to coerce to numeric; this list is also what gets passed to
# data_manip() as the set of columns to average.
# 'blank1' / 'blank2' are intentionally NOT listed: the previous cell attempts to
# drop them as spacers, so they should not be coerced or averaged as statistics.
num_cols_advanced = [
    'Tm', 'Opp',
    'ORtg', 'DRtg', 'Pace', 'FTr', '3PAr', 'TS%', 'TRB%', 'AST%', 'STL%', 'BLK%',
    'OeFG%', 'OTOV%', 'ORB%', 'OFT/FGA',
    'DeFG%', 'DTOV%', 'DRB%', 'DFT/FGA',
]
for stat_col in num_cols_advanced:
    advanced_games[stat_col] = pd.to_numeric(advanced_games[stat_col])

In [13]:
#adding columns to help with further data manipulation
# results_dict maps each Result string to (win flag, overtimes, minutes played);
# unpack that tuple into one column per position. An unknown Result raises KeyError.
for position, derived_col in enumerate(('Wins', 'OT', 'MP')):
    advanced_games[derived_col] = [
        results_dict[outcome][position] for outcome in advanced_games['Result']
    ]

# Point differential from the team's perspective (own score minus opponent score).
advanced_games['PD'] = advanced_games['Tm'] - advanced_games['Opp']

## Preparing clean data file

In [16]:
# Final DF
# NOTE(review): `df` is reassigned by the data_manip(games, num_cols_basic) call in a
# later cell, so this frame only lives until that cell runs.
df = pd.DataFrame(games['url'])
df['Team'] = [team_lookup[team_key][0] for team_key in games['Team']]
for carried_col in ('Date', 'Opponent', 'Court', 'PD'):
    df[carried_col] = games[carried_col]

by_team = games.groupby('Team')

# GP = Games Played prior to this game
df['GP'] = by_team['Team'].transform(lambda s: s.expanding().count() - 1)

# get prior average stats for each team before the game:
# (running total including this row) - (this row), divided by the prior game count.
prior_avg_cols = [
    'Tm', 'Opp',
    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
    'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
    'FG_O', 'FGA_O', 'FG%_O', '3P_O', '3PA_O', '3P%_O', 'FT_O', 'FTA_O', 'FT%_O',
    'RB_O', 'TRB_O', 'AST_O', 'STL_O', 'BLK_O', 'TOV_O', 'PF_O',
    'Wins', 'OT', 'MP',
]
for stat_col in prior_avg_cols:
    prior_total = by_team[stat_col].transform(lambda s: s.expanding().sum() - s)
    df[stat_col] = prior_total / df['GP']

In [136]:
def data_manip(games,columns,team_lookup=team_lookup):
    '''
    Build the per-game feature frame: identifying columns plus, for every name in
    `columns`, the team's average of that statistic over its *previous* games.

    Parameters
    ----------
    games : DataFrame
        Game log. Must contain 'url', 'Team', 'Date', 'Opponent', 'Court', 'PD'
        and every column named in `columns`.
    columns : iterable of str
        Numeric columns of `games` to convert into prior-game averages.
    team_lookup : mapping
        Indexed with the values of games['Team']; element 0 of each entry is
        written to the output 'Team' column.

    Returns
    -------
    DataFrame
        Same index as `games`, with columns url, Team, Date, Opponent, D1, Court,
        Home, Away, PD, GP, followed by one averaged column per entry of `columns`.

    Notes
    -----
    * D1 = indicate if opponent is in Division 1 or not (from the data collection,
      only D1 teams included for gamelog but they sometimes play non-D1 schools).
      It is read from the module-level `D1` dict; opponents missing from it get 0.
    * Home is 1 where Court == "" and Away is 1 where Court == "@"; any other Court
      value (e.g. 'N') leaves both flags at 0.
    * GP is the number of earlier rows for the same team. A team's first row has
      GP == 0, so its averages come out as NaN (0/0).
    * "Earlier" means earlier in row order within each team -- this assumes `games`
      is already in chronological order per team; TODO confirm against the loader.
    '''
    out = pd.DataFrame(games['url'])
    out['Team'] = [team_lookup[team_key][0] for team_key in games['Team']]
    out['Date'] = games['Date']
    out['Opponent'] = games['Opponent']
    out['D1'] = [D1.get(opponent, 0) for opponent in out['Opponent']]
    out['Court'] = games['Court']
    # Compare directly instead of indexing pd.get_dummies(...) by label: that lookup
    # raises KeyError when the input has no home (or no away) rows, and the dummy
    # dtype differs between pandas versions. This always yields integer 0/1 flags.
    out['Home'] = (games['Court'] == "").astype(int)
    out['Away'] = (games['Court'] == "@").astype(int)
    out['PD'] = games['PD']

    by_team = games.groupby('Team')
    # Running count of a team's rows minus the current one = games already played.
    out['GP'] = by_team['Team'].transform(lambda s: s.expanding().count()-1)
    for stat_col in columns:
        # (running total including this row) - (this row) = total over prior games.
        prior_total = by_team[stat_col].transform(lambda s: s.expanding().sum()-s)
        out[stat_col] = prior_total/out['GP']
    return out

In [137]:
# Run the same feature pipeline over both logs; the column list selects which
# statistics are turned into prior-game averages.
df = data_manip(games=games, columns=num_cols_basic)
adf = data_manip(games=advanced_games, columns=num_cols_advanced)

In [138]:
# Spot-check one team's rows to eyeball the running averages.
df.loc[df['Team'] == 'Abilene Christian']

Unnamed: 0,url,Team,Date,Opponent,D1,Court,Home,Away,PD,GP,...,FT_O,FTA_O,FT%_O,RB_O,TRB_O,AST_O,STL_O,BLK_O,TOV_O,PF_O
/cbb/boxscores/2018-11-06-19-abilene-christian.htmlabilene-christian,/cbb/boxscores/2018-11-06-19-abilene-christian...,Abilene Christian,2018-11-06,Arlington Baptist,0,,1,0,53,0.0,...,,,,,,,,,,
/cbb/boxscores/2018-11-09-20-abilene-christian.htmlabilene-christian,/cbb/boxscores/2018-11-09-20-abilene-christian...,Abilene Christian,2018-11-09,Arkansas State,1,,1,0,21,1.0,...,13.0,18.0,0.722,4.0,17.0,5.0,2.0,0.0,22.0,0.0
/cbb/boxscores/2018-11-15-21-denver.htmlabilene-christian,/cbb/boxscores/2018-11-15-21-denver.html,Abilene Christian,2018-11-15,Denver,1,@,0,1,6,2.0,...,14.0,21.0,0.6735,6.5,21.5,9.0,4.0,1.0,19.0,11.5
/cbb/boxscores/2018-11-22-18-abilene-christian.htmlabilene-christian,/cbb/boxscores/2018-11-22-18-abilene-christian...,Abilene Christian,2018-11-22,Elon,1,N,0,0,16,3.0,...,10.0,17.666667,0.509667,7.333333,24.666667,9.666667,3.0,2.0,17.666667,13.666667
/cbb/boxscores/2018-11-23-20-pacific.htmlabilene-christian,/cbb/boxscores/2018-11-23-20-pacific.html,Abilene Christian,2018-11-23,Pacific,1,@,0,1,2,4.0,...,10.75,18.0,0.55325,6.75,24.5,11.0,3.0,2.0,16.0,14.0
/cbb/boxscores/2018-11-24-20-california-riverside.htmlabilene-christian,/cbb/boxscores/2018-11-24-20-california-rivers...,Abilene Christian,2018-11-24,UC-Riverside,1,N,0,0,12,5.0,...,14.2,21.8,0.594,7.0,25.4,9.8,3.6,2.2,14.6,15.0
/cbb/boxscores/2018-11-27-19-abilene-christian.htmlabilene-christian,/cbb/boxscores/2018-11-27-19-abilene-christian...,Abilene Christian,2018-11-27,Howard Payne,0,,1,0,37,6.0,...,13.333333,20.666667,0.595,7.333333,26.0,9.166667,3.333333,2.166667,14.666667,16.0
/cbb/boxscores/2018-12-01-20-pepperdine.htmlabilene-christian,/cbb/boxscores/2018-12-01-20-pepperdine.html,Abilene Christian,2018-12-01,Pepperdine,1,@,0,1,-15,7.0,...,12.714286,20.0,0.590429,6.857143,25.142857,9.285714,4.0,2.0,16.571429,15.571429
/cbb/boxscores/2018-12-04-20-abilene-christian.htmlabilene-christian,/cbb/boxscores/2018-12-04-20-abilene-christian...,Abilene Christian,2018-12-04,Campbell,1,,1,0,15,8.0,...,14.25,21.25,0.62075,6.625,26.125,10.125,4.25,2.0,16.375,15.625
/cbb/boxscores/2018-12-08-11-abilene-christian.htmlabilene-christian,/cbb/boxscores/2018-12-08-11-abilene-christian...,Abilene Christian,2018-12-08,Schreiner,0,,1,0,40,9.0,...,15.0,22.0,0.635111,6.888889,26.555556,9.666667,4.222222,2.0,17.0,16.0


In [141]:
# Persist both feature tables. Writing through a context manager guarantees each
# file is flushed and closed before the next statement runs.
with open("basic_stats.pkl", "wb") as fh:
    pickle.dump(df, fh)
with open("advanced_stats.pkl", "wb") as fh:
    pickle.dump(adf, fh)