In [2]:
from bs4 import BeautifulSoup
import requests
from IPython.core.display import display, HTML
import numpy as np
import re
import pickle
import pandas as pd
import re

# Data Cleaning and Feature Engineering

In [3]:
# Load the team lookup table. The context manager closes the file handle as
# soon as the object has been read instead of leaving it open for the life of
# the kernel.
# NOTE: pickle.load can execute arbitrary code -- only load files you created.
with open('teams_lookup.pickle', "rb") as lookup_file:
    team_lookup = pickle.load(lookup_file)

In [None]:
# Load the pickled basic game log. The context manager closes the file handle
# as soon as the object has been read.
# NOTE: pickle.load can execute arbitrary code -- only load files you created.
with open('gamelog.pickle', "rb") as gamelog_file:
    gamelog = pickle.load(gamelog_file)

In [None]:
# One row per entry of `gamelog` (the dict keys become the index); the columns
# are then named positionally from the list below.
games = pd.DataFrame.from_dict(gamelog, orient="index")
columns = [
    'url', 'Team', 'Date', 'Court', 'Opponent', 'Result', 'Tm', 'Opp',
    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
    'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
    'blank',  # presumably an empty spacer column in the scraped table -- TODO confirm
    'FG_O', 'FGA_O', 'FG%_O', '3P_O', '3PA_O', '3P%_O', 'FT_O', 'FTA_O', 'FT%_O',
    'RB_O', 'TRB_O', 'AST_O', 'STL_O', 'BLK_O', 'TOV_O', 'PF_O',
]
games.columns = columns
# 'blank' is deliberately left in the frame: the numeric-conversion cell that
# follows still lists it in num_cols_basic. A bare `games.drop(columns='blank')`
# would not remove it anyway -- DataFrame.drop returns a new frame and leaves
# `games` untouched unless the result is assigned -- so no such call is made here.

In [None]:
# Type the frame: parse the game date, then coerce every stat column to numeric.
games['Date'] = pd.to_datetime(games['Date'])
num_cols_basic = [
    'Tm', 'Opp',
    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
    'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
    'blank',
    'FG_O', 'FGA_O', 'FG%_O', '3P_O', '3PA_O', '3P%_O', 'FT_O', 'FTA_O', 'FT%_O',
    'RB_O', 'TRB_O', 'AST_O', 'STL_O', 'BLK_O', 'TOV_O', 'PF_O',
]
for stat_col in num_cols_basic:
    # pd.to_numeric raises if a value cannot be parsed as a number
    games[stat_col] = pd.to_numeric(games[stat_col])

# Lookup table keyed by the raw Result string (the overtime variants contain a
# non-breaking space, \xa0). Each value is the tuple
# (1 for a win else 0, overtimes played, minutes played).
# Result strings not listed here raise KeyError wherever the table is indexed.
results_dict = {
    'L': (0, 0, 40),
    'W': (1, 0, 40),
    'L\xa0(1 OT)': (0, 1, 45),
    'L\xa0(2 OT)': (0, 2, 50),
    'W\xa0(2 OT)': (1, 2, 50),
    'W\xa0(1 OT)': (1, 1, 45),
    'W\xa0(3 OT)': (1, 3, 55),
    'L\xa0(3 OT)': (0, 3, 55),
}

# Parallel lists, in the same order as the table above.
r_keys = list(results_dict)
r_wins, r_OT, r_MP = (list(field) for field in zip(*results_dict.values()))

# Helper columns for later steps: look each Result string up once, then split
# the (win flag, overtimes, minutes played) tuple into its own columns.
outcomes = [results_dict[result] for result in games.Result]
games['Wins'] = [outcome[0] for outcome in outcomes]
games['OT'] = [outcome[1] for outcome in outcomes]
games['MP'] = [outcome[2] for outcome in outcomes]
# PD = Tm minus Opp (presumably the point differential -- confirm)
games['PD'] = games['Tm'] - games['Opp']

## Advanced_games

In [None]:
# Load the pickled advanced game log. The context manager closes the file
# handle as soon as the object has been read.
# NOTE: pickle.load can execute arbitrary code -- only load files you created.
with open('advanced_gamelog.pickle', "rb") as advanced_file:
    advanced_gamelog = pickle.load(advanced_file)

In [None]:
# One row per entry of advanced_gamelog; the dict keys become the index.
advanced_games = pd.DataFrame.from_dict(data=advanced_gamelog, orient="index")

In [None]:
# Name the advanced game-log columns positionally.
columns = [
    'url', 'Team', 'Date', 'Court', 'Opponent', 'Result', 'Tm', 'Opp',
    'ORtg', 'DRtg', 'Pace', 'FTr', '3PAr', 'TS%', 'TRB%', 'AST%', 'STL%', 'BLK%',
    'blank1',  # presumably an empty spacer column in the scraped table -- TODO confirm
    'OeFG%', 'OTOV%', 'ORB%', 'OFT/FGA',
    'blank2',  # presumably an empty spacer column in the scraped table -- TODO confirm
    'DeFG%', 'DTOV%', 'DRB%', 'DFT/FGA',
]
advanced_games.columns = columns
# 'blank1' and 'blank2' are deliberately left in the frame: num_cols_advanced
# (used by the numeric-conversion cell and by the data_manip call) still lists
# them. A bare `advanced_games.drop(columns=[...])` would not remove them anyway
# -- DataFrame.drop returns a new frame and leaves the original untouched unless
# the result is assigned -- so no such call is made here.

In [None]:
# Type the frame: parse the game date, then coerce every stat column to numeric.
advanced_games['Date'] = pd.to_datetime(advanced_games['Date'])
num_cols_advanced = [
    'Tm', 'Opp',
    'ORtg', 'DRtg', 'Pace', 'FTr', '3PAr', 'TS%', 'TRB%', 'AST%', 'STL%', 'BLK%',
    'blank1',
    'OeFG%', 'OTOV%', 'ORB%', 'OFT/FGA',
    'blank2',
    'DeFG%', 'DTOV%', 'DRB%', 'DFT/FGA',
]
for stat_col in num_cols_advanced:
    # pd.to_numeric raises if a value cannot be parsed as a number
    advanced_games[stat_col] = pd.to_numeric(advanced_games[stat_col])

In [None]:
# Helper columns for later steps: look each Result string up once in
# results_dict, then split the (win flag, overtimes, minutes played) tuple
# into its own columns.
advanced_outcomes = [results_dict[result] for result in advanced_games.Result]
advanced_games['Wins'] = [outcome[0] for outcome in advanced_outcomes]
advanced_games['OT'] = [outcome[1] for outcome in advanced_outcomes]
advanced_games['MP'] = [outcome[2] for outcome in advanced_outcomes]
# PD = Tm minus Opp (presumably the point differential -- confirm)
advanced_games['PD'] = advanced_games['Tm'] - advanced_games['Opp']

## Preparing clean data file

In [None]:
# Final DF: one row per game, holding the identifying columns plus, for every
# stat, the team's average over the rows that precede this one for that team.
# Rows are consumed in frame order within each team -- presumably chronological;
# confirm before relying on the averages.
df = pd.DataFrame(games['url'])
df['Team'] = [team_lookup[x][0] for x in games['Team']]
df['Date'] = games['Date']
df['Opponent'] = games['Opponent']
df['Court'] = games['Court']
df['PD'] = games['PD']

games_by_team = games.groupby('Team')
# GP = Games Played prior to this game
df['GP'] = games_by_team['Team'].transform(lambda x: x.expanding().count()-1)

# get prior average stats for each team before the game
prior_avg_cols = [
    'Tm', 'Opp',
    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
    'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
    'FG_O', 'FGA_O', 'FG%_O', '3P_O', '3PA_O', '3P%_O', 'FT_O', 'FTA_O', 'FT%_O',
    'RB_O', 'TRB_O', 'AST_O', 'STL_O', 'BLK_O', 'TOV_O', 'PF_O',
    'Wins', 'OT', 'MP',
]
for stat in prior_avg_cols:
    # shift() removes the current game from the window; expanding().mean() then
    # averages the earlier non-missing values. Unlike (running sum - x) / GP, a
    # missing stat in the current game no longer blanks out the feature, and
    # missing stats in earlier games are not counted in the denominator.
    # A team's first row has no history and stays NaN.
    df[stat] = games_by_team[stat].transform(lambda s: s.shift().expanding().mean())

In [None]:
def data_manip(games,columns,team_lookup=team_lookup):
    '''
    Build a per-game frame of identifiers plus each team's prior-row averages.

    games = game-log DataFrame; must contain 'url', 'Team', 'Date', 'Opponent',
            'Court', 'PD' and every name listed in `columns`. Rows are consumed
            in the order they appear within each team -- presumably
            chronological; confirm before relying on the averages.
    columns = names of the numeric stat columns to average
    team_lookup = mapping from the raw 'Team' value to a sequence whose first
            element is the name to report. The default is the module-level
            lookup as it existed when this function was defined.

    output: DataFrame indexed like `games` with columns url, Team, Date,
            Opponent, Court, PD, GP (number of earlier rows for the same team)
            and one column per entry of `columns` holding the mean of that
            stat over the team's earlier rows (NaN when there are none).
    '''
    df = pd.DataFrame(games['url'])
    df['Team'] = [team_lookup[x][0] for x in games['Team']]
    df['Date'] = games['Date']
    df['Opponent'] = games['Opponent']
    df['Court'] = games['Court']
    df['PD'] = games['PD']

    games_by_team = games.groupby('Team')
    # rows already seen for this team, not counting the current one
    df['GP'] = games_by_team['Team'].transform(lambda x: x.expanding().count()-1)
    for stat in columns:
        # shift() removes the current game from the window; expanding().mean()
        # then averages the earlier non-missing values. Unlike
        # (running sum - x) / GP, a missing stat in the current game no longer
        # blanks out the feature, and missing stats in earlier games are not
        # counted in the denominator.
        df[stat] = games_by_team[stat].transform(lambda s: s.shift().expanding().mean())
    return df

In [None]:
# Prior-row averages of the advanced stats, one row per game.
adf = data_manip(games=advanced_games, columns=num_cols_advanced)