# Minnesota T-Pups Plus/Minus: Web Scraping & Cleaning
##### By: Mitch Brinkman

## Package Import

In [None]:
from bs4 import BeautifulSoup
import requests

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import patsy


%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [None]:
from bball_func import*

## Web Data Scrape & Clean

In [None]:
# Offsets handed to the scraping helpers below: 0 through 600 in steps of 100.
# NOTE(review): presumably the result-page offsets used in the scraped URLs — confirm in bball_func.
offset_vals = list(range(0, 700, 100))

In [None]:
# Scrape the first data set from basketball-reference.com.
# get_trad_data is not defined in this notebook (presumably star-imported from
# bball_func); it receives the offset list and its result is kept as trad_df.
# NOTE(review): the exact pages and columns fetched are decided inside the helper — confirm there.

trad_df = get_trad_data(offset_vals)

In [None]:
# Scrape the second, additional data set from basketball-reference.com.
# get_non_score_data is not defined in this notebook (presumably star-imported
# from bball_func); it receives the same offset list as the first scrape.
# NOTE(review): the exact pages and columns fetched are decided inside the helper — confirm there.

non_score_df = get_non_score_data(offset_vals)

In [None]:
# Order the first scraped frame by its 'Date' column (ascending, in place),
# then preview the first rows.
trad_df.sort_values(by='Date', inplace=True)
trad_df.head()

In [None]:
# Order the second scraped frame by its 'Date' column (ascending, in place),
# then preview the first rows.
non_score_df.sort_values(by='Date', inplace=True)
non_score_df.head()

In [None]:
# Columns of trad_df that are not needed downstream, grouped by label suffix
# (no suffix, '.1', '.2'). The order matches the original flat list.
# NOTE(review): later cells treat '.1' columns as opponent stats and '.2'
# columns as differentials (e.g. 'PTS.2' becomes 'Plus_Minus') — confirm
# against the scraped table layout.
trad_unused_cols = (
    [
        'Tm', 'Result', 'MP', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
        'FG', 'FGA', 'FG%', '2PA', '2P%', '3PA', '3P%', 'FTA', 'FT%',
        'PTS', 'STL', 'BLK',
    ]
    + [
        'ORB%.1', 'TRB%.1', 'AST%.1', 'STL%.1', 'BLK%.1', 'TOV%.1',
        'FG.1', 'FGA.1', 'FG%.1', '2PA.1', '2P%.1', '3PA.1', '3P%.1',
        'FTA.1', 'FT%.1', 'PTS.1', 'AST.1', 'STL.1', 'BLK.1',
    ]
    + [
        'ORB%.2', 'TRB%.2', 'AST%.2', 'STL%.2', 'BLK%.2', 'TOV%.2',
        'FG.2', 'FGA.2', 'FG%.2', '2P.2', '2PA.2', '2P%.2', '3P.2',
        '3PA.2', '3P%.2', 'FT.2', 'FTA.2', 'FT%.2', 'AST.2', 'STL.2',
        'BLK.2', 'TOV.2', 'PF.2',
    ]
)
drop_columns(trad_unused_cols, trad_df)

In [None]:
# Columns of non_score_df that are not needed downstream: first the
# un-suffixed labels, then the '.1'-suffixed ones (same order as before).
non_score_unused_cols = (
    ['Tm', 'Result', 'MP', 'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF']
    + ['ORB.1', 'DRB.1', 'TRB.1', 'AST.1', 'STL.1', 'BLK.1', 'TOV.1', 'PF.1']
)
drop_columns(non_score_unused_cols, non_score_df)

## Merging Dataframes - Further Cleaning

In [None]:
# Inner-join the two scraped frames on the columns they share for a game:
# 'Date', 'Unnamed: 3' (renamed to 'Home' further down) and 'Opp'.
merge_keys = ['Date', 'Unnamed: 3', 'Opp']
data_table = trad_df.merge(non_score_df, how='inner', on=merge_keys)

In [None]:
# Preview the last rows of the merged frame.
data_table.tail()

In [None]:
# Drop the final row of the merged frame; per the original author it caused
# errors in later cells.
# NOTE(review): the row is removed by position, not by content — re-check it is
# still the bad row whenever the scrape is refreshed.

data_table = data_table.iloc[:-1]

In [None]:
# List the merged frame's column labels before renaming.
data_table.columns

In [None]:
# Tell Minnesota's stat columns apart from the opponents' ones by passing the
# '.1'-suffixed labels to opp_col_rename.
# NOTE(review): the helper is defined outside this notebook; later cells refer
# to names such as 'opp_2p.1', so it presumably adds an 'opp_' prefix — confirm.

opponent_stat_cols = ['FT.1', '3P.1', 'AST.1', 'TOV.1', '2P.1', 'PF.1', 'AST%.1']
opp_col_rename(opponent_stat_cols, data_table)

In [None]:
# Give the remaining cryptic labels descriptive names, then normalise every
# label: lower case, with spaces replaced by underscores.
descriptive_names = {
    'Unnamed: 3': 'Home',
    'PTS.2': 'Plus_Minus',
    'PF.2': 'Foul_Diff',
    'ORB%': 'orb_pct',
}
data_table.rename(columns=descriptive_names, inplace=True)
data_table.columns = data_table.columns.str.lower().str.replace(" ", "_")

In [None]:
# Prefix the digit-leading labels with an underscore so they can be used as
# attribute names later (e.g. avg_data_table._2p).
# The keys are the lower-cased labels, so this rename only matches once the
# lower-casing step has run — which is presumably why it did not work inside
# the earlier rename call.

digit_leading = ('2p', '3p')
data_table.rename(columns={label: '_' + label for label in digit_leading}, inplace=True)

In [None]:
# Encode the home/away flag numerically: a missing value becomes 1 (home game)
# and the '@' marker becomes 0 (away game).
#
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection, which is chained assignment and
# does not update the frame under pandas copy-on-write. The '@' replacement is
# also limited to this column, so no other column can be altered by it.

data_table['home'] = data_table['home'].fillna(1).replace('@', 0)

In [None]:
# Convert the 'date' column to pandas datetimes; the days_rest computation
# below relies on datetime differences (.diff().dt.days).
data_table['date'] = pd.to_datetime(data_table['date'])

In [None]:
# Days of rest before each game: the gap in days to the previous row's date,
# minus one (so games on consecutive days give 0).
# Assumes the rows are in chronological order — the source frames were sorted
# by 'Date' before merging.
rest_days = data_table['date'].diff().dt.days - 1

# Gaps above 10 days are reset to 0, and the first row (no previous game, NaN)
# is also set to 0.
# NOTE(review): the >10 cut-off presumably targets the off-season break — confirm.
# Built with mask/fillna and assigned back in one step, rather than writing
# through `.values` or calling fillna(inplace=True) on a column selection;
# both of those break under pandas copy-on-write.
data_table['days_rest'] = rest_days.mask(rest_days > 10, 0).fillna(0)

In [None]:
# List the column labels after renaming and adding days_rest.
data_table.columns

In [None]:
# Convert the counting statistics from object dtype to integers via the
# make_integers helper (defined outside this notebook).
#
# All labels were lower-cased earlier and '3p' was renamed to '_3p', so the
# three-pointer column must be spelled '_3p'; the previous '_3P' spelling does
# not match any column in data_table.

integer_cols = [
    'home', '_2p', '_3p', 'ft', 'ast', 'tov', 'pf',
    'opp_2p.1', 'opp_3p.1', 'opp_ft.1', 'opp_tov.1', 'opp_pf.1',
    'plus_minus', 'drb',
]
make_integers(integer_cols, data_table)

In [None]:
# Cast the percentage column(s) to float rather than int so the fractional
# part of ORB% is kept.

float_names = ['orb_pct']
data_table[float_names] = data_table[float_names].astype(float)

In [None]:
# Persist the cleaned per-game frame to 'NBA_data_table.pickle' (path is
# relative to the current working directory) so it can be reloaded later
# without re-scraping.

data_table.to_pickle('NBA_data_table.pickle')

## Rolling Average Data Table

In [None]:
# List the available columns before picking those used for the rolling averages.
data_table.columns

In [None]:
# Select the columns that feed the rolling-average table.
#
# - The three-pointer column is '_3p': labels were lower-cased earlier, so the
#   previous '_3P' spelling raised a KeyError here.
# - .copy() makes avg_data_table an independent frame, so the rolling-average
#   columns added to it later do not trigger SettingWithCopyWarning.

rolling_input_cols = [
    'date', 'plus_minus', 'home', 'opp', 'orb_pct',
    '_2p', '_3p', 'ft', 'ast', 'tov', 'pf', 'drb',
    'opp_2p.1', 'opp_3p.1', 'opp_ft.1', 'opp_tov.1', 'opp_pf.1',
    'days_rest',
]
avg_data_table = data_table.loc[:, rolling_input_cols].copy()

In [None]:
# Show floats with thousands separators and two decimal places when frames are
# displayed; this affects display only, not the stored values.

pd.set_option('display.float_format', "{:,.2f}".format)

In [None]:
# Preview the first ten rows of the selected columns.
avg_data_table.head(10)

In [None]:
# Add a 5-game rolling mean for every per-game statistic.
# min_periods=1 means the first four rows average over however many games are
# available instead of producing NaN.
#
# Mapping: new column label -> source column. Insertion order is the order in
# which the columns are appended.
# - 'orb_pct avg' keeps its space because later cells select it by that exact label.
# - The three-pointer source is '_3p' (labels were lower-cased earlier); the
#   previous attribute access `avg_data_table._3P` raised AttributeError.
# NOTE(review): each window includes the current game's own value. If these
# averages are meant as pre-game predictors of plus_minus, consider shifting
# by one row — confirm intent before changing.
rolling_sources = {
    'orb_pct avg': 'orb_pct',
    '_2p_avg': '_2p',
    '_3p_avg': '_3p',
    'ft_avg': 'ft',
    'ast_avg': 'ast',
    'tov_avg': 'tov',
    'pf_avg': 'pf',
    'opp_2p_avg': 'opp_2p.1',
    'opp_3p_avg': 'opp_3p.1',
    'opp_ft_avg': 'opp_ft.1',
    'opp_tov_avg': 'opp_tov.1',
    'opp_pf_avg': 'opp_pf.1',
    'drb_avg': 'drb',
}
for avg_label, source_label in rolling_sources.items():
    avg_data_table[avg_label] = (
        avg_data_table[source_label].rolling(5, min_periods=1).mean()
    )


In [None]:
# Persist the frame with per-game stats plus rolling averages to
# 'avg_data_table.pickle' (relative to the current working directory).
avg_data_table.to_pickle('avg_data_table.pickle')

## EDA on Data

In [None]:
# List the columns available for the EDA subsets below.
avg_data_table.columns

In [None]:
# Rolling-average stats generated on Minnesota's offensive possessions, plus
# the target (plus_minus) and game context (home, days_rest).
# 'days_rest' is listed once only: selecting it twice produced a duplicate
# column, which in turn duplicated a row/column in the correlation matrix.

offense_cols = [
    'plus_minus', 'home', 'days_rest',
    'orb_pct avg', '_2p_avg', '_3p_avg', 'ft_avg',
    'ast_avg', 'tov_avg', 'pf_avg',
]
off_data_table = avg_data_table.loc[:, offense_cols]

In [None]:
# Rolling-average stats generated on Minnesota's defensive possessions, plus
# the target (plus_minus) and game context (home, days_rest).

defense_cols = [
    'plus_minus', 'home',
    'opp_2p_avg', 'opp_3p_avg', 'opp_ft_avg', 'opp_tov_avg', 'opp_pf_avg',
    'drb_avg', 'days_rest',
]
def_data_table = avg_data_table[defense_cols]

In [None]:
# Final table to persist: game identifiers, the target, days_rest and every
# rolling average.
# 'days_rest' is listed once only: selecting it twice stored a duplicate
# column, so `df['days_rest']` on the reloaded frame would return a two-column
# DataFrame instead of a Series.
final_cols = [
    'date', 'home', 'opp', 'plus_minus', 'days_rest',
    'orb_pct avg', '_2p_avg', '_3p_avg', 'ft_avg',
    'ast_avg', 'tov_avg', 'pf_avg',
    'opp_2p_avg', 'opp_3p_avg', 'opp_ft_avg', 'opp_tov_avg', 'opp_pf_avg',
    'drb_avg',
]
pickle_data_table = avg_data_table.loc[:, final_cols]

In [None]:
# Table for EDA: the target, days_rest and every rolling average.
# 'days_rest' is listed once only: selecting it twice produced a duplicate
# column, which is ambiguous for any plot or lookup that refers to the column by name.

eda_cols = [
    'plus_minus', 'days_rest',
    'orb_pct avg', '_2p_avg', '_3p_avg', 'ft_avg',
    'ast_avg', 'tov_avg', 'pf_avg',
    'opp_2p_avg', 'opp_3p_avg', 'opp_ft_avg', 'opp_tov_avg', 'opp_pf_avg',
    'drb_avg',
]
EDA_data_table = avg_data_table.loc[:, eda_cols]

In [None]:
# Persist the final table to 'final_data_table.pickle' (relative to the
# current working directory).
pickle_data_table.to_pickle('final_data_table.pickle')

In [None]:
# Scatter plot with a fitted regression line: days of rest (x) against plus/minus (y).
sns.lmplot(x='days_rest',y='plus_minus',data=avg_data_table)

In [None]:
# Regression plot: rolling defensive-rebound average (x) against plus/minus (y).
#
# sns.lmplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) only left an empty extra figure and the
# requested size was ignored. The 15x12 size is now requested through
# height/aspect (height in inches, width = height * aspect), and the title and
# axis labels are set through the returned grid.
sns.set(font_scale=1.4)
drb_grid = sns.lmplot(
    data=EDA_data_table,
    x='drb_avg',
    y='plus_minus',
    height=12,
    aspect=15 / 12,
)
drb_grid.set(title='Def Rebounds vs. Plus/Minus')
drb_grid.set_axis_labels("Def Rebound Avg", 'Plus / Minus');
# To save the chart: drb_grid.savefig('DRBvPTDIFF.png')

In [None]:
# Correlation heat map of the offensive subset; the colour scale is pinned to
# [-1, 1] and each cell is annotated with its coefficient.

off_fig, off_ax = plt.subplots(figsize=(40, 35))
sns.heatmap(
    off_data_table.corr(),
    cmap="seismic",
    annot=True,
    vmin=-1,
    vmax=1,
    ax=off_ax,
);

In [None]:
# Correlation heat map of the defensive subset; the colour scale is pinned to
# [-1, 1] and each cell is annotated with its coefficient.

def_fig, def_ax = plt.subplots(figsize=(40, 35))
sns.heatmap(
    def_data_table.corr(),
    cmap="seismic",
    annot=True,
    vmin=-1,
    vmax=1,
    ax=def_ax,
);

In [None]:
# Pairwise scatter/distribution grid over every column of the EDA table.
sns.pairplot(EDA_data_table)

In [None]:
# Numeric correlation matrix behind the offensive heat map.
off_data_table.corr()

In [None]:
# Numeric correlation matrix behind the defensive heat map.
def_data_table.corr()