### Import Modules

In [1]:
# general
import pandas as pd
import numpy as np
import requests as rq
import re
import random

#sklearn
from sklearn.model_selection import train_test_split, learning_curve, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, roc_curve, auc, recall_score, f1_score, accuracy_score, roc_auc_score, confusion_matrix, precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics, svm, exceptions, tree
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV, cross_validate

# scraping
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

# visualization imports
import matplotlib.pyplot as plt
import seaborn as sns

from ipywidgets import interact, interactive, HBox, VBox, interactive_output
from ipywidgets import widgets

from IPython.display import display
plt.style.use('fivethirtyeight')

#### Create list of MLB teams

In [45]:
teams = pd.read_csv('mlb_teams.csv')
teams.head()

Unnamed: 0,Teams,Name,Abbrev
0,Atlanta Braves,Braves,ATL
1,Baltimore Orioles,Orioles,BAL
2,Boston Red Sox,Red Sox,BOS
3,Chicago Cubs,Cubs,CHC
4,Chicago White Sox,White Sox,CWS


#### Get projected starting pitchers

In [46]:
page = rq.get('https://www.baseball-reference.com/previews/index.shtml')
soup = bs(page.content, 'html.parser')
proj_starters = soup.findAll('div', attrs={'class':'game_summaries'})
proj_starters = [re.findall('">(.*)</a', str(proj_starters))]
flattened_pitchers  = [val for sublist in proj_starters for val in sublist]
proj_sp = pd.DataFrame(flattened_pitchers, columns=['name'])
proj_sp['team'] = proj_sp.name.isin(teams.Name).astype(int)
proj_sp = proj_sp[proj_sp['team'] != 1]
proj_sp = proj_sp[proj_sp['name'] != 'Preview']
proj_sp = proj_sp.drop(['team'], axis=1)
proj_sp.head(32)

Unnamed: 0,name
3,Mike Soroka
4,Anibal Sanchez
8,Dario Agrazal
9,Luis Castillo
13,Zack Greinke
14,Masahiro Tanaka
18,Jacob Waguespack
19,Jakob Junis
23,Hyun-Jin Ryu
24,German Marquez


#### Get projected starting lineups

In [48]:
driver = webdriver.Chrome(ChromeDriverManager().install());
driver.implicitly_wait(30);
driver.get('https://www.fantasylabs.com/mlb/lineups/')
soup = bs(driver.page_source, 'html.parser')
proj_lineups = [text.contents for text in soup.findAll('span', attrs={'ng-click':'openPlayerDetails(player,sportevent.EventId, $event)'})]
flattened_lineups = [val for sublist in proj_lineups for val in sublist]
proj_hit = pd.DataFrame(flattened_lineups, columns=['name'])
driver.quit()
proj_hit.head()


Checking for mac64 chromedriver:75.0.3770.140 in cache
Driver found in /Users/matt/.wdm/chromedriver/75.0.3770.140/mac64/chromedriver


Unnamed: 0,name
0,JaCoby Jones
1,Nick Castellanos
2,Niko Goodrum
3,Brandon Dixon
4,Jeimer Candelario


#### Load player pool

In [49]:
player_pool = pd.read_csv('DKSalaries.csv')
player_pool.head()

Unnamed: 0,Position,Name + ID,Name,ID,Roster Position,Salary,Game Info,TeamAbbrev,AvgPointsPerGame
0,SP,Gerrit Cole (13052286),Gerrit Cole,13052286,P,12000,HOU@CLE 07/31/2019 07:10PM ET,HOU,26.68
1,SP,Jacob deGrom (13051651),Jacob deGrom,13051651,P,11800,NYM@CWS 07/31/2019 08:10PM ET,NYM,21.45
2,SP,Justin Verlander (13052287),Justin Verlander,13052287,P,11400,HOU@CLE 07/31/2019 07:10PM ET,HOU,27.0
3,SP,Mike Clevinger (13052288),Mike Clevinger,13052288,P,11400,HOU@CLE 07/31/2019 07:10PM ET,CLE,23.02
4,SP,Trevor Bauer (13052289),Trevor Bauer,13052289,P,11100,HOU@CLE 07/31/2019 07:10PM ET,CLE,21.21


#### Filter for starting pitchers

In [50]:
pitchers = player_pool[player_pool['Roster Position'] == 'P']
pitchers['Starter'] = pitchers.Name.isin(proj_sp.name).astype(int)
pitchers = pitchers[pitchers['Starter'] != 0]
pitchers = pitchers[['Name', 'Game Info', 'TeamAbbrev']]
pitchers.head(32)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Name,Game Info,TeamAbbrev
1,Jacob deGrom,NYM@CWS 07/31/2019 08:10PM ET,NYM
9,Jose Berrios,MIN@MIA 07/31/2019 07:10PM ET,MIN
13,Lucas Giolito,NYM@CWS 07/31/2019 08:10PM ET,CWS
18,Mike Minor,SEA@TEX 07/31/2019 08:05PM ET,TEX
39,Kyle Hendricks,CHC@STL 07/31/2019 08:15PM ET,CHC
61,Jeff Samardzija,SF@PHI 07/31/2019 07:05PM ET,SF
79,Miles Mikolas,CHC@STL 07/31/2019 08:15PM ET,STL
90,Vince Velasquez,SF@PHI 07/31/2019 07:05PM ET,PHI
100,Zach Plesac,HOU@CLE 07/31/2019 07:10PM ET,CLE
120,Jose Urquidy,HOU@CLE 07/31/2019 07:10PM ET,HOU


#### Filter for hitters in lineups

In [51]:
starters = player_pool[player_pool['Roster Position'] != 'P']
starters['Starter'] = starters.Name.isin(proj_hit.name).astype(int)
starters = starters[starters['Starter'] != 0]
starters = starters[['Name', 'Game Info', 'TeamAbbrev']]
starters.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Name,Game Info,TeamAbbrev
203,Christian Yelich,MIL@OAK 07/31/2019 10:07PM ET,MIL
207,George Springer,HOU@CLE 07/31/2019 07:10PM ET,HOU
210,Yordan Alvarez,HOU@CLE 07/31/2019 07:10PM ET,HOU
216,Rafael Devers,TB@BOS 07/31/2019 07:10PM ET,BOS
218,Francisco Lindor,HOU@CLE 07/31/2019 07:10PM ET,CLE


#### Match hitters to pitchers

In [52]:
matchups = pd.merge(starters, pitchers, on='Game Info')
matchups = matchups[matchups['TeamAbbrev_x'] != matchups['TeamAbbrev_y']]
matchups = matchups.drop(['Game Info', 'TeamAbbrev_x', 'TeamAbbrev_y'], axis=1)
matchups = matchups.rename(columns={'Name_x':'batter_name', 'Name_y':'pitcher_name'})
matchups.to_csv('matchups.csv')
matchups.head()

Unnamed: 0,batter_name,pitcher_name
1,Christian Yelich,Brett Anderson
3,Keston Hiura,Brett Anderson
4,Marcus Semien,Jordan Lyles
6,Mark Canha,Jordan Lyles
8,Matt Olson,Jordan Lyles
