# Who is going to be the MVP for 2023?

### Using public data, predict the next MVP of the National Hockey League (NHL)
#### Why?
Incentives & Cash Awards  
Reapplied to Multiple Sports  
Coaching Analysis  


In [1]:
import os
import shutil
import time
from io import StringIO

import numpy as np
import requests

In [4]:
# Years to loop over below: 2012 through 2022 (the range end, 2023, is excluded).
years = [*range(2012, 2023)]

## Getting the MVP Voting Data

In [5]:
# Template for the yearly awards-voting page; `{}` is replaced with the year.
url_start = "https://www.hockey-reference.com/awards/voting-{}.html"

# open() does not create missing folders, so make sure the target exists.
os.makedirs("hockey_mvp", exist_ok=True)

for year in years:
    url = url_start.format(year)

    # A timeout keeps a stalled connection from hanging the notebook forever.
    data = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error (e.g. rate limiting) instead of saving the
    # error page as if it were data; it would only break the parser later.
    data.raise_for_status()

    # Save the raw HTML so parsing can be re-run without downloading again.
    with open("hockey_mvp/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)

    # Random 2-15 second pause between requests.
    lag = np.random.uniform(low=2, high=15)
    time.sleep(lag)


# Parsing the votes table with BeautifulSoup

In [6]:
from bs4 import BeautifulSoup

In [7]:
# Worked example on a single saved page (2012) before looping over every year.
with open("hockey_mvp/2012.html", encoding="utf-8") as example_file:
    page = example_file.read()

# Parse the HTML, then drop the first row carrying the "over_header" class
# so it does not end up in the table that is read into pandas.
soup = BeautifulSoup(page, 'html.parser')
over_header_row = soup.find('tr', class_="over_header")
over_header_row.decompose()

In [9]:
# Take the first element whose id is "hart_stats"; this is what pandas reads next.
hart_matches = soup.find_all(id="hart_stats")
mvp_table = hart_matches[0]

In [10]:
import pandas as pd

In [11]:
# Convert the parsed table back to HTML text and let pandas read it.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas; file-like input works on all versions.
mvp_2012 = pd.read_html(StringIO(str(mvp_table)))[0]

In [12]:
# Show only the first row as a quick sanity check of the parsed columns.
mvp_2012.head(n=1)

Unnamed: 0,Place,Player,Age,Tm,Pos,Votes,Vote%,1st,2nd,3rd,...,+/-,W,L,T/O,GAA,SV%,OPS,DPS,GPS,PS
0,1,Evgeni Malkin,25,PIT,C,1473,98.86,144,4,1,...,18,,,,,,13.4,2.3,0.0,15.7


In [13]:
# Tag every row with the year so rows stay identifiable once years are stacked.
mvp_2012 = mvp_2012.assign(Year=2012)

In [14]:
# Preview the first five rows, now including the Year column.
mvp_2012.head(n=5)

Unnamed: 0,Place,Player,Age,Tm,Pos,Votes,Vote%,1st,2nd,3rd,...,W,L,T/O,GAA,SV%,OPS,DPS,GPS,PS,Year
0,1,Evgeni Malkin,25,PIT,C,1473,98.86,144,4,1,...,,,,,,13.4,2.3,0.0,15.7,2012
1,2,Steven Stamkos,21,TBL,C,598,40.13,1,54,24,...,,,,,,12.8,1.9,0.0,14.6,2012
2,3,Henrik Lundqvist,29,NYR,G,556,37.32,3,35,36,...,39.0,18.0,5.0,1.97,0.93,0.0,0.0,14.1,14.1,2012
3,4,Claude Giroux,24,PHI,RW,458,30.74,0,25,34,...,,,,,,8.9,1.6,0.0,10.6,2012
4,5,Jonathan Quick,26,LAK,G,357,23.96,1,22,23,...,35.0,21.0,13.0,1.95,0.929,0.0,0.0,14.7,14.7,2012


In [16]:
# One DataFrame of MVP voting results per year; combined in a later cell.
dfs = []

for year in years:
    # Read the HTML page saved for this year.
    with open("hockey_mvp/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    # Remove the extra "over_header" row so pandas sees a single header row.
    soup.find('tr', class_="over_header").decompose()
    mvp_table = soup.find_all(id="hart_stats")[0]

    # Wrap the markup in StringIO: passing a literal HTML string to read_html
    # is deprecated in recent pandas, while file-like input always works.
    mvp_df = pd.read_html(StringIO(str(mvp_table)))[0]
    mvp_df["Year"] = year
    dfs.append(mvp_df)


In [139]:
# print(page)

In [141]:
# Re-parse whatever `page` currently holds; handy for inspecting the markup.
soup = BeautifulSoup(markup=page, features='html.parser')
# print(soup)


# Combining MVP Votes with Pandas

In [19]:
# Stack the per-year frames row-wise into one DataFrame.
# Each frame keeps its own row index, so index values repeat across years.
mvps = pd.concat(dfs, axis=0)


In [20]:
# Show the last five rows of the combined frame.
mvps.tail(n=5)

Unnamed: 0,Place,Player,Age,Tm,Pos,Votes,Vote%,1st,2nd,3rd,...,W,L,T/O,GAA,SV%,OPS,DPS,GPS,PS,Year
10,11,Steven Stamkos,31,TBL,C,6,0.31,0,0,0,...,,,,,,10.1,2.4,0.0,12.5,2022
11,12,Aleksander Barkov,26,FLA,C,4,0.21,0,0,0,...,,,,,,8.4,2.2,0.0,10.6,2022
12,13,Jason Robertson,22,DAL,LW,3,0.15,0,0,0,...,,,,,,7.7,2.4,0.0,10.1,2022
13,14,J.T. Miller,28,VAN,C,1,0.05,0,0,0,...,,,,,,8.1,2.5,0.0,10.6,2022
14,14,Matthew Tkachuk,24,CGY,RW,1,0.05,0,0,0,...,,,,,,10.1,3.5,0.0,13.7,2022


In [21]:
# Persist the combined voting data; the row index is written as the first column.
mvps_csv_path = "mvps.csv"
mvps.to_csv(mvps_csv_path)

## Getting the Player Data

In [22]:
# Download the skater statistics page for every year.

# Template for the yearly skaters page; `{}` is replaced with the year.
player_stats_url = "https://www.hockey-reference.com/leagues/NHL_{}_skaters.html"

# open() does not create missing folders, so make sure the target exists.
os.makedirs("hockey_player", exist_ok=True)

for year in years:
    url = player_stats_url.format(year)

    # A timeout keeps a stalled connection from hanging the notebook forever.
    data = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error (e.g. rate limiting) instead of saving the
    # error page as if it were data.
    data.raise_for_status()

    # Save the raw HTML so parsing can be re-run without downloading again.
    with open("hockey_player/{}.html".format(year), "w", encoding='utf-8') as f:
        f.write(data.text)

    # Random 2-15 second pause between requests.
    lag = np.random.uniform(low=2, high=15)
    time.sleep(lag)

# Using Selenium to Scrape a JavaScript Page

In [23]:
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

# install selenium chrome driver from https://chromedriver.chromium.org/downloads
# YouTube Video - https://www.youtube.com/watch?v=dz59GsdvUF8
# xattr -d com.apple.quarantine chromedriver

In [24]:
from selenium.webdriver.chrome.service import Service

# Path to the ChromeDriver binary. Set the CHROMEDRIVER_PATH environment
# variable to override the default Windows location used here.
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH", r"C:\Webdrivers\chromedriver.exe"
)

# Passing `executable_path=` straight to webdriver.Chrome is deprecated in
# Selenium 4 (and later removed); the driver location goes through a Service.
driver = webdriver.Chrome(service=Service(executable_path=chromedriver_path))

  driver = webdriver.Chrome(executable_path=r"C:\Webdrivers\chromedriver.exe")


In [None]:
# import time 

# year = 1991
# url  = player_stats_url.format(year)

# driver.get(url)
# driver.execute_script("window.scrollTo(1,10000)")
# time.sleep(2)

# html = driver.page_source


In [25]:
# Load each year's skater page in the Selenium-driven browser and save the
# rendered HTML. The files land in the same hockey_player/ paths written by the
# requests-based download, so they replace those copies.
for year in years:
    season_url = player_stats_url.format(year)
    driver.get(season_url)

    # Scroll down the page, then wait two seconds before grabbing the source.
    driver.execute_script("window.scrollTo(1,10000)")
    time.sleep(2)

    rendered_html = driver.page_source
    with open("hockey_player/{}.html".format(year), "w+", encoding='utf-8') as out_file:
        out_file.write(rendered_html)


# Parsing Stats with Beautiful Soup

In [26]:
# Build one DataFrame of skater stats per saved yearly page.
dfs = []
for year in years:
    # Read the HTML page saved for this year.
    with open("hockey_player/{}.html".format(year), encoding='utf-8') as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')

    # Remove every row with class "thead". soup.find() only returns the first
    # match, so using it here left any further header rows in the table, where
    # pandas would read them as data.
    for header_row in soup.find_all('tr', class_="thead"):
        header_row.decompose()

    player_table = soup.find_all(id="stats")[0]

    # Wrap the markup in StringIO: passing a literal HTML string to read_html
    # is deprecated in recent pandas, while file-like input always works.
    player_df = pd.read_html(StringIO(str(player_table)))[0]
    player_df["Year"] = year
    dfs.append(player_df)


# Combining Player Stats with Pandas

In [27]:
# Stack the per-year skater frames row-wise into one DataFrame.
players = pd.concat(dfs, axis=0)

In [28]:
# Preview the first five rows of the combined skater data.
players.head(n=5)

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Scoring,Scoring,Scoring,Scoring,Scoring,...,Shot Data,Shot Data,Ice Time,Ice Time,Unnamed: 23_level_0,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0,Unnamed: 27_level_0,Year
Unnamed: 0_level_1,Rk,Player,Age,Tm,Pos,GP,G,A,PTS,+/-,...,S,S%,TOI,ATOI,BLK,HIT,FOW,FOL,FO%,Unnamed: 21_level_1
0,1,Justin Abdelkader,24,DET,LW,81,8,14,22,4,...,121,6.6,997,12:19,42,148,239,213,52.9,2012
1,2,Luke Adam,21,BUF,LW,52,10,10,20,-6,...,89,11.2,645,12:24,15,26,114,145,44.0,2012
2,3,Craig Adams,34,PIT,RW,82,5,13,18,-6,...,76,6.6,925,11:17,45,162,132,160,45.2,2012
3,4,Andrew Alberts,30,VAN,D,44,2,1,3,4,...,19,10.5,629,14:18,39,91,0,0,,2012
4,5,Daniel Alfredsson*,39,OTT,RW,75,27,32,59,16,...,191,14.1,1421,18:57,31,52,20,26,43.5,2012


In [29]:
# Persist the combined skater data; the row index is written as the first column.
players_csv_path = "players.csv"
players.to_csv(players_csv_path)

# Downloading Team Data

In [89]:
# Template for the yearly standings page; `{}` is filled in with `.format(year)`.
team_stats_url = (
    "https://www.hockey-reference.com"
    "/leagues/NHL_{}_standings.html"
)

In [90]:
# open() does not create missing folders, so make sure the target exists.
os.makedirs("hockey_team", exist_ok=True)

for year in years:
    url = team_stats_url.format(year)

    # A timeout keeps a stalled connection from hanging the notebook forever.
    data = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error (e.g. rate limiting) instead of saving the
    # error page as if it were data.
    data.raise_for_status()

    # Save the raw HTML so parsing can be re-run without downloading again.
    with open("hockey_team/{}.html".format(year), "w", encoding='utf-8') as f:
        f.write(data.text)

    # Random 2-15 second pause between requests.
    lag = np.random.uniform(low=2, high=15)
    time.sleep(lag)

## Parsing team data with Beautiful Soup

In [132]:
# Build one DataFrame per conference standings table per year.
dfs = []

# Only 2014-2019 and 2022 are parsed.
# NOTE(review): 2012-2013 and 2020-2021 are skipped, presumably because those
# pages lack the standings_EAS / standings_WES tables; confirm before extending.
years2 = list(range(2014,2020))
years3 = list(range(2022,2023))
for year in years2 + years3:
    # Read the standings page saved for this year.
    with open("hockey_team/{}.html".format(year), encoding='utf-8') as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')

    # Remove every row with class "thead". soup.find() only returns the first
    # match, so using it here left the remaining header rows in the tables,
    # where pandas would read them as data.
    for header_row in soup.find_all('tr', class_="thead"):
        header_row.decompose()

    # Both conference tables are handled identically. The first column has no
    # header text, so pandas names it "Unnamed: 0"; it is renamed to "Team"
    # after the frames are combined.
    for table_id in ("standings_EAS", "standings_WES"):
        conference_table = soup.find_all(id=table_id)[0]
        # Wrap the markup in StringIO: passing a literal HTML string to
        # read_html is deprecated in recent pandas.
        conference_df = pd.read_html(StringIO(str(conference_table)))[0]
        conference_df["Year"] = year
        dfs.append(conference_df)
    

In [129]:
# Stack the per-year, per-conference standings frames row-wise.
teams = pd.concat(dfs, axis=0)

In [135]:
# Give the header-less first column (read by pandas as "Unnamed: 0") a real name.
team_column_names = {'Unnamed: 0': 'Team'}
teams = teams.rename(columns=team_column_names)


In [136]:
# Show the last five rows of the combined standings.
teams.tail(n=5)

Unnamed: 0,Team,GP,W,L,OL,PTS,PTS%,GF,GA,SRS,SOS,RPt%,ROW,RgRec,RgPt%,Year,RW
13,Vegas Golden Knights,82,43,31,8,94,0.573,266,248,0.21,-0.01,0.524,,34-31-17,0.518,2022,34
14,Vancouver Canucks,82,40,30,12,92,0.561,249,236,0.16,0.0,0.5,,32-30-20,0.512,2022,32
15,San Jose Sharks,82,32,37,13,77,0.47,214,264,-0.58,0.03,0.39,,22-37-23,0.409,2022,22
16,Anaheim Ducks,82,31,37,14,76,0.463,232,271,-0.45,0.03,0.384,,22-37-23,0.409,2022,22
17,Seattle Kraken,82,27,49,6,60,0.366,216,285,-0.8,0.04,0.317,,23-49-10,0.341,2022,23


In [137]:
# Show the first five rows of the combined standings.
teams.head(n=5)

Unnamed: 0,Team,GP,W,L,OL,PTS,PTS%,GF,GA,SRS,SOS,RPt%,ROW,RgRec,RgPt%,Year,RW
0,Boston Bruins*,82,54,19,9,117,0.713,261,177,0.92,-0.11,0.677,51,47-19-16,0.671,2014,
1,Tampa Bay Lightning*,82,46,27,9,101,0.616,240,215,0.25,-0.06,0.549,38,32-27-23,0.53,2014,
2,Montreal Canadiens*,82,46,28,8,100,0.61,215,204,0.08,-0.05,0.543,40,33-28-21,0.53,2014,
3,Detroit Red Wings*,82,39,28,15,93,0.567,222,230,-0.14,-0.04,0.5,34,30-28-24,0.512,2014,
4,Ottawa Senators,82,37,31,14,88,0.537,236,265,-0.37,-0.02,0.451,30,27-31-24,0.476,2014,


In [138]:
# Persist the combined standings; the row index is written as the first column.
teams_csv_path = "teams.csv"
teams.to_csv(teams_csv_path)