In [633]:
from bs4 import BeautifulSoup
import requests
from IPython.core.display import display, HTML
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error

In [144]:
# Base url used for 'get_player_stats_url' function
url_test = 'https://www.hockey-reference.com/leagues'

In [151]:
# Test list created for 'get_player_stats_url' function
test_list1 = ["/NHL_2019_skaters","/NHL_2020-skaters","/NHL_2021_skaters"]

In [153]:
# Function to get player stats url based on passed in list of season years
def get_player_stats_url(list_):
    for i in list_:
        print(url_test + i)
test_list(test_list1)

https://www.hockey-reference.com/leagues/NHL_2019_skaters
https://www.hockey-reference.com/leagues/NHL_2020-skaters
https://www.hockey-reference.com/leagues/NHL_2021_skaters


In [158]:
url_2021_skater_stats = 'https://www.hockey-reference.com/leagues/NHL_2021_skaters.html'
response = requests.get(url)

In [155]:
response.status_code

200

In [9]:
page = response.text

In [10]:
soup = BeautifulSoup(page, "lxml")

In [11]:
soup

<!DOCTYPE html>
<html class="no-js" data-root="/home/hr/build" data-version="klecko-" itemscope="" itemtype="https://schema.org/WebSite" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="ie=edge" http-equiv="x-ua-compatible"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport"/>
<link href="https://d2p3bygnnzw9w3.cloudfront.net/req/202109021" rel="dns-prefetch"/>
<!-- Quantcast Choice. Consent Manager Tag v2.0 (for TCF 2.0) -->
<script async="true" type="text/javascript">
    (function() {
	var host = window.location.hostname;
	var element = document.createElement('script');
	var firstScript = document.getElementsByTagName('script')[0];
	var url = 'https://quantcast.mgr.consensu.org'
	    .concat('/choice/', 'XwNYEpNeFfhfr', '/', host, '/choice.js')
	var uspTries = 0;
	var uspTriesLimit = 3;
	element.async = true;
	element.type = 'text/javascript';
	element.src = url;
	
	firstScript.parentNode.insertBefore(element, firstScript);
	
	function 

In [12]:
soup.prettify()



In [13]:
stats_table = soup.find(id="stats")

In [14]:
rows = [row for row in stats_table.find_all("tr")]

In [163]:
players = {}

for row in rows[2:]:
    class_ = row.get("class")
    if class_ is not None and class_[0] == "thead":
        continue
    items = row.find_all('td')
    name = items[0].text
    link = items[0].find('a')['href']
    players[name] = [link] + [i.text for i in items[1:]]
    
players

{'Justin Abdelkader': ['/players/a/abdelju01.html',
  '32',
  'DET',
  'LW',
  '49',
  '0',
  '3',
  '3',
  '-14',
  '25',
  '-0.9',
  '0',
  '0',
  '0',
  '0',
  '3',
  '0',
  '0',
  '40',
  '0.0',
  '565',
  '11:32',
  '26',
  '103',
  '38',
  '31',
  '55.1'],
 'Pontus Aberg': ['/players/a/abergpo01.html',
  '26',
  'TOR',
  'LW',
  '5',
  '0',
  '1',
  '1',
  '0',
  '0',
  '0.0',
  '0',
  '0',
  '0',
  '0',
  '1',
  '0',
  '0',
  '4',
  '0.0',
  '44',
  '8:42',
  '1',
  '1',
  '0',
  '0',
  ''],
 'Vitaly Abramov': ['/players/a/abramvi01.html',
  '21',
  'OTT',
  'RW',
  '2',
  '1',
  '0',
  '1',
  '0',
  '2',
  '0.2',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '3',
  '33.3',
  '12',
  '5:47',
  '0',
  '0',
  '0',
  '0',
  ''],
 'Noel Acciari': ['/players/a/acciano01.html',
  '28',
  'FLA',
  'C',
  '66',
  '20',
  '7',
  '27',
  '2',
  '21',
  '2.9',
  '19',
  '0',
  '1',
  '5',
  '7',
  '0',
  '0',
  '108',
  '18.5',
  '1054',
  '15:58',
  '103',
  '121',
  '381',
  '414',


In [164]:
players_df = pd.DataFrame.from_dict(players, orient = "index")
players_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
Justin Abdelkader,/players/a/abdelju01.html,32,DET,LW,49,0,3,3,-14,25,...,0,40,0.0,565,11:32,26,103,38,31,55.1
Pontus Aberg,/players/a/abergpo01.html,26,TOR,LW,5,0,1,1,0,0,...,0,4,0.0,44,8:42,1,1,0,0,
Vitaly Abramov,/players/a/abramvi01.html,21,OTT,RW,2,1,0,1,0,2,...,0,3,33.3,12,5:47,0,0,0,0,
Noel Acciari,/players/a/acciano01.html,28,FLA,C,66,20,7,27,2,21,...,0,108,18.5,1054,15:58,103,121,381,414,47.9
Andrew Agozzino,/players/a/agozzan01.html,29,ANA,LW,5,1,0,1,3,0,...,0,2,50.0,50,9:56,2,6,8,8,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Travis Zajac,/players/z/zajactr01.html,34,NJD,C,69,9,16,25,-12,28,...,2,72,12.5,1180,17:06,39,58,576,514,52.8
Mika Zibanejad,/players/z/zibanmi01.html,26,NYR,C,57,41,34,75,9,14,...,2,208,19.7,1233,21:38,49,51,592,612,49.2
Mats Zuccarello,/players/z/zuccama01.html,32,MIN,LW,65,15,22,37,-9,18,...,0,96,15.6,1036,15:56,36,13,9,5,64.3
Jason Zucker,/players/z/zuckeja01.html,28,PIT,LW,15,6,6,12,-1,2,...,0,36,16.7,265,17:40,3,18,0,1,0.0


In [191]:
table_headers = []
for i in rows[1].find_all('th')[2:]:
    table_headers.append(i.get('aria-label'))
table_headers

['Age at time of finale',
 'Tm',
 'Pos',
 'Games Played',
 'Goals',
 'Assists',
 'Points',
 'Plus/Minus',
 'Penalties in Minutes',
 'Point Shares',
 'Even Strength Goals',
 'Power Play Goals',
 'Short-Handed Goals',
 'Game-Winning Goals',
 'Even Strength Assists',
 'Power Play Assists',
 'Short-Handed Assists',
 'Shots',
 'Shooting Percentage',
 'Time on Ice',
 'Average Time on Ice',
 'Blocks',
 'Hits',
 'Faceoff Wins',
 'Faceoff Losses',
 'Faceoff Percentage']

In [192]:
table_headers.insert(0,"Link")

In [193]:
table_headers

['Link',
 'Age at time of finale',
 'Tm',
 'Pos',
 'Games Played',
 'Goals',
 'Assists',
 'Points',
 'Plus/Minus',
 'Penalties in Minutes',
 'Point Shares',
 'Even Strength Goals',
 'Power Play Goals',
 'Short-Handed Goals',
 'Game-Winning Goals',
 'Even Strength Assists',
 'Power Play Assists',
 'Short-Handed Assists',
 'Shots',
 'Shooting Percentage',
 'Time on Ice',
 'Average Time on Ice',
 'Blocks',
 'Hits',
 'Faceoff Wins',
 'Faceoff Losses',
 'Faceoff Percentage']

In [19]:
players_df.columns = table_headers

In [20]:
players_df.rename({'Age at time of finale': 'Age'}, axis=1, inplace = True)

In [60]:
url_skater_stats = 'https://www.hockey-reference.com/leagues/NHL_2021_skaters.html'
response = requests.get(url_skater_stats)
print(f"status code {response}")
page = response.text
soup = BeautifulSoup(page, "lxml")

status code <Response [200]>


In [725]:
stats_table = soup.find(id="stats")

rows = [row for row in stats_table.find_all("tr")]

players = {}

for row in rows[2:]:
    class_ = row.get("class")
    if class_ is not None and class_[0] == "thead":
        continue
    items = row.find_all('td')
    name = items[0].text
    link = items[0].find('a')['href']
    players[name] = [link] + [i.text for i in items[1:]]
    

df_skater_stats_2021 = pd.DataFrame.from_dict(players, orient = "index")

table_headers = []
for i in rows[1].find_all('th')[2:]:
    table_headers.append(i.get('aria-label'))
    
table_headers.insert(0,"Link")

df_skater_stats_2021.columns = table_headers

df_skater_stats_2021.rename({'Age at time of finale': 'Age'}, axis=1, inplace = True)

df_skater_stats_2021.reset_index(inplace = True)

df_skater_stats_2021.rename(columns={"index":"Player"}, inplace = True)

df_skater_stats_2021.head()

Unnamed: 0,Player,Link,Age,Tm,Pos,Games Played,Goals,Assists,Points,Plus/Minus,...,Short-Handed Assists,Shots,Shooting Percentage,Time on Ice,Average Time on Ice,Blocks,Hits,Faceoff Wins,Faceoff Losses,Faceoff Percentage
0,Vitaly Abramov,/players/a/abramvi01.html,22,OTT,RW,2,0,0,0,0,...,0,0,,19,9:43,0,0,0,0,
1,Noel Acciari,/players/a/acciano01.html,29,FLA,C,41,4,7,11,0,...,1,68,5.9,631,15:24,63,91,272,247,52.4
2,Calen Addison,/players/a/addisca01.html,20,MIN,D,3,0,0,0,0,...,0,4,0.0,56,18:31,1,2,0,0,
3,Kenny Agostino,/players/a/agostke01.html,28,TOR,LW,1,0,0,0,0,...,0,0,,4,4:08,0,0,0,0,
4,Andrew Agozzino,/players/a/agozzan01.html,30,ANA,LW,3,0,1,1,0,...,0,3,0.0,37,12:11,7,6,16,16,50.0


In [726]:
url_skater_stats = 'https://www.hockey-reference.com/leagues/NHL_2020_skaters.html'
response = requests.get(url_skater_stats)
print(f"status code {response}")
page = response.text
soup = BeautifulSoup(page, "lxml")

status code <Response [200]>


In [727]:
stats_table = soup.find(id="stats")

rows = [row for row in stats_table.find_all("tr")]

players = {}

for row in rows[2:]:
    class_ = row.get("class")
    if class_ is not None and class_[0] == "thead":
        continue
    items = row.find_all('td')
    name = items[0].text
    link = items[0].find('a')['href']
    players[name] = [link] + [i.text for i in items[1:]]
    

df_skater_stats_1920 = pd.DataFrame.from_dict(players, orient = "index")

table_headers = []
for i in rows[1].find_all('th')[2:]:
    table_headers.append(i.get('aria-label'))
    
table_headers.insert(0,"Link")

df_skater_stats_1920.columns = table_headers

df_skater_stats_1920.rename({'Age at time of finale': 'Age'}, axis=1, inplace = True)

df_skater_stats_1920.reset_index(inplace = True)

df_skater_stats_1920.rename(columns={"index":"Player"}, inplace = True)

df_skater_stats_1920.head()

Unnamed: 0,Player,Link,Age,Tm,Pos,Games Played,Goals,Assists,Points,Plus/Minus,...,Short-Handed Assists,Shots,Shooting Percentage,Time on Ice,Average Time on Ice,Blocks,Hits,Faceoff Wins,Faceoff Losses,Faceoff Percentage
0,Justin Abdelkader,/players/a/abdelju01.html,32,DET,LW,49,0,3,3,-14,...,0,40,0.0,565,11:32,26,103,38,31,55.1
1,Pontus Aberg,/players/a/abergpo01.html,26,TOR,LW,5,0,1,1,0,...,0,4,0.0,44,8:42,1,1,0,0,
2,Vitaly Abramov,/players/a/abramvi01.html,21,OTT,RW,2,1,0,1,0,...,0,3,33.3,12,5:47,0,0,0,0,
3,Noel Acciari,/players/a/acciano01.html,28,FLA,C,66,20,7,27,2,...,0,108,18.5,1054,15:58,103,121,381,414,47.9
4,Andrew Agozzino,/players/a/agozzan01.html,29,ANA,LW,5,1,0,1,3,...,0,2,50.0,50,9:56,2,6,8,8,50.0


In [719]:
df_shifts_1920 = pd.read_csv('Shifts_1920.csv')
df_shifts_1920 = df_shifts_1920[["Unnamed: 2", "Unnamed: 5"]]
df_shifts_1920.drop([0,1], inplace = True)
df_shifts_1920.rename(columns={'Unnamed: 2': 'Player', 'Unnamed: 5': 'Avg Shift Length'}, inplace=True)

def name_conversion(name):
    if name is None or len(str(name).split("\\")) != 2:
        return None
    else:
        shoots = str(name).split("\\")
        return shoots[0]

df_shifts_1920["Player"] = df_shifts_1920["Player"].apply(name_conversion)

df_shifts_1920.reset_index(inplace = True)
del df_shifts_1920["index"]
df_shifts_1920.head()

Unnamed: 0,Player,Avg Shift Length
0,Justin Abdelkader,0:44
1,Pontus Aberg,0:50
2,Vitaly Abramov,0:31
3,Noel Acciari,0:51
4,Andrew Agozzino,0:35


In [728]:
df_skater_stats_1920 = df_skater_stats_1920.merge(df_shifts_1920,how = "left", on = "Player")

In [729]:
df_skater_stats_1920.head()

Unnamed: 0,Player,Link,Age,Tm,Pos,Games Played,Goals,Assists,Points,Plus/Minus,...,Shots,Shooting Percentage,Time on Ice,Average Time on Ice,Blocks,Hits,Faceoff Wins,Faceoff Losses,Faceoff Percentage,Avg Shift Length
0,Justin Abdelkader,/players/a/abdelju01.html,32,DET,LW,49,0,3,3,-14,...,40,0.0,565,11:32,26,103,38,31,55.1,0:44
1,Pontus Aberg,/players/a/abergpo01.html,26,TOR,LW,5,0,1,1,0,...,4,0.0,44,8:42,1,1,0,0,,0:50
2,Vitaly Abramov,/players/a/abramvi01.html,21,OTT,RW,2,1,0,1,0,...,3,33.3,12,5:47,0,0,0,0,,0:31
3,Noel Acciari,/players/a/acciano01.html,28,FLA,C,66,20,7,27,2,...,108,18.5,1054,15:58,103,121,381,414,47.9,0:51
4,Andrew Agozzino,/players/a/agozzan01.html,29,ANA,LW,5,1,0,1,3,...,2,50.0,50,9:56,2,6,8,8,50.0,0:35


In [720]:
df_shifts_2021 = pd.read_csv('Shifts_2021.csv')
df_shifts_2021 = df_shifts_2021[["Unnamed: 2", "Unnamed: 5"]]
df_shifts_2021.drop([0,1], inplace = True)
df_shifts_2021.rename(columns={'Unnamed: 2': 'Player', 'Unnamed: 5': 'Avg Shift Length'}, inplace=True)

def name_conversion(name):
    if name is None or len(str(name).split("\\")) != 2:
        return None
    else:
        shoots = str(name).split("\\")
        return shoots[0]

df_shifts_2021["Player"] = df_shifts_2021["Player"].apply(name_conversion)

df_shifts_2021.reset_index(inplace = True)
del df_shifts_2021["index"]
df_shifts_2021.head()

Unnamed: 0,Player,Avg Shift Length
0,Vitaly Abramov,0:46
1,Noel Acciari,0:32
2,Calen Addison,0:44
3,Kenny Agostino,0:43
4,Andrew Agozzino,0:53


In [730]:
df_skater_stats_2021 = df_skater_stats_2021.merge(df_shifts_2021,how = "left", on = "Player")

In [731]:
df_skater_stats_2021.head()

Unnamed: 0,Player,Link,Age,Tm,Pos,Games Played,Goals,Assists,Points,Plus/Minus,...,Shots,Shooting Percentage,Time on Ice,Average Time on Ice,Blocks,Hits,Faceoff Wins,Faceoff Losses,Faceoff Percentage,Avg Shift Length
0,Vitaly Abramov,/players/a/abramvi01.html,22,OTT,RW,2,0,0,0,0,...,0,,19,9:43,0,0,0,0,,0:46
1,Noel Acciari,/players/a/acciano01.html,29,FLA,C,41,4,7,11,0,...,68,5.9,631,15:24,63,91,272,247,52.4,0:32
2,Calen Addison,/players/a/addisca01.html,20,MIN,D,3,0,0,0,0,...,4,0.0,56,18:31,1,2,0,0,,0:44
3,Kenny Agostino,/players/a/agostke01.html,28,TOR,LW,1,0,0,0,0,...,0,,4,4:08,0,0,0,0,,0:43
4,Andrew Agozzino,/players/a/agozzan01.html,30,ANA,LW,3,0,1,1,0,...,3,0.0,37,12:11,7,6,16,16,50.0,0:53


In [737]:
df_skater_stats_all = df_skater_stats_1920.append(df_skater_stats_2021)
df_skater_stats_all.head()

Unnamed: 0,Player,Link,Age,Tm,Pos,Games Played,Goals,Assists,Points,Plus/Minus,...,Shots,Shooting Percentage,Time on Ice,Average Time on Ice,Blocks,Hits,Faceoff Wins,Faceoff Losses,Faceoff Percentage,Avg Shift Length
0,Justin Abdelkader,/players/a/abdelju01.html,32,DET,LW,49,0,3,3,-14,...,40,0.0,565,11:32,26,103,38,31,55.1,0:44
1,Pontus Aberg,/players/a/abergpo01.html,26,TOR,LW,5,0,1,1,0,...,4,0.0,44,8:42,1,1,0,0,,0:50
2,Vitaly Abramov,/players/a/abramvi01.html,21,OTT,RW,2,1,0,1,0,...,3,33.3,12,5:47,0,0,0,0,,0:31
3,Noel Acciari,/players/a/acciano01.html,28,FLA,C,66,20,7,27,2,...,108,18.5,1054,15:58,103,121,381,414,47.9,0:51
4,Andrew Agozzino,/players/a/agozzan01.html,29,ANA,LW,5,1,0,1,3,...,2,50.0,50,9:56,2,6,8,8,50.0,0:35


In [738]:
df_skater_stats_all[df_skater_stats_all["Player"]=="Noel Acciari"]

Unnamed: 0,Player,Link,Age,Tm,Pos,Games Played,Goals,Assists,Points,Plus/Minus,...,Shots,Shooting Percentage,Time on Ice,Average Time on Ice,Blocks,Hits,Faceoff Wins,Faceoff Losses,Faceoff Percentage,Avg Shift Length
3,Noel Acciari,/players/a/acciano01.html,28,FLA,C,66,20,7,27,2,...,108,18.5,1054,15:58,103,121,381,414,47.9,0:51
1,Noel Acciari,/players/a/acciano01.html,29,FLA,C,41,4,7,11,0,...,68,5.9,631,15:24,63,91,272,247,52.4,0:32


In [16]:
def get_player_details(link):
    base_url = "https://www.hockey-reference.com"
    url = base_url + link

    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, "lxml")

    player_detail_dict = {e['itemprop'] : e.text.strip() for e in soup.find(id = "meta").find_all(attrs={'itemprop':True})}
    
    try:
        shoots = soup.find(id = "meta").find("strong", text = 'Shoots').next_sibling.strip()
        player_detail_dict["shoots"] = shoots
        return player_detail_dict
    except:
        player_detail_dict["shoots"] = None
        return player_detail_dict

In [74]:
player_details_list = []

for link in df_skater_stats_all["Link"].unique():
    player_details_list.append(get_player_details(link))
player_details_list

[{'name': 'Justin Abdelkader',
  'height': '6-2',
  'weight': '213lb',
  'birthDate': 'February 25,\xa01987',
  'birthPlace': 'in\xa0Muskegon,\xa0Michigan',
  'shoots': ': Left'},
 {'name': 'Pontus Aberg',
  'height': '6-0',
  'weight': '194lb',
  'affiliation': 'Ottawa Senators',
  'birthDate': 'September 23,\xa01993',
  'birthPlace': 'in\xa0Stockholm,\xa0Sweden',
  'shoots': ': Right'},
 {'name': 'Noel Acciari',
  'height': '5-10',
  'weight': '209lb',
  'affiliation': 'Florida Panthers',
  'birthDate': 'December 1,\xa01991',
  'birthPlace': 'in\xa0Johnston,\xa0Rhode Island',
  'shoots': ': Right'},
 {'name': 'Kenny Agostino',
  'height': '6-0',
  'weight': '199lb',
  'birthDate': 'April 30,\xa01992',
  'birthPlace': 'in\xa0Morristown,\xa0New Jersey',
  'shoots': ': Left'},
 {'name': 'Sebastian Aho',
  'height': '5-10',
  'weight': '184lb',
  'affiliation': 'New York Islanders',
  'birthDate': 'February 17,\xa01996',
  'birthPlace': 'in\xa0Umea,\xa0Sweden',
  'shoots': ': Left'},
 {'

In [75]:
df_player_details = pd.DataFrame(player_details_list)

In [76]:
df_player_details.head()

Unnamed: 0,name,height,weight,birthDate,birthPlace,shoots,affiliation,relatedTo,deathDate,deathPlace,additionalName
0,Justin Abdelkader,6-2,213lb,"February 25, 1987","in Muskegon, Michigan",: Left,,,,,
1,Pontus Aberg,6-0,194lb,"September 23, 1993","in Stockholm, Sweden",: Right,Ottawa Senators,,,,
2,Noel Acciari,5-10,209lb,"December 1, 1991","in Johnston, Rhode Island",: Right,Florida Panthers,,,,
3,Kenny Agostino,6-0,199lb,"April 30, 1992","in Morristown, New Jersey",: Left,,,,,
4,Sebastian Aho,5-10,184lb,"February 17, 1996","in Umea, Sweden",: Left,New York Islanders,,,,


In [77]:
df_player_details.drop(["birthDate","affiliation","relatedTo","deathDate","deathPlace","additionalName"], axis = 1, inplace = True)

In [78]:
df_player_details.head()

Unnamed: 0,name,height,weight,birthPlace,shoots
0,Justin Abdelkader,6-2,213lb,"in Muskegon, Michigan",: Left
1,Pontus Aberg,6-0,194lb,"in Stockholm, Sweden",: Right
2,Noel Acciari,5-10,209lb,"in Johnston, Rhode Island",: Right
3,Kenny Agostino,6-0,199lb,"in Morristown, New Jersey",: Left
4,Sebastian Aho,5-10,184lb,"in Umea, Sweden",: Left


In [79]:
df_player_details.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1295 entries, 0 to 1294
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        1295 non-null   object
 1   height      1290 non-null   object
 2   weight      1290 non-null   object
 3   birthPlace  1295 non-null   object
 4   shoots      1286 non-null   object
dtypes: object(5)
memory usage: 50.7+ KB


In [28]:
df_player_details[df_player_details["weight"].isnull()]

Unnamed: 0,name,height,weight,birthPlace,shoots
2715,Michael Anderson,,,"in Fridley, Minnesota",: Left
2809,Jacob Bryson,,,"in London, Ontario",: Left
2973,Adam Fox,,,"in Jericho, New York",: Right


In [29]:
df_player_details[df_player_details["shoots"].isnull()]

Unnamed: 0,name,height,weight,birthPlace,shoots
178,Laurent Dauphin,6-1,189lb,"in Repentigny, Quebec",
1067,Laurent Dauphin,6-1,189lb,"in Repentigny, Quebec",
1973,Laurent Dauphin,6-1,189lb,"in Repentigny, Quebec",
2889,Jeremy Davies,5-11,180lb,"in Sainte-Anne-de-Bellevue, Quebec",
3037,Jani Hakanpaa,6-7,220lb,in Finland,
3366,Matthew Phillips,5-7,140lb,"in Calgary, Alberta",
3369,Brian Pinho,6-1,190lb,"in Beverly, Massachusetts",
3493,Logan Stanley,6-7,231lb,"in Kitchener, Ontario",
3553,Alexander True,6-5,200lb,in Denmark,


In [80]:
df_player_details.dropna(inplace = True)

In [81]:
df_player_details.shape

(1282, 5)

In [82]:
df_player_details.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1282 entries, 0 to 1294
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        1282 non-null   object
 1   height      1282 non-null   object
 2   weight      1282 non-null   object
 3   birthPlace  1282 non-null   object
 4   shoots      1282 non-null   object
dtypes: object(5)
memory usage: 60.1+ KB


In [83]:
df_player_details.rename(columns={"name":"Player"}, inplace = True)

In [84]:
df_player_details.head()

Unnamed: 0,Player,height,weight,birthPlace,shoots
0,Justin Abdelkader,6-2,213lb,"in Muskegon, Michigan",: Left
1,Pontus Aberg,6-0,194lb,"in Stockholm, Sweden",: Right
2,Noel Acciari,5-10,209lb,"in Johnston, Rhode Island",: Right
3,Kenny Agostino,6-0,199lb,"in Morristown, New Jersey",: Left
4,Sebastian Aho,5-10,184lb,"in Umea, Sweden",: Left


In [740]:
df_skater_stats_and_info = df_skater_stats_all.merge(df_player_details, how = "left", on = "Player")
df_skater_stats_and_info.head()

Unnamed: 0,Player,Link,Age,Tm,Pos,Games Played,Goals,Assists,Points,Plus/Minus,...,Blocks,Hits,Faceoff Wins,Faceoff Losses,Faceoff Percentage,Avg Shift Length,height,weight,birthPlace,shoots
0,Justin Abdelkader,/players/a/abdelju01.html,32,DET,LW,49,0,3,3,-14,...,26,103,38,31,55.1,0:44,6-2,213lb,"in Muskegon, Michigan",: Left
1,Pontus Aberg,/players/a/abergpo01.html,26,TOR,LW,5,0,1,1,0,...,1,1,0,0,,0:50,6-0,194lb,"in Stockholm, Sweden",: Right
2,Vitaly Abramov,/players/a/abramvi01.html,21,OTT,RW,2,1,0,1,0,...,0,0,0,0,,0:31,5-9,171lb,"in Chelyabinsk, Russian Federation",: Left
3,Noel Acciari,/players/a/acciano01.html,28,FLA,C,66,20,7,27,2,...,103,121,381,414,47.9,0:51,5-10,209lb,"in Johnston, Rhode Island",: Right
4,Andrew Agozzino,/players/a/agozzan01.html,29,ANA,LW,5,1,0,1,3,...,2,6,8,8,50.0,0:35,5-10,187lb,"in Kleinburg, Ontario",: Left


In [335]:
def height_to_inches(height):
    if height == None or len(str(height).split("-")) == 1:
        return None
    else:
        height = str(height).split("-")
        height = float(height[0])*12 + float(height[1])
        return height

In [741]:
df_skater_stats_and_info["height"] = df_skater_stats_and_info.height.apply(height_to_inches)

In [742]:
df_skater_stats_and_info.head()

Unnamed: 0,Player,Link,Age,Tm,Pos,Games Played,Goals,Assists,Points,Plus/Minus,...,Blocks,Hits,Faceoff Wins,Faceoff Losses,Faceoff Percentage,Avg Shift Length,height,weight,birthPlace,shoots
0,Justin Abdelkader,/players/a/abdelju01.html,32,DET,LW,49,0,3,3,-14,...,26,103,38,31,55.1,0:44,74.0,213lb,"in Muskegon, Michigan",: Left
1,Pontus Aberg,/players/a/abergpo01.html,26,TOR,LW,5,0,1,1,0,...,1,1,0,0,,0:50,72.0,194lb,"in Stockholm, Sweden",: Right
2,Vitaly Abramov,/players/a/abramvi01.html,21,OTT,RW,2,1,0,1,0,...,0,0,0,0,,0:31,69.0,171lb,"in Chelyabinsk, Russian Federation",: Left
3,Noel Acciari,/players/a/acciano01.html,28,FLA,C,66,20,7,27,2,...,103,121,381,414,47.9,0:51,70.0,209lb,"in Johnston, Rhode Island",: Right
4,Andrew Agozzino,/players/a/agozzan01.html,29,ANA,LW,5,1,0,1,3,...,2,6,8,8,50.0,0:35,70.0,187lb,"in Kleinburg, Ontario",: Left


In [744]:
def weight_conversion(weight):
    if weight is None or type(weight) == float:
        return None
    else:
        return weight[:3]

In [745]:
df_skater_stats_and_info.weight = df_skater_stats_and_info.weight.apply(weight_conversion)

In [746]:
df_skater_stats_and_info.head()

Unnamed: 0,Player,Link,Age,Tm,Pos,Games Played,Goals,Assists,Points,Plus/Minus,...,Blocks,Hits,Faceoff Wins,Faceoff Losses,Faceoff Percentage,Avg Shift Length,height,weight,birthPlace,shoots
0,Justin Abdelkader,/players/a/abdelju01.html,32,DET,LW,49,0,3,3,-14,...,26,103,38,31,55.1,0:44,74.0,213,"in Muskegon, Michigan",: Left
1,Pontus Aberg,/players/a/abergpo01.html,26,TOR,LW,5,0,1,1,0,...,1,1,0,0,,0:50,72.0,194,"in Stockholm, Sweden",: Right
2,Vitaly Abramov,/players/a/abramvi01.html,21,OTT,RW,2,1,0,1,0,...,0,0,0,0,,0:31,69.0,171,"in Chelyabinsk, Russian Federation",: Left
3,Noel Acciari,/players/a/acciano01.html,28,FLA,C,66,20,7,27,2,...,103,121,381,414,47.9,0:51,70.0,209,"in Johnston, Rhode Island",: Right
4,Andrew Agozzino,/players/a/agozzan01.html,29,ANA,LW,5,1,0,1,3,...,2,6,8,8,50.0,0:35,70.0,187,"in Kleinburg, Ontario",: Left


In [747]:
def avg_play_time(time):
    if time is None:
        return None
    else:
        time = str(time).split(":")
        time = float(time[0]) + float(time[1])/60
        return time

In [748]:
df_skater_stats_and_info["Average Time on Ice"] = df_skater_stats_and_info["Average Time on Ice"].apply(avg_play_time)

In [749]:
df_skater_stats_and_info.head()

Unnamed: 0,Player,Link,Age,Tm,Pos,Games Played,Goals,Assists,Points,Plus/Minus,...,Blocks,Hits,Faceoff Wins,Faceoff Losses,Faceoff Percentage,Avg Shift Length,height,weight,birthPlace,shoots
0,Justin Abdelkader,/players/a/abdelju01.html,32,DET,LW,49,0,3,3,-14,...,26,103,38,31,55.1,0:44,74.0,213,"in Muskegon, Michigan",: Left
1,Pontus Aberg,/players/a/abergpo01.html,26,TOR,LW,5,0,1,1,0,...,1,1,0,0,,0:50,72.0,194,"in Stockholm, Sweden",: Right
2,Vitaly Abramov,/players/a/abramvi01.html,21,OTT,RW,2,1,0,1,0,...,0,0,0,0,,0:31,69.0,171,"in Chelyabinsk, Russian Federation",: Left
3,Noel Acciari,/players/a/acciano01.html,28,FLA,C,66,20,7,27,2,...,103,121,381,414,47.9,0:51,70.0,209,"in Johnston, Rhode Island",: Right
4,Andrew Agozzino,/players/a/agozzan01.html,29,ANA,LW,5,1,0,1,3,...,2,6,8,8,50.0,0:35,70.0,187,"in Kleinburg, Ontario",: Left


In [750]:
def shoots_conversion(shoots):
    if shoots is None or len(str(shoots).split(":")) != 2:
        return None
    else:
        shoots = str(shoots).split(":")
        return shoots[1].strip()

In [751]:
df_skater_stats_and_info["shoots"] = df_skater_stats_and_info["shoots"].apply(shoots_conversion)

In [752]:
df_skater_stats_and_info.head()

Unnamed: 0,Player,Link,Age,Tm,Pos,Games Played,Goals,Assists,Points,Plus/Minus,...,Blocks,Hits,Faceoff Wins,Faceoff Losses,Faceoff Percentage,Avg Shift Length,height,weight,birthPlace,shoots
0,Justin Abdelkader,/players/a/abdelju01.html,32,DET,LW,49,0,3,3,-14,...,26,103,38,31,55.1,0:44,74.0,213,"in Muskegon, Michigan",Left
1,Pontus Aberg,/players/a/abergpo01.html,26,TOR,LW,5,0,1,1,0,...,1,1,0,0,,0:50,72.0,194,"in Stockholm, Sweden",Right
2,Vitaly Abramov,/players/a/abramvi01.html,21,OTT,RW,2,1,0,1,0,...,0,0,0,0,,0:31,69.0,171,"in Chelyabinsk, Russian Federation",Left
3,Noel Acciari,/players/a/acciano01.html,28,FLA,C,66,20,7,27,2,...,103,121,381,414,47.9,0:51,70.0,209,"in Johnston, Rhode Island",Right
4,Andrew Agozzino,/players/a/agozzan01.html,29,ANA,LW,5,1,0,1,3,...,2,6,8,8,50.0,0:35,70.0,187,"in Kleinburg, Ontario",Left


In [760]:
def avg_shift_time(time):
    if time is None:
        return None
    else:
        time = str(time).split(":")
        time = float(time[0])*60 + float(time[1])
        return time

In [761]:
df_skater_stats_and_info["Avg Shift Length"] = df_skater_stats_and_info["Avg Shift Length"].apply(avg_shift_time)

In [762]:
df_skater_stats_and_info.head()

Unnamed: 0,Player,Link,Age,Tm,Pos,Games Played,Goals,Assists,Points,Plus/Minus,...,Blocks,Hits,Faceoff Wins,Faceoff Losses,Faceoff Percentage,Avg Shift Length,height,weight,birthPlace,shoots
0,Justin Abdelkader,/players/a/abdelju01.html,32,DET,LW,49,0,3,3,-14,...,26,103,38,31,55.1,44.0,74.0,213,"in Muskegon, Michigan",Left
1,Pontus Aberg,/players/a/abergpo01.html,26,TOR,LW,5,0,1,1,0,...,1,1,0,0,,50.0,72.0,194,"in Stockholm, Sweden",Right
2,Vitaly Abramov,/players/a/abramvi01.html,21,OTT,RW,2,1,0,1,0,...,0,0,0,0,,31.0,69.0,171,"in Chelyabinsk, Russian Federation",Left
3,Noel Acciari,/players/a/acciano01.html,28,FLA,C,66,20,7,27,2,...,103,121,381,414,47.9,51.0,70.0,209,"in Johnston, Rhode Island",Right
4,Andrew Agozzino,/players/a/agozzan01.html,29,ANA,LW,5,1,0,1,3,...,2,6,8,8,50.0,35.0,70.0,187,"in Kleinburg, Ontario",Left


In [754]:
# Drop players who had zero points in any of the last 4 seasons
df_skater_stats_and_info.Points = df_skater_stats_and_info.Points.astype(int)
df_skater_stats_and_info.drop(df_skater_stats_and_info[df_skater_stats_and_info.Points == 0].index, inplace = True)
df_skater_stats_and_info.shape

(1577, 33)

In [764]:
# Drop any row that had None values (only 12 rows dropped)
df_skater_stats_and_info.dropna(inplace = True)
df_skater_stats_and_info.shape

(1565, 33)

In [759]:
df_skater_stats_and_info.columns

Index(['Player', 'Link', 'Age', 'Tm', 'Pos', 'Games Played', 'Goals',
       'Assists', 'Points', 'Plus/Minus', 'Penalties in Minutes',
       'Point Shares', 'Even Strength Goals', 'Power Play Goals',
       'Short-Handed Goals', 'Game-Winning Goals', 'Even Strength Assists',
       'Power Play Assists', 'Short-Handed Assists', 'Shots',
       'Shooting Percentage', 'Time on Ice', 'Average Time on Ice', 'Blocks',
       'Hits', 'Faceoff Wins', 'Faceoff Losses', 'Faceoff Percentage',
       'Avg Shift Length', 'height', 'weight', 'birthPlace', 'shoots'],
      dtype='object')

In [636]:
df_skater_stats_and_info.to_csv("nhl_players_df.csv")

In [767]:
subset = ["Points","Age","Games Played", "Plus/Minus","Penalties in Minutes","Shots","Time on Ice","Average Time on Ice", "height","weight", "Blocks", "Hits"]

df_subset = df_skater_stats_and_info[subset]

df_subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1565 entries, 0 to 1798
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Points                1565 non-null   float64
 1   Age                   1565 non-null   float64
 2   Games Played          1565 non-null   float64
 3   Plus/Minus            1565 non-null   float64
 4   Penalties in Minutes  1565 non-null   float64
 5   Shots                 1565 non-null   float64
 6   Time on Ice           1565 non-null   float64
 7   Average Time on Ice   1565 non-null   float64
 8   height                1565 non-null   float64
 9   weight                1565 non-null   float64
 10  Blocks                1565 non-null   float64
 11  Hits                  1565 non-null   object 
dtypes: float64(11), object(1)
memory usage: 158.9+ KB


In [768]:
# convert columns used in subset to dtype float in source dataframe so I don't have to reconvert to float for every...
# ...different subset
for obj in df_subset:
    df_skater_stats_and_info[obj]=df_skater_stats_and_info[obj].astype(float)
df_skater_stats_and_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1565 entries, 0 to 1798
Data columns (total 33 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Player                 1565 non-null   object 
 1   Link                   1565 non-null   object 
 2   Age                    1565 non-null   float64
 3   Tm                     1565 non-null   object 
 4   Pos                    1565 non-null   object 
 5   Games Played           1565 non-null   float64
 6   Goals                  1565 non-null   object 
 7   Assists                1565 non-null   object 
 8   Points                 1565 non-null   float64
 9   Plus/Minus             1565 non-null   float64
 10  Penalties in Minutes   1565 non-null   float64
 11  Point Shares           1565 non-null   object 
 12  Even Strength Goals    1565 non-null   object 
 13  Power Play Goals       1565 non-null   object 
 14  Short-Handed Goals     1565 non-null   object 
 15  Game

In [769]:
df_skater_stats_and_info.to_csv("nhl_players_df.csv")