In [36]:
# Importing all the required libraries
import time
from io import StringIO

import requests
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd

In [37]:
# Every year we scrape data for: 1991 through 2022 inclusive
# (range() stops before its end value, hence 2023)
years = [*range(1991, 2023)]

# MVP Voting Data

In [None]:
# URL template of the awards page we scrape the mvp data from.
# The "{}" placeholder is filled in with the wanted year via str.format().
mvp_stats_url = (
    "https://www.basketball-reference.com"
    "/awards/awards_{}.html"
)

In [None]:
# Download the awards page of every year and save it as mvp/<year>.html, so that
# the later cells can work on the local copies instead of requesting the site again.
for year in years:
    url = mvp_stats_url.format(year)  # Replace the {} in the url with the year
    data = requests.get(url, timeout=30)  # Download the html page; give up instead of hanging forever
    # Stop on an HTTP error status instead of silently saving an error page in
    # place of the data
    data.raise_for_status()

    # NOTE(review): the file is written with the platform's default text encoding;
    # the cells that read it back rely on the same default.
    with open(f"mvp/{year}.html", "w") as f:  # One file per year
        f.write(data.text)

    # Pause between requests so the site is not hit in a tight loop.
    # NOTE(review): 4 seconds is meant to stay below the site's request-rate
    # limit - confirm against the site's current policy.
    time.sleep(4)

In [None]:
# Load the saved 1991 awards page back into memory as a single string
with open("mvp/1991.html") as html_file:
    page = html_file.read()

In [None]:
# Build a BeautifulSoup tree from the page text with the "html.parser" parser
soup = BeautifulSoup(markup=page, features="html.parser")

In [None]:
# Remove the first <tr class="over_header"> row from the tree; one header row
# is enough for the table
soup.find(name="tr", attrs={"class": "over_header"}).decompose()

In [None]:
# Pick out the element whose id is "mvp" - the table we need
mvp_table = soup.find(attrs={"id": "mvp"})

In [None]:
# Loading the table into pandas. read_html returns a list of DataFrames, so we
# take the first one. The html text is wrapped in StringIO because passing a
# literal html string to read_html is deprecated in recent pandas versions.
mvp_1991 = pd.read_html(StringIO(str(mvp_table)))[0]

In [None]:
# Build one mvp voting table per year from the saved html pages
dfs = []
for year in years:
    # Reading the html file
    with open(f"mvp/{year}.html") as f:
        page = f.read()
    # Parsing the html page
    soup = BeautifulSoup(page, "html.parser")
    # Deleting the over header as one header will be enough
    soup.find("tr", class_="over_header").decompose()
    # Finding the table we need
    mvp_table = soup.find(id="mvp")
    # Loading the table into pandas. The html text is wrapped in StringIO because
    # passing a literal html string to read_html is deprecated in recent pandas versions.
    mvp = pd.read_html(StringIO(str(mvp_table)))[0]
    mvp["Year"] = year  # To know which year the data is from

    dfs.append(mvp)  # Adding each year's mvp table into the list

In [None]:
# Stack the per-year mvp tables into one DataFrame (row labels of the
# individual tables are kept as they are)
mvps = pd.concat(objs=dfs)

In [None]:
# Show the combined mvp table as the output of this cell
mvps

In [None]:
# Store the dataframe in a csv file so that it is easier to access later
mvps.to_csv(path_or_buf="mvps.csv")

# All Players Statistics

To predict who the MVP will be, the MVP voting data alone is not sufficient. We also need the statistics of every player in each season, so that at the end of a season we can compare all the players' statistics and judge whether they are MVP-worthy. So we now need the data for all players from 1991 to 2022, which we will then map to the MVP data we already have.

Using Playwright, the players' statistics have been downloaded to the `player` folder. The script for getting that data is in the "players_data.py" file.

In [24]:
# Build one per-game player statistics table per year from the saved html pages
dfs = []
for year in years:
    # Reading the saved page of this year
    with open(f"player/{year}.html") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    # NOTE(review): find() returns only the first <tr class="thead"> row, so any
    # further rows with that class stay in the table - confirm that the later
    # cleaning steps deal with them.
    soup.find("tr", class_="thead").decompose()
    player_table = soup.find(id="per_game_stats")
    # The html text is wrapped in StringIO because passing a literal html string
    # to read_html is deprecated in recent pandas versions.
    player = pd.read_html(StringIO(str(player_table)))[0]
    player["Year"] = year  # To know which year the data is from
    dfs.append(player)

  o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)


In [32]:
# Stack the per-year player tables into one DataFrame (row labels of the
# individual tables are kept as they are)
players = pd.concat(objs=dfs)

In [34]:
# Show the combined players table as the output of this cell
players

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,...,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991
1,2,Mahmoud Abdul-Rauf,PG,21,DEN,67,19,22.5,6.2,15.1,...,0.5,1.3,1.8,3.1,0.8,0.1,1.6,2.2,14.1,1991
2,3,Mark Acres,C,28,ORL,68,0,19.3,1.6,3.1,...,2.1,3.2,5.3,0.4,0.4,0.4,0.6,3.2,4.2,1991
3,4,Michael Adams,PG,28,DEN,66,66,35.5,8.5,21.5,...,0.9,3.0,3.9,10.5,2.2,0.1,3.6,2.5,26.5,1991
4,5,Mark Aguirre,SF,31,DET,78,13,25.7,5.4,11.7,...,1.7,3.1,4.8,1.8,0.6,0.3,1.6,2.7,14.2,1991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
836,601,Thaddeus Young,PF,33,TOR,26,0,18.3,2.6,5.5,...,1.5,2.9,4.4,1.7,1.2,0.4,0.8,1.7,6.3,2022
837,602,Trae Young,PG,23,ATL,76,76,34.9,9.4,20.3,...,0.7,3.1,3.7,9.7,0.9,0.1,4.0,1.7,28.4,2022
838,603,Omer Yurtseven,C,23,MIA,56,12,12.6,2.3,4.4,...,1.5,3.7,5.3,0.9,0.3,0.4,0.7,1.5,5.3,2022
839,604,Cody Zeller,C,29,POR,27,0,13.1,1.9,3.3,...,1.9,2.8,4.6,0.8,0.3,0.2,0.7,2.1,5.2,2022


In [35]:
# Store the combined players table in a csv file
players.to_csv(path_or_buf="players.csv")

# Team Statistics

Team record is a very important factor in MVP voting. So when we want to predict the MVP, we should make sure the team record is included in our data.

In [38]:
# URL template of a season's standings page; "{}" is replaced with the year
team_stats_url = (
    "https://www.basketball-reference.com"
    "/leagues/NBA_{}_standings.html"
)

In [39]:
# Getting the division statistics because the other standing tables need selenium and these tables can be extracted with requests.
# Every year's standings page is saved as team/<year>.html.
for year in years:
    url = team_stats_url.format(year)  # Replace the {} in the url with the year

    data = requests.get(url, timeout=30)  # Give up instead of hanging forever
    # Stop on an HTTP error status instead of silently saving an error page in
    # place of the data
    data.raise_for_status()

    # NOTE(review): the file is written with the platform's default text encoding;
    # the cell that reads it back relies on the same default.
    with open(f"team/{year}.html", "w") as f:
        f.write(data.text)

    # Pause between requests so the site is not hit in a tight loop.
    # NOTE(review): 4 seconds is meant to stay below the site's request-rate
    # limit - confirm against the site's current policy.
    time.sleep(4)

In [43]:
# Build the Eastern and Western division standings tables of every year from
# the saved html pages. For each year the Eastern table is appended first,
# then the Western one.
dfs = []

for year in years:
    with open(f"team/{year}.html") as f:
        page = f.read()

    # (id of the table, name of the column that holds the team names)
    for table_id, conference_column in (
        ("divs_standings_E", "Eastern Conference"),
        ("divs_standings_W", "Western Conference"),
    ):
        # The page is parsed again for each conference so that the row removal
        # below always starts from the unmodified page.
        soup = BeautifulSoup(page, "html.parser")
        # NOTE(review): find() returns only the first <tr class="thead"> row of
        # the whole page on each pass, so other rows with that class may remain
        # in the tables - confirm that the later cleaning steps deal with them.
        soup.find("tr", class_="thead").decompose()
        team_table = soup.find(id=table_id)
        # The html text is wrapped in StringIO because passing a literal html
        # string to read_html is deprecated in recent pandas versions.
        team = pd.read_html(StringIO(str(team_table)))[0]
        team["Year"] = year
        # Give the team-name column the same name in both conference tables
        team["Team"] = team[conference_column]
        del team[conference_column]
        dfs.append(team)

In [44]:
# Stack the per-year, per-conference standings tables into one DataFrame
# (row labels of the individual tables are kept as they are)
teams = pd.concat(objs=dfs)

In [45]:
# Show the combined team standings table as the output of this cell
teams

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,56,26,.683,—,111.5,105.7,5.22,1991,Boston Celtics*
1,44,38,.537,12.0,105.4,105.6,-0.39,1991,Philadelphia 76ers*
2,39,43,.476,17.0,103.1,103.3,-0.43,1991,New York Knicks*
3,30,52,.366,26.0,101.4,106.4,-4.84,1991,Washington Bullets
4,26,56,.317,30.0,102.9,107.5,-4.53,1991,New Jersey Nets
...,...,...,...,...,...,...,...,...,...
13,56,26,.683,—,115.6,109.9,5.37,2022,Memphis Grizzlies* (2)
14,52,30,.634,4.0,108.0,104.7,3.12,2022,Dallas Mavericks* (4)
15,36,46,.439,20.0,109.3,110.3,-0.84,2022,New Orleans Pelicans* (9)
16,34,48,.415,22.0,113.2,113.0,0.02,2022,San Antonio Spurs (10)


In [46]:
# Store the combined team standings table in a csv file
teams.to_csv(path_or_buf="teams.csv")