In [57]:
#Import all necessary libraries
import requests
import os
import shutil
import pandas as pd
import numpy as np
import urllib.request
from pprint import pprint
from html_table_parser.parser import HTMLTableParser
from bs4 import BeautifulSoup

In [58]:
#Defining our year rage
years = list(range(1991,2023))

In [74]:
#Download all html pages from the MVP voting web pages
url_start = "https://www.basketball-reference.com/awards/awards_{}.html"

for year in years:
    url = url_start.format(year)
    
    data = requests.get(url)
    
    with open("MVP_Vote_List/mvp_{}.html".format(year), "w+", encoding="utf-8") as f:
        f.write(data.text)


In [63]:
#Extract the MVP voting tables from all our downloaded HTML pages

dfs = []
for year in years:
    with open("MVP_Vote_List/mvp_{}.html".format(year), encoding="utf-8") as f:
        page = f.read()
    
    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_="over_header").decompose()
    mvp_table = soup.find_all(id="mvp")[0]
    mvp_df = pd.read_html(str(mvp_table))[0]
    mvp_df["Year"] = year
    mvp_df["Won"] = np.where(mvp_df['First'] == max(mvp_df['First']), "Yes", "No")
    dfs.append(mvp_df)
    
mvps = pd.concat(dfs)

AttributeError: 'NoneType' object has no attribute 'decompose'

In [62]:
#Cleaning the Teams column with updated teams abbreviations
mvps = mvps.replace({'Tm' : {'PHO' : 'PHX', 'NJN' : 'BKN', 'CHH' : 'CHA', 'NOH':'NOP', 'BRK': 'BKN', 'WSB':'WAS'}})

NameError: name 'mvps' is not defined

In [16]:
#Load the data into a CSV file
mvps.to_csv("mvp_data.csv", sep='\t', encoding='utf-8')

In [None]:
#Download all html pages from the NBA standing web pages
url_origin = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

for year in years:
    team_url = url_origin.format(year)
    
    team_data = requests.get(team_url)
    
    with open("TeamRecord/{}.html".format(year), "w+", encoding="utf-8") as f:
        f.write(team_data.text)

In [41]:
#Extract the MVP voting tables from all our downloaded HTML pages
dfs = []
for year in years:
    with open("TeamRecord/{}.html".format(year), encoding="utf-8") as f:
            page = f.read()

    soup = BeautifulSoup(page, 'html.parser')

    while len(soup.find_all('tr', class_="thead")) != 0:
            soup.find('tr', class_="thead").decompose()
    
    west_team_table = soup.find(id="divs_standings_W")
    west_team_df = pd.read_html(str(west_team_table))[0]
    west_team_df["Year"] = year
    west_team_df.rename(columns = {"Western Conference":"TeamName"}, inplace = True)
    dfs.append(west_team_df)
    
    east_team_table = soup.find(id="divs_standings_E")
    east_team_df = pd.read_html(str(east_team_table))[0]
    east_team_df["Year"] = year
    east_team_df.rename(columns = {"Eastern Conference":"TeamName"}, inplace = True)
    dfs.append(east_team_df)
    
nba_standings = pd.concat(dfs)

In [51]:
#Clean the TeamName column
nba_standings["TeamName"] = nba_standings["TeamName"].str.replace('*' , '')

  nba_standings["TeamName"] = nba_standings["TeamName"].str.replace('*' , '')


In [53]:
#Drop useless columns
nba_standings.drop(nba_standings.columns[[4, 5, 6,7]], axis=1, inplace=True)

In [55]:
#Load the data into a CSV file
nba_standings.to_csv("TeamRecord/standings_data.csv", sep='\t', encoding='utf-8')

In [21]:
#Wiki page of all nba abbreviation + their full name
url_wiki = "https://en.wikipedia.org/wiki/Wikipedia:WikiProject_National_Basketball_Association/National_Basketball_Association_team_abbreviations"

In [22]:
#urllib function to extract table
def url_get_contents(url):

    req = urllib.request.Request(url=url)
    f = urllib.request.urlopen(req)

    return f.read()

In [23]:
page = url_get_contents(url_wiki).decode('utf-8')

In [24]:
parser = HTMLTableParser()

In [25]:
parser.feed(page)

In [27]:
pprint(parser.tables[0])

[['Abbreviation/ Acronym', 'Franchise'],
 ['ATL', 'Atlanta Hawks'],
 ['BKN', 'Brooklyn Nets'],
 ['BOS', 'Boston Celtics'],
 ['CHA', 'Charlotte Hornets'],
 ['CHI', 'Chicago Bulls'],
 ['CLE', 'Cleveland Cavaliers'],
 ['DAL', 'Dallas Mavericks'],
 ['DEN', 'Denver Nuggets'],
 ['DET', 'Detroit Pistons'],
 ['GSW', 'Golden State Warriors'],
 ['HOU', 'Houston Rockets'],
 ['IND', 'Indiana Pacers'],
 ['LAC', 'Los Angeles Clippers'],
 ['LAL', 'Los Angeles Lakers'],
 ['MEM', 'Memphis Grizzlies'],
 ['MIA', 'Miami Heat'],
 ['MIL', 'Milwaukee Bucks'],
 ['MIN', 'Minnesota Timberwolves'],
 ['NOP', 'New Orleans Pelicans'],
 ['NYK', 'New York Knicks'],
 ['OKC', 'Oklahoma City Thunder'],
 ['ORL', 'Orlando Magic'],
 ['PHI', 'Philadelphia 76ers'],
 ['PHX', 'Phoenix Suns'],
 ['POR', 'Portland Trail Blazers'],
 ['SAC', 'Sacramento Kings'],
 ['SAS', 'San Antonio Spurs'],
 ['TOR', 'Toronto Raptors'],
 ['UTA', 'Utah Jazz'],
 ['WAS', 'Washington Wizards']]


In [28]:
nba_teams = pd.DataFrame(parser.tables[0])

In [36]:
#Change column name to first row and delete the first row
nba_teams.columns = nba_teams.iloc[0]
nba_teams = nba_teams.drop(index=0)

In [39]:
#Load the data to CSV file
nba_teams.to_csv("TeamRecord/teams_data.csv", sep='\t', encoding='utf-8')