.Import necessary packages

In [6]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np

In [7]:
# Scraping Data according to year month for the nba
def scrape_nba_stats(year, month):
    # this is the HTML from the given URL
    url = 'https://www.basketball-reference.com/leagues/NBA_'+ str(year) + '_games-'+ str(month) + '.html'


    # this is the HTML from the given URL
    html = urlopen(url)
    soup = BeautifulSoup(html)


    # use findALL() to get the column headers
    soup.findAll('tr', limit=2)
    # use getText()to extract the text we need into a list
    headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
    # exclude the first column as we will not need the ranking order from Basketball Reference for the analysis
    header_Date = headers[0]
    headers_notDate = headers[1:]


    # avoid the first header row
    rows = soup.findAll('tr')[1:]
    schedule = [[td.getText() for td in rows[i].findAll('td')]
                for i in range(len(rows))]
    schedule_date = [[th.getText() for th in rows[i].findAll('th')]
                for i in range(len(rows))]


    stats_nD = pd.DataFrame(schedule, columns = headers_notDate)
    len(stats_nD)

    stats_D = pd.DataFrame(schedule_date, columns = ['Date'])
    stats = stats_D.join(stats_nD)

    return stats

In [8]:
# Cleaning the scraped data
def clean_dataset(nba_data):
    # Clean Dataset
    nba_data.head(5)

    # Convert to Datetime
    nba_data['Date'] = pd.to_datetime(pd.to_datetime(nba_data['Date']))

    # Convert to military time
    nba_data['Start (ET)'] = [re.sub(r'p', '', str(x)) for x in nba_data['Start (ET)']]

    helper = nba_data['Start (ET)'].str.split(':', expand = True)

    hour = helper[0]
    minute = helper[1]

    # Concat back to format: 7:00 PM - to concat with Date of Game
    time_UTC = pd.DataFrame(hour + ':' + minute + ' PM', columns = ['Time']).fillna(value = '6:11 AM')
    time_UTC

    # Convert to time
    time_UTC['Start_time_24_ET'] = pd.to_datetime(time_UTC['Time']).dt.time

    # Join back to main DF
    nba_data = nba_data.join(time_UTC['Start_time_24_ET'])

    # Create Column with Date and Time (still Easter time: ET)
    nba_data['Date_Time_ET'] = pd.to_datetime(nba_data['Date'].astype(str)+' '+nba_data['Start_time_24_ET'].astype(str))

    # Add five hours in order to convert it to UTC time (tweepy also works with UTC)
    nba_data['Date_Time_UTC'] = nba_data['Date_Time_ET'] + pd.DateOffset(hours=5)

    # Drop unnecessary rows
    nba_data.drop(columns=['Start (ET)', 'Date_Time_ET', 'Start_time_24_ET', 'Date', 'Box_Score'], inplace = True)

    # replace OT with 0 and 1
    nba_data['OT'].loc[nba_data['OT'] == 'OT'] = 1
    nba_data['OT'].loc[nba_data['OT'] != 1] = 0
    #nba_data['OT'] = nba_data['OT'].replace(np.nan, 0)



    return nba_data


In [9]:
# Calling scraper function
data = scrape_nba_stats(2021, 'april')
data_m = scrape_nba_stats(2021, 'may')
nba_data = data.append(data_m, ignore_index= True)
nba_data.columns =  ['Date', 'Start (ET)', 'Visitor', 'PTS_V', 'Home','PTS_H', 'Box_Score', 'OT', 'Attend.', 'Notes']

# Calling cleaning function and save it as nba_data
nba_data_games = clean_dataset(nba_data)

# Safe nba_data as csv
nba_data_games.to_csv('C:/Users/Marc/Dropbox/06_ESCP/01_Uni/06_MA Thesis/04_Code/02_Output/01_NBA Scheudle & Stats/nba_data.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [10]:
nba_data_games

Unnamed: 0,Visitor,PTS_V,Home,PTS_H,OT,Attend.,Notes,Date_Time_UTC
0,Philadelphia 76ers,114,Cleveland Cavaliers,94,0,4100,,2021-04-02 00:00:00
1,Washington Wizards,91,Detroit Pistons,120,0,750,,2021-04-02 00:00:00
2,Charlotte Hornets,89,Brooklyn Nets,111,0,1773,,2021-04-02 00:30:00
3,Golden State Warriors,109,Miami Heat,116,0,0,,2021-04-02 01:00:00
4,Orlando Magic,115,New Orleans Pelicans,110,1,3700,,2021-04-02 01:00:00
...,...,...,...,...,...,...,...,...
408,Phoenix Suns,100,Los Angeles Lakers,92,0,8025,,2021-05-30 20:30:00
409,Brooklyn Nets,141,Boston Celtics,126,0,17226,,2021-05-31 00:00:00
410,Los Angeles Clippers,106,Dallas Mavericks,81,0,17761,,2021-05-31 02:30:00
411,Philadelphia 76ers,114,Washington Wizards,122,0,10665,,2021-06-01 00:00:00


# Problems Overcome:
1. Turn hours into military time, and adjust to UTC
    - Was especially difficult to add 5 hours to get from eastern time to UTC. Done by concatinating with Date, and then I was able to use dataoffset form pandas
1. Generally changing something for the complete column (take away p)
1. parse time correctly
- Replace empty times NaN

