## Grabbing the Data
-----

### Setup:
---

In [1]:
# Import dependencies
from pathlib import Path
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import datetime
from pytz import timezone
import re

### Grab NFL Game Data from Past Years:
---

In [2]:
# helper function to pull text from the html element
# if error occurs - return 'N/A'
def getHTMLValue(input, parent=False):
    """Return the text of an HTML element, or 'N/A' when it is unavailable.

    Args:
        input: Object exposing ``.text`` (and ``.parent`` when ``parent`` is
            true). May be ``None`` when an earlier lookup found nothing.
        parent: When true, return the text of ``input.parent`` instead of the
            text of ``input`` itself.

    Returns:
        The requested text, or the string 'N/A' when the element (or its
        parent) is missing.
    """
    try:
        element = input.parent if parent else input
        return element.text
    except AttributeError:
        # A missing element shows up as an attribute lookup on None (or on an
        # object without .text/.parent); anything else should not be hidden.
        return 'N/A'

In [3]:
# helper function to replace text in the results string.
# if error occurs - return the original value
def parseLabels(rawData, throwAway):
    """Remove every occurrence of ``throwAway`` from ``rawData``.

    Args:
        rawData: The scraped value, normally a string.
        throwAway: The label text to strip out (e.g. ``'Date: '``).

    Returns:
        ``rawData`` with the label removed, or ``rawData`` unchanged when the
        replacement cannot be performed (non-string data or non-string label).
    """
    try:
        return rawData.replace(throwAway, '')
    except (AttributeError, TypeError):
        # AttributeError: rawData has no .replace (e.g. None or a number).
        # TypeError: throwAway is not a string.
        return rawData

In [4]:
# get the numerical month from string
def getMonthNumber(month_string):
    """Translate a month name into its calendar number.

    Only the five months listed below are recognised; any other value
    (including differently-cased names) yields ``None``.
    """
    known_months = (
        ("September", 9),
        ("October", 10),
        ("November", 11),
        ("December", 12),
        ("January", 1),
    )
    return next(
        (number for name, number in known_months if month_string == name),
        None,
    )

In [5]:
# need to get the pytz timezone
def getTimeZone(raw_timezone):
    """Map a US time-zone abbreviation to a tz database zone name for pytz.

    Args:
        raw_timezone: Abbreviation taken from the scraped game time, such as
            ``'CDT'`` or ``'EST'``.

    Returns:
        The matching zone name. Unrecognised abbreviations (including the
        empty string) fall back to ``'America/Detroit'``.
    """
    zone_by_abbreviation = {
        'CDT': 'America/Chicago',
        'CST': 'America/Chicago',
        'EDT': 'America/Detroit',
        'EST': 'America/Detroit',
        'MDT': 'America/Denver',
        # "MST" always means UTC-7. America/Denver is on daylight time
        # (UTC-6) during the early season, so a game listed in MST at that
        # time of year would be shifted by an hour. America/Phoenix stays on
        # UTC-7 all year and matches America/Denver whenever Denver is on MST.
        'MST': 'America/Phoenix',
        'PDT': 'America/Los_Angeles',
        'PST': 'America/Los_Angeles',
    }

    # default - return America/Detroit
    return zone_by_abbreviation.get(raw_timezone, 'America/Detroit')
    

In [6]:
# helper function to get standard kickoff time from date and time
def getKickoffTime(year, date, time):
    """Convert a scraped game date and time into an 'H:MM' kickoff string.

    Args:
        year: The season year. Games played in January are treated as
            belonging to ``year + 1``.
        date: Date text whose part before the first comma is
            '<Month> <day>', e.g. 'September 11'.
        time: Time text of the form '<h>:<mm> <a.m.|p.m.> [<zone>]'. Slashes
            and non-breaking spaces are treated as separators, so only the
            first time/zone in a 'a/b' pair is used.

    Returns:
        The kickoff time converted to the pytz 'EST' zone, formatted as
        '<hour>:<minute>' with a zero-padded minute.
    """
    date_without_year = date.split(',')[0]
    date_parts = date_without_year.split(' ')
    month = getMonthNumber(date_parts[0].strip())
    day = int(date_parts[1])
    # January games are played in the calendar year after the season starts
    game_year = year + 1 if month == 1 else year

    # Normalise separators, then split on any run of whitespace so repeated
    # spaces cannot push the a.m./p.m. marker out of position.
    normalized_time = time.replace('/', ' ').replace(u'\xa0', u' ')
    time_parts = normalized_time.split()
    clock_parts = time_parts[0].split(":")
    hour = int(clock_parts[0])
    minute = int(clock_parts[1])

    # Convert the 12-hour clock to 24-hour, including the 12 a.m. -> 0 case
    meridiem = time_parts[1]
    if meridiem == 'p.m.' and hour < 12:
        hour += 12
    elif meridiem == 'a.m.' and hour == 12:
        hour = 0

    # No zone in the text -> getTimeZone's default zone is used
    if len(time_parts) > 2:
        time_zone = getTimeZone(time_parts[2])
    else:
        time_zone = getTimeZone('')

    dt = datetime.datetime(int(game_year), int(month), int(day), hour, minute)
    # NOTE(review): pytz's 'EST' appears to be a fixed UTC-5 zone with no
    # daylight saving - confirm that is the intended target for every game.
    fromZone, toZone = timezone(time_zone), timezone('EST')

    standardized_time = fromZone.localize(dt).astimezone(toZone)

    return f'{standardized_time.hour}:{standardized_time.minute:02d}'

In [7]:
# Set Range of Years
# Each entry is used as the year in the season-page URL and in the exported
# CSV file name.
years_list = [2022]
# Bare expression: shows the list as the notebook cell's output.
years_list

[2022]

In [8]:
# Set List of Teams
# Full team names (32 entries). Spaces are replaced with underscores when the
# name is used to build each team's season-page URL and CSV file name.
teams_list = ["Arizona Cardinals",
    "Atlanta Falcons",
    "Baltimore Ravens",
    "Buffalo Bills",
    "Carolina Panthers",
    "Chicago Bears",
    "Cincinnati Bengals",
    "Cleveland Browns",
    "Dallas Cowboys",
    "Denver Broncos",
    "Detroit Lions",
    "Green Bay Packers",
    "Houston Texans",
    "Indianapolis Colts",
    "Jacksonville Jaguars",
    "Kansas City Chiefs",
    "Las Vegas Raiders",
    "Los Angeles Chargers",
    "Los Angeles Rams",
    "Miami Dolphins",
    "Minnesota Vikings",
    "New England Patriots",
    "New Orleans Saints",
    "New York Giants",
    "New York Jets",
    "Philadelphia Eagles",
    "Pittsburgh Steelers",
    "San Francisco 49ers",
    "Seattle Seahawks",
    "Tampa Bay Buccaneers",
    "Tennessee Titans",
    "Washington Commanders",
    ]

In [9]:
# Try With a Single Year (2022) First
# For every season/team pair: read the results table from the team's
# Wikipedia season page, enrich each game row with weather, kickoff time and
# location scraped from the same page, then export one CSV per team.
for year in years_list:
    for team in teams_list:
        team_format = team.replace(' ', '_')
        url = f'https://en.wikipedia.org/wiki/{year}_{team_format}_season'

        wiki_tables = pd.read_html(url, match='Opponent')

        # Start from None for every team so a page without a usable table
        # cannot silently reuse (and re-export) the previous team's data.
        results_df = None
        for table in wiki_tables:
            # if the table has more than 5 rows, that's our results table
            # (when several qualify, the last one wins)
            if len(table) > 5:
                results_df = table

        if results_df is None:
            print(f'{team} {year}: no results table found - skipping')
            continue

        results_df['team'] = team
        results_df['weather_condition'] = ''
        results_df['temp_f'] = ''
        results_df['time'] = ''
        results_df['city'] = ''
        results_df['state'] = ''
        results_df['year'] = year

        # screen scrape the game data to get the weather, time, and location
        # (timeout so one stalled request cannot hang the whole run)
        team_response = requests.get(url, timeout=30)
        team_response_html = team_response.content.decode('utf-8')
        team_response_parsed = BeautifulSoup(team_response_html)

        # Every game summary carries a bold "Date" label; use it as the anchor
        date_tags = team_response_parsed.find_all('b', string="Date")
        for date_tag in date_tags:
            game_date = parseLabels(getHTMLValue(date_tag.parent), 'Date: ')

            # Only keep summaries whose date matches a row of the results table
            game_rows = results_df['Date'] == game_date
            if not game_rows.any():
                continue

            print(team, game_date)
            game_box = date_tag.parent.parent
            game_weather = parseLabels(getHTMLValue(game_box.find('b', string="Game weather"), parent=True), 'Game weather: ').replace(u'\xa0', u' ')

            # Temperature is the first "<number> °F" in the weather text;
            # accept three digits and a leading ASCII or Unicode minus sign.
            temp_match = re.search(r'([-\u2212]?\d{1,3})\s*°F', game_weather)
            if temp_match:
                temp_f = int(temp_match.group(1).replace('\u2212', '-'))
                # Conditions are whatever precedes the temperature
                conditions = game_weather[:temp_match.start()]
            else:
                # No temperature listed - fall back to 72
                # (presumably indoor games; verify against the page format)
                temp_f = 72
                conditions = game_weather
            conditions = conditions.replace(',', '').strip()

            game_time = parseLabels(getHTMLValue(game_box.find('b', string="Game time"), parent=True), 'Game time: ')
            game_location = parseLabels(getHTMLValue(game_box.parent.find('p')), 'at ').replace('\n', '')

            # Location text is split on commas: [venue, city, state, ...];
            # leave city/state blank when the text has fewer parts.
            game_location_parts = game_location.split(',')
            city = game_location_parts[1].strip() if len(game_location_parts) > 1 else ''
            state = game_location_parts[2].strip() if len(game_location_parts) > 2 else ''

            results_df.loc[game_rows, 'weather_condition'] = conditions
            results_df.loc[game_rows, 'temp_f'] = temp_f
            results_df.loc[game_rows, 'time'] = getKickoffTime(year, game_date, game_time)
            results_df.loc[game_rows, 'city'] = city
            results_df.loc[game_rows, 'state'] = state

        # export to csv here - by year and team
        results_df.to_csv(f'../03-Wrangling_Data/Grabbing_Data_Exports/{year}_{team_format}.csv')


Arizona Cardinals September 11
Arizona Cardinals September 18
Arizona Cardinals September 25
Arizona Cardinals October 2
Arizona Cardinals October 9
Arizona Cardinals October 16
Arizona Cardinals October 20
Arizona Cardinals October 30
Arizona Cardinals November 6
Arizona Cardinals November 13
Arizona Cardinals November 21
Arizona Cardinals November 27
Arizona Cardinals December 12
Arizona Cardinals December 18
Arizona Cardinals December 25
Arizona Cardinals January 1
Arizona Cardinals January 8
Atlanta Falcons September 11
Atlanta Falcons September 18
Atlanta Falcons September 25
Atlanta Falcons October 2
Atlanta Falcons October 9
Atlanta Falcons October 16
Atlanta Falcons October 23
Atlanta Falcons October 30
Atlanta Falcons November 6
Atlanta Falcons November 10
Atlanta Falcons November 20
Atlanta Falcons November 27
Atlanta Falcons December 4
Atlanta Falcons December 18
Atlanta Falcons December 24
Atlanta Falcons January 1
Atlanta Falcons January 8
Baltimore Ravens September 11
Bal

### Export Past Game Data as CSV Files:
---

In [None]:
# Export CSVs for each team to ../03-Wrangling_Data/Grabbing_Data_Exports
