In [1]:
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup, Comment
import requests

In [5]:
# Step 1: describe every (season, week) combination we want to scrape.
# The result is seasonDict, which maps each season year to its list of weeks.

# Regular-season weeks are the integers 1-17; the four postseason rounds are
# identified by name rather than by number.
# Seasons covered: 2009 through 2018 inclusive.
yearNum = list(range(2009, 2019))
extraWeeks = ['wildcard-weekend','divisional-playoffs','conf-championships','superbowl']
weekNum = list(range(1, 18)) + extraWeeks

# Give every season its own copy of the week list so that editing one
# season's weeks never affects another season.
seasonDict = {season: list(weekNum) for season in yearNum}
print(seasonDict[2016])

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 'wildcard-weekend', 'divisional-playoffs', 'conf-championships', 'superbowl']


In [40]:
# Build the list of weekly page URLs to scrape: seasonLinks
# URL format:
#   regular week:    http://www.nflweather.com/en/week/2016/week-6/
#   postseason week: http://www.nflweather.com/en/week/2016/superbowl/
# The 2010 season is the exception: every week slug carries an extra "-2",
# e.g. http://www.nflweather.com/en/week/2010/week-6-2/
seasonLinks = []

for year, weeks in seasonDict.items():
    # Only the tail of the URL differs between 2010 and the other seasons.
    suffix = '-2/' if year == 2010 else '/'
    for week in weeks:
        # Integer weeks become "week-<n>"; named postseason weeks are used as-is.
        slug = 'week-' + str(week) if isinstance(week, int) else week
        seasonLinks.append('http://www.nflweather.com/en/week/' + str(year) + '/' + slug + suffix)

print(seasonLinks[:10])

['http://www.nflweather.com/en/week/2016/week-1/', 'http://www.nflweather.com/en/week/2016/week-2/', 'http://www.nflweather.com/en/week/2016/week-3/', 'http://www.nflweather.com/en/week/2016/week-4/', 'http://www.nflweather.com/en/week/2016/week-5/', 'http://www.nflweather.com/en/week/2016/week-6/', 'http://www.nflweather.com/en/week/2016/week-7/', 'http://www.nflweather.com/en/week/2016/week-8/', 'http://www.nflweather.com/en/week/2016/week-9/', 'http://www.nflweather.com/en/week/2016/week-10/']


In [41]:
# Create the CSV file that will hold the scraped data and write its header row.
# newline='' is required by the csv module: without it the writer's own "\r\n"
# line endings get translated again on Windows, producing a blank line after
# every row.
# The file is intentionally left open here because the scraping loop in a
# later cell keeps writing through `writer`.
csvFile = open("./csvFiles/weatherData2.csv", 'w+', newline='')
writer = csv.writer(csvFile)
writer.writerow(('season','week','awayTeam','homeTeam','forecast','wind'))

45

In [42]:
# First, scrape a single page to check which table cells hold the data we want.

url = 'http://www.nflweather.com/en/week/2017/week-7/'
# A timeout keeps the cell from hanging forever if the site stops responding.
page = requests.get(url, timeout=30)
soup = BeautifulSoup(page.text, "html.parser")

# Select the table cells once and index into the result, instead of
# re-querying the whole document for every field.
cells = soup.select("tbody tr td")

# Grab the teams, forecast and wind for the first game on the page.
# Strip leading and trailing whitespace.
awayTeam = cells[1].text.strip()
homeTeam = cells[5].text.strip()
forecast = cells[9].text.strip()
wind = cells[11].text.strip()

print(awayTeam)
print(homeTeam)
print(forecast)
print(wind)

Chiefs
Raiders
61f Partly Cloudy
4m SW


In [43]:
# Scrape every weekly page and write one CSV row per game.

def _season_and_week(link):
    """Recover the (season, week) pair encoded in a weekly page URL.

    link: a URL of the form .../en/week/<season>/<week-slug>/ as stored in
    seasonLinks.

    Returns a tuple of two strings: the season year and the week. Regular
    weeks are returned as their number ("6"); postseason weeks keep their
    name ("superbowl").
    """
    season, slug = link.rstrip('/').split('/')[-2:]
    # 2010 URLs carry an extra "-2" at the end of the slug. Only strip it for
    # that season, otherwise "week-2" of any other year would be mangled.
    if season == '2010' and slug.endswith('-2'):
        slug = slug[:-2]
    if slug.startswith('week-'):
        slug = slug[len('week-'):]
    return season, slug


for link in seasonLinks:
    # The 'season' and 'week' columns hold the values parsed from the URL,
    # not the URL itself.
    season, week = _season_and_week(link)

    # A timeout keeps one unresponsive page from stalling the whole scrape.
    page = requests.get(link, timeout=30)
    soup = BeautifulSoup(page.text, "html.parser")

    # Every table row on the page is one game of that week.
    for row in soup.select("tbody tr"):
        cells = row.select('td')
        # Skip rows that do not have all the cells we read (indexes up to 11).
        if len(cells) < 12:
            continue
        awayTeam = cells[1].text.strip()
        homeTeam = cells[5].text.strip()
        forecast = cells[9].text.strip()
        wind = cells[11].text.strip()

        writer.writerow((season, week, awayTeam, homeTeam, forecast, wind))

# All rows are written: close the file so buffered data is flushed to disk.
csvFile.close()

http://www.nflweather.com/en/week/2009/wildcard-weekend/
http://www.nflweather.com/en/week/2009/divisional-playoffs/
http://www.nflweather.com/en/week/2009/conf-championships/
http://www.nflweather.com/en/week/2009/superbowl/
http://www.nflweather.com/en/week/2010/week-1-2/
http://www.nflweather.com/en/week/2010/week-2-2/
http://www.nflweather.com/en/week/2010/week-3-2/
http://www.nflweather.com/en/week/2010/week-4-2/
http://www.nflweather.com/en/week/2010/week-5-2/
http://www.nflweather.com/en/week/2010/week-6-2/
http://www.nflweather.com/en/week/2010/week-7-2/
http://www.nflweather.com/en/week/2010/week-8-2/
http://www.nflweather.com/en/week/2010/week-9-2/
http://www.nflweather.com/en/week/2010/week-10-2/
http://www.nflweather.com/en/week/2010/week-11-2/
http://www.nflweather.com/en/week/2010/week-12-2/
http://www.nflweather.com/en/week/2010/week-13-2/
http://www.nflweather.com/en/week/2010/week-14-2/
http://www.nflweather.com/en/week/2010/week-15-2/
http://www.nflweather.com/en/week

In [63]:
# Final cleaned script: 

import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup, Comment
import requests

# Seasons and weeks to scrape. Regular-season weeks are the integers 1-17;
# the four postseason rounds are identified by name.
yearNum = list(range(2009, 2019))
extraWeeks = ['wildcard-weekend','divisional-playoffs','conf-championships','superbowl']
weekNum = list(range(1, 18)) + extraWeeks
# Each season gets its own copy of the week list.
seasonDict = {year: list(weekNum) for year in yearNum}

BASE_URL = 'http://www.nflweather.com/en/week/'
OUTPUT_PATH = "./csvFiles/weatherData4.csv"
REQUEST_TIMEOUT = 30  # seconds; keeps one unresponsive page from stalling the run


def build_week_url(year, week):
    """Return the weekly page URL for the given season and week.

    year: season year as an int.
    week: an int for a regular-season week, or a postseason slug such as
        'superbowl'.

    Integer weeks become "week-<n>"; postseason slugs are used as-is. The
    2010 season is special-cased: its slugs carry an extra "-2" suffix.
    """
    slug = 'week-' + str(week) if isinstance(week, int) else week
    if year == 2010:
        slug += '-2'
    return BASE_URL + str(year) + '/' + slug + '/'


# Flat list of every page URL, in season order and then week order.
seasonLinks = [
    build_week_url(year, week)
    for year, weeks in seasonDict.items()
    for week in weeks
]


def scrape_week(year, week):
    """Yield one (season, week, awayTeam, homeTeam, forecast, wind) row per game.

    Downloads the page for the given season/week and reads each table row.
    Rows that do not have all the cells being read (indexes up to 11) are
    skipped instead of raising IndexError.
    """
    page = requests.get(build_week_url(year, week), timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(page.text, "html.parser")

    for row in soup.select("tbody tr"):
        cells = row.select('td')
        if len(cells) < 12:
            continue
        yield (
            year,
            week,
            cells[1].text.strip(),
            cells[5].text.strip(),
            cells[9].text.strip(),
            cells[11].text.strip(),
        )


def main():
    """Scrape every week in seasonDict and write the results to OUTPUT_PATH."""
    # newline='' is required by the csv module to avoid blank lines on
    # Windows; the with-block guarantees the file is flushed and closed.
    with open(OUTPUT_PATH, 'w', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(('season','week','awayTeam','homeTeam','forecast','wind'))

        for year, weeks in seasonDict.items():
            for week in weeks:
                # 'season' and 'week' columns get the actual season and week,
                # not the page URL.
                writer.writerows(scrape_week(year, week))


# Runs in a notebook cell or as a script, but not when the file is imported.
if __name__ == "__main__":
    main()