In [1]:
import pandas
import numpy
import requests
import bs4
from bs4 import BeautifulSoup
from pdb import set_trace
from collections import Counter


In [2]:
# example completed url. Note additition of '_' between sections: https://en.wikipedia.org/wiki/2000_Arizona_Cardinals_season
urlFront = "https://en.wikipedia.org/wiki/"
urlBack = "_season"


# when constructiing the url using the cities and team names, note that all spaces or hyphens need to be replaced by
# underscores in the url
# also, some teams, like the rams, have moved cities. As a result we are going to have the corresponding value for those keys
# be a list of their cities and we will surround the query with a try/catch statement
NAMETOCITY = {'Cardinals' : 'Arizona', 'Falcons' : 'Atlanta', 'Ravens' : 'Baltimore', 'Bills' : 'Buffalo', 'Panthers' : 'Carolina', 'Bears' : 'Chicago',
               'Bengals' : 'Cincinnati', 'Browns' : 'Cleveland', 'Cowboys' : 'Dallas', 'Broncos' : 'Denver', 'Lions' : 'Detroit', 'Packers' : 'Green-Bay', 
               'Texans' : 'Houston', 'Colts' : 'Indianapolis', 'Jaguars' : 'Jacksonville', 'Chiefs' : 'Kansas-City', 'Dolphins' : 'Miami', 'Vikings' : 'Minnesota',
               'Patriots' : 'New-England', 'Saints' : 'New-Orleans', 'Giants' : 'New-York', 'Jets' : 'New-York', 'Raiders' : ['Oakland', 'Las-Vegas'], 'Eagles' : 'Philadelphia',
               'Steelers' : 'Pittsburgh', 'Chargers' : ['San-Diego', 'Los-Angeles'], '49ers' : 'San-Francisco', 'Seahawks' : 'Seattle', 'Rams' : ['St.-Louis','Los-Angeles'], 'Buccaneers' : 'Tampa-bay',
               'Titans' : 'Tennessee', 'Redskins' : 'Washington'}

# Some people have 2 last names, like Kyle Vanden Bosch, the DE on the IR for the 2001 Arizona Cardinals. 
# Because of that, when I split the text by spaces and assume that the players position is the 4th element in the list
# I would find that his position is "Bosch" which obviously it isn't. It should be DE. As a result we are forming a list of 
# 'eligibile' positions. It will print out any unexpected positions so whoever is running the code can verify that no mistakes
# are made. For instance, the list below may not include "athlete" as a valid position, but upon entering the league there
# was a player named... denard robinson I think? From Michigan? who was best described as an 'athlete'. In the event that
# he is listed as such on wikipedia at any point, it will print out a list of position types that may have been overlooked
# and the user can adjust the code as necessary. 
validPositions = ["QB", "RB", "FB", "HB", 'H-B', "WR", "TE", "T", "LT", "RT", "G", "LG", "RG", "C", "OL", "DL", "NT", 
                  "DT", "DE", "RE", "LE", "OLB", "ROLB", "LOLB", "MLB", "LB", "CB", "FS", "SS", "S", "K", "P", 'H',
                 "RS", "KR", "PR", "LS", "ILB", "OT"]
# return specialist and long snapper are the two in that list that I feel might not be intuitive. 

validPositionGroups = ['quarterbacks', 'running backs', 'wide receivers', 'tight ends', 'offensive linemen', 
                      'defensive linemen', 'linebackers', 'defensive backs', 'special teams', 
                       'reserve list', 'practice squad', 'reserve lists', 'unsigned draft picks', 
                       'active/reserve lists', 'injured reserve', 'active/physically unable to perform',
                      'active/non-football injury', 'suspended', 'unrestricted free agents', 'restricted fas', 
                      'unrestricted fas', 'exclusive-rights fas']
# I added unsigned draft picks because some teams wiki pages, like the 2007 ravens, have multiple roster tables
# including preseason. So I am going to take that table down and then reassign unsigned draft picks to something else
coachingGroups = ['Head Coaches', 'Head coaches', 'Front Office', 'Front office', 'Strength and Conditioning',
                 'Strength and conditioning', 'Special Teams Coaches', 'Special teams coaches', 'Offensive Coaches', 
                  'Offensive coaches', 'Defensive Coaches', 'Defensive coaches']


# gets the HTML of the wiki page for the given team and year
def getSoup(teamName, city,  year):
    city = city.replace("-", '_')
    urlWhole = urlFront + '_' + str(year) + '_' + city + '_' + teamName + '_season'
    page = requests.get(urlWhole, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'})
    soup = BeautifulSoup(page.content, "html5lib")
    return soup
    
    
def getRoster(soup, city, teamName, year):
    # tempDF will store the roster for the current season for the current team and will be appended to the masterDF
    tempDF = pandas.DataFrame(columns=["Year", "City", "TeamName", "PositionGroup", "FirstName", "LastName", "JerseyNumber",
                                       "RosterStatus", "PositionOne", "PositionTwo", "PositionThree",
                                      'NameID', "NumberNameID", "ID"])
    
    # one of the teams (I forget which) had an issue where players were bolded if they were rookies and it messed everything
    # up. This line fixes it. Admittedly, I think it is fixed in a more elegant manner below because the problem is more 
    # widespread but I know this code works so I am not going to change it.
    if 'players in bold' in str(soup): return tempDF 
    
    # due to my lack of familiarity with the HTML language going into this project, I forgot the <b> tag just meant bold.
    # I was thinking it signified some sort of header in the tables. This lead to plenty of problems and I ended up trying
    # to force a bad solution (which works now!) but I'm sure this will serve as a reminder that a quick google search 
    # could have saved me countless hours spent debugging. To the point though, soup.find('b') just finds the first 'section'
    # in the table. Quarterbacks, specifically. 
    currNode = soup.find("b")
    positionGroup = currNode.text

    # while the current node is not starting a new table or is not h2 or h3, keep finding players
    # another instance where I did not fully understand what h2 and h3 signified. If this code needs to be modified down the
    # road (by me or anyone else from the NFL data community) I would probably do a better job ensuring that the soup
    # passed as an argument ends when you expect it to. The below line of code is certainly not the best way to do this.
    while (currNode.name != "table") and (currNode.name != "h2") and (currNode.name != "h3"):
        currNode = currNode.next
        # if the current node is a header... (again, b doesnt really mean that)
        if currNode.name == 'b':
            # this is the second work around for players names being bolded, and I think it makes the 'if players in bold... 
            # return' line obsolete. But I am keeping it anyway because it works and efficiency isn't my biggest concern
            # for this particular assignment. The below if/else statement accounts for when players names are bolded. 
            # since the players names are descendants (some nth degree of children) of something in the 'li' family, 
            # we can use that information to double check that whatever is bolded is a new position type and not a player
            if (currNode.parent.parent.name == 'li') or (currNode.parent.name == 'li'):
                pass
            # this line is because the panthers had a bold '(C)' to denote captain status. I think the above if statement
            # makes the else obsolete but I had this part first so I am keeping it. But at the end of the day, it 
            # is there to recognize when the position group changes from Quarterbacks to Running backs etc
            elif (currNode.text != '(C)'): 
                positionGroup = currNode.text.rstrip() #rstrip because some teams typed 'linebackers ' with a trailing space
                if positionGroup == 'Quarterbacks\'': # the jets did a possesive QBs one year for some reason
                    positionGroup = 'Quarterbacks'
        # if the current node is a List Item (like a row in the table. A player, in this case.)
        elif (currNode.name == 'li'): # that signifies we hit a new player
            # there were some empty rows that resulted in index out of bounds errors. This catches that.
            if 'class' in currNode.attrs:
                if 'mw-empty-elt' in currNode['class']:
                    continue
            
            # the text could look something like '9 Nick Foles QB'. Splitting it by spaces breaks it down into the parts we want
            # as an Eagles fan, Foles will always be #9 in my eyes.
            txt = currNode.text
            txt = txt.split()
            number = txt[0]
            if len(txt) < 3:
                continue
            firstName = txt[1]
            lastName = txt[2]
            # some players have names like 'J. J. Watt' and splitting by spaces would make it seem like the second 'J.' 
            # is their last name. This handles that case. Off the top of my head I could not think of an elegant solution
            # that would catch 100% of cases where I misname players, those will most likely have to be caught by hand
            # and fixed manually
            if ('.' in firstName) and ('.' in lastName):
                firstName = firstName + ' ' + lastName
                txt.remove(lastName)
                lastName = txt[2]
            try:
                # some people have their positions listed as 'G/T' because they are versatile. This code accounts for that
                positions = txt[3]
                finalPositions = ['NA', 'NA', 'NA']
                if positions in validPositions:
                    finalPositions[0] = positions
                    
                elif ('/' in positions): # some people have their positions listed as G/T for instance
                    positions =  positions.split('/')
                    for index in range(len(positions)):
                        if (not positions[index] in validPositions):
                            print()
                            print("WARNING: ASSUMED THAT " + positions[index] + ' WAS NOT A POSITION. PLEASE CORRECT validPositions' +
                             'AND RERUN CODE')
                            print('verify that the following information does not correspond to a player: ')
                            print(txt)
                            print('on the ' + str(year) + ' ' + teamName + '\'s wikipedia page')
                            print()
                        else:
                            finalPositions[index] = positions[index]
                            
                elif (positions in coachingGroups):
                    pass # we only want players
                
                elif (not positions in validPositions):
                    # some people have multiple last names, separated by spaces. This attempts to correct that
                    lastName = lastName + ' ' + positions
                    positions = txt[4]
                
                    finalPositions[0] = positions
                    
                
            except:
                finalPositions = [positionGroupToPosition(positionGroup, 'NA'), 'NA', 'NA'] 
                
            # Seattle added *s next to players names if they were in the pro bowl or some other award. This removes that
            lastName = lastName.replace('*', '').rstrip()
            nameID = firstName[0] + '.' + lastName
            numberNameID = str(number) + '-' + nameID
            # the row to be added for this particular player
            row = [year, city, teamName, positionGroup.lower(), firstName, lastName, number, 
                   positionGroupToPosition(positionGroup, finalPositions[0]),
                   finalPositions[0], finalPositions[1], finalPositions[2],
                  nameID, numberNameID, 'NA']
            # only add the row if the player is actually a player. Without this statement you risk some coaches sliding
            # in. A better way to do this would have been to make sure I was passing the right table earlier in the code
            # if anyone revises this code, start there. 
            if positionGroup.lower() in validPositionGroups:
                tempDF.loc[len(tempDF.index)] = row
            elif positionGroup in coachingGroups:
                pass
            # some positions were unexpected for me. Some teams had 'undrafted free agents' as a position group for instance.
            # these print statements helped me catch those issues. 
            else:
                print()
                print("should " + positionGroup + ' be considered valid?')
                print('verify that the following information does not correspond to a player: ')
                print(row)
                print('on the ' + str(year) + ' ' + teamName + '\'s wikipedia page')
                print()
    return(tempDF)    

# posArg is their header on wikipedia, like 'quarterbacks'. expected group is what they *should* be in.
# is used for unsigned draft picks
def positionGroupToPosition(posArg, expectedGroup):
    posGroup = posArg.lower()
    if posGroup == 'quarterbacks':
        return "QB"
    elif posGroup == 'running backs':
        return "RB"
    elif posGroup == 'wide receivers':
        return "WR"
    elif posGroup == 'tight ends':
        return "TE"
    elif posGroup == 'offensive linemen':
        return "OL"
    elif posGroup == 'defensive linemen':
        return "DL"
    elif posGroup == 'linebackers':
        return "LB"
    elif posGroup == 'defensive backs':
        return "DB"
    elif posGroup == 'special teams':
        return "ST"
    elif posGroup == 'injured reserve':             
        return positionToPosGroup(expectedGroup)
    elif posGroup == 'active/physically unable to perform':
        return positionToPosGroup(expectedGroup)
    elif posGroup == 'active/non-football injury':
        return positionToPosGroup(expectedGroup)
    elif posGroup == 'reserve list':             
        return positionToPosGroup(expectedGroup) 
    elif posGroup == 'reserve lists':            
        return positionToPosGroup(expectedGroup)
    elif posGroup == 'active/reserve lists':
        return positionToPosGroup(expectedGroup)
    elif posGroup == 'practice squad':   
        return positionToPosGroup(expectedGroup)
    elif posGroup == 'unsigned draft picks':
        return positionToPosGroup(expectedGroup)
    elif posGroup == 'suspended':
        return positionToPosGroup(expectedGroup)
    elif posGroup in ['unrestricted free agents', 'restricted fas', 
                      'unrestricted fas', 'exclusive-rights fas']:
        return positionToPosGroup(expectedGroup)
        

def positionToPosGroup(pos):
    if pos == 'QB':
        return 'QB'
    if (pos == 'HB') or (pos == 'RB') or (pos == 'H-B') or (pos == 'FB'):
        return 'RB'
    if pos == 'WR':
        return 'WR'
    if pos == 'TE':
        return 'TE'
    if (pos == 'T') or (pos == 'LT') or (pos == 'RT') or (pos == 'RG') or (pos == 'LG') or (pos == 'G') or (pos == 'C') or (pos == 'OL') or (pos == 'OT'):
        return 'OL'
    if (pos == 'DL') or (pos == 'NT') or (pos == 'DT') or (pos == 'DE') or (pos == 'RE') or (pos == 'LE'):
        return 'DL'
    if pos[-2:] == 'LB':
        return 'LB'
    if (pos == 'SS') or (pos == 'FS') or (pos == 'S') or (pos == 'CB'):
        return 'DB'
    if (pos == 'LS') or (pos == 'K') or (pos == 'P') or (pos == 'H'):
        return 'ST'
    if (pos == 'RS') or (pos == 'KR') or (pos == 'PR'):
        print('had to add a return specialist because of undrafted player, real group unknown')
        return 'RS'
    else:
        return 'unknown'
    
# place is 0 if it is at the start, 1 if at the end
def printStars(place):
    if place == 0:
        print('\n*********************************************')
        
    else:
        print('*********************************************\n')
        
    

In [3]:
masterDataFrame = pandas.DataFrame(columns=["Year", "City", "TeamName", "PositionGroup", "FirstName", "LastName", 
                                            "JerseyNumber", "RosterStatus", "PositionOne", "PositionTwo", "PositionThree",
                                           'NameID', "NumberNameID", "ID"])
# note that the range function goes from year A to year B-1. So range(2000,2020) will do seasons 2000-2019. 
keyVals = list(NAMETOCITY.keys())
oneTeam = ['Eagles']
for teamName in keyVals:
    for year in range(1999,2020):
        print('scraping ' + teamName + ' for year ' + str(year))
        
        # the code on either end of this if statement is more or less identical, but I am not going ot fix that by creating
        # a new function and using that to make it more readable right now. Maybe on a future version of this code if I come
        # back to it. 
        if (type(NAMETOCITY[teamName]) == list):
            for city in NAMETOCITY[teamName]:
                try:
                    page = getSoup(teamName, city, year)
                except: 
                    print("\n WARNING: FAILED TO GET HTML FOR " + teamName + " FOR THE YEAR " + str(year) + '\n')

                tbs = page.findAll("tbody")


                # this is where many of my issues came from. Do not do this. This is not the best way to find the
                # table you are looking for.All of the tables (i think) are headed like '2019 Philadelphia Eagles final roster'
                # just search for that and you will never get any coaches or weird position groups like UDFAs
                containsQBs = list()
                for elem in tbs:
                    if "Quarterbacks" in elem.text:
                            containsQBs.append(elem)
                            
                # the below code tries to get the roster. If it fails (for any of the reasons above) then we try again with
                # the next table. As far as im aware the most it will have to go through this for loop is 4 times. 
                try:
                    for possibleRosterTable in containsQBs:
                            #rosterAsSoup = containsQBs[1] #From the pages I checked it was always the second element in the list, hopefully that stays true
                        roster = getRoster(possibleRosterTable, city, teamName, year)
                        if type(roster) == None:
                            print("WARNING: FAILED TO GET ROSTER FOR " + teamName + " FOR THE YEAR " + str(year))
                            print("\t it is recommended that you see how widespread the issue is and whether or not it makes sense to" + 
                                     "just copy those rosters by hand. One known potential issue is if the wikipedia structure doesn't" + 
                                     "match the expected layout")
                        elif len(roster.index) == 0:
                            pass #case where it isnt none, it is just an empty dataframe
                        else: 
                            break
                except:
                    pass
                if (len(roster.index) < 30) or (len(roster.index) > 90):
                    printStars(0)
                    print('number of rows added:' + str(len(roster.index)) + ' for ' + teamName + ' in ' + str(year))
                    printStars(1)

                masterDataFrame = masterDataFrame.append(roster, ignore_index = True)
        else:
            city = NAMETOCITY[teamName]
            try:
                page = getSoup(teamName, city, year)
            except: 
                print("\n WARNING: FAILED TO GET HTML FOR " + teamName + " FOR THE YEAR " + str(year) + '\n')
                
            tbs = page.findAll("tbody")

            
            containsQBs = list()
            for elem in tbs:
                if "Quarterbacks" in elem.text:
                        containsQBs.append(elem)
            try:
                for possibleRosterTable in containsQBs:
                        #rosterAsSoup = containsQBs[1] #From the pages I checked it was always the second element in the list, hopefully that stays true
                    roster = getRoster(possibleRosterTable, city, teamName, year)
                    if type(roster) == None:
                        print("WARNING: FAILED TO GET ROSTER FOR " + teamName + " FOR THE YEAR " + str(year))
                        print("\t it is recommended that you see how widespread the issue is and whether or not it makes sense to" + 
                                 "just copy those rosters by hand. One known potential issue is if the wikipedia structure doesn't" + 
                                 "match the expected layout")
                    elif len(roster.index) == 0:
                        pass #case where it isnt none, it is just an empty dataframe
                    else: 
                        break
            except:
                pass
            if len(roster.index) == 0:
                print('number of rows added:' + str(len(roster.index)) + ' for ' + teamName + ' in ' + str(year))
                
            if (len(roster.index) < 30) or (len(roster.index) > 90):
                    printStars(0)
                    print('number of rows added:' + str(len(roster.index)) + ' for ' + teamName + ' in ' + str(year))
                    printStars(1)

            
            masterDataFrame = masterDataFrame.append(roster, ignore_index = True)
            
print('*********** DONE *********************')

scraping Cardinals for year 1999
scraping Cardinals for year 2000
scraping Cardinals for year 2001
scraping Cardinals for year 2002
scraping Cardinals for year 2003
scraping Cardinals for year 2004
scraping Cardinals for year 2005
scraping Cardinals for year 2006
scraping Cardinals for year 2007
scraping Cardinals for year 2008
scraping Cardinals for year 2009
scraping Cardinals for year 2010
scraping Cardinals for year 2011
scraping Cardinals for year 2012
scraping Cardinals for year 2013
scraping Cardinals for year 2014
scraping Cardinals for year 2015
scraping Cardinals for year 2016
scraping Cardinals for year 2017
scraping Cardinals for year 2018
scraping Cardinals for year 2019

verify that the following information does not correspond to a player: 
['Bill', 'Bidwill', 'Coaching', 'Fellowship/Running', 'Backs', '–', 'Don', 'Shumpert']
on the 2019 Cardinals's wikipedia page


verify that the following information does not correspond to a player: 
['Bill', 'Bidwill', 'Coaching', 'F

scraping Ravens for year 1999
scraping Ravens for year 2000
scraping Ravens for year 2001
scraping Ravens for year 2002
scraping Ravens for year 2003
scraping Ravens for year 2004
scraping Ravens for year 2005
scraping Ravens for year 2006
scraping Ravens for year 2007
scraping Ravens for year 2008
scraping Ravens for year 2009
scraping Ravens for year 2010
scraping Ravens for year 2011
scraping Ravens for year 2012
scraping Ravens for year 2013
scraping Ravens for year 2014

verify that the following information does not correspond to a player: 
['Defensive', 'Quality', 'Control', '/', 'Linebackers', 'Assistant', '–', 'Matt', 'Weiss']
on the 2014 Ravens's wikipedia page


verify that the following information does not correspond to a player: 
['Defensive', 'Quality', 'Control', '/', 'Linebackers', 'Assistant', '–', 'Matt', 'Weiss']
on the 2014 Ravens's wikipedia page

scraping Ravens for year 2015
scraping Ravens for year 2016
scraping Ravens for year 2017
scraping Ravens for year 201

scraping Broncos for year 2017

verify that the following information does not correspond to a player: 
['President', 'of', 'Football', 'Operations/General', 'Manager', '–', 'John', 'Elway']
on the 2017 Broncos's wikipedia page


verify that the following information does not correspond to a player: 
['President', 'of', 'Football', 'Operations/General', 'Manager', '–', 'John', 'Elway']
on the 2017 Broncos's wikipedia page

scraping Broncos for year 2018

verify that the following information does not correspond to a player: 
['President', 'of', 'football', 'operations/general', 'manager', '–', 'John', 'Elway']
on the 2018 Broncos's wikipedia page


verify that the following information does not correspond to a player: 
['President', 'of', 'football', 'operations/general', 'manager', '–', 'John', 'Elway']
on the 2018 Broncos's wikipedia page

scraping Broncos for year 2019

verify that the following information does not correspond to a player: 
['President', 'of', 'Football', 'Operation


verify that the following information does not correspond to a player: 
['Executive', 'Vice', 'President/General', 'Manager/Director', 'of', 'Football', 'Operations', '–', 'Ted', 'Thompson']
on the 2015 Packers's wikipedia page


verify that the following information does not correspond to a player: 
['Executive', 'Vice', 'President/General', 'Manager/Director', 'of', 'Football', 'Operations', '–', 'Ted', 'Thompson']
on the 2015 Packers's wikipedia page

scraping Packers for year 2016
scraping Packers for year 2017
scraping Packers for year 2018
scraping Packers for year 2019
scraping Texans for year 1999
scraping Texans for year 2000
scraping Texans for year 2001
scraping Texans for year 2002
scraping Texans for year 2003
scraping Texans for year 2004
scraping Texans for year 2005
scraping Texans for year 2006
scraping Texans for year 2007
scraping Texans for year 2008
scraping Texans for year 2009
scraping Texans for year 2010
scraping Texans for year 2011
scraping Texans for year 2

scraping Vikings for year 2017
scraping Vikings for year 2018
scraping Vikings for year 2019
scraping Patriots for year 1999

should Notations be considered valid?
verify that the following information does not correspond to a player: 
[1999, 'New-England', 'Patriots', 'notations', '1999', 'Rookie', 'R:', None, None, 'NA', 'NA', '1.Rookie', 'R:-1.Rookie', 'NA']
on the 1999 Patriots's wikipedia page


should Notations be considered valid?
verify that the following information does not correspond to a player: 
[1999, 'New-England', 'Patriots', 'notations', '1999', 'Undrafted Rookie', 'UR:', None, None, 'NA', 'NA', '1.Undrafted Rookie', 'UR:-1.Undrafted Rookie', 'NA']
on the 1999 Patriots's wikipedia page


should Notations be considered valid?
verify that the following information does not correspond to a player: 
[1999, 'New-England', 'Patriots', 'notations', 'players', 'are not', 'Italicized', None, 'on', 'NA', 'NA', 'p.are not', 'Italicized-p.are not', 'NA']
on the 1999 Patriots's wik


should Notations be considered valid?
verify that the following information does not correspond to a player: 
[2010, 'New-England', 'Patriots', 'notations', '2010', 'Rookie', 'R:', None, None, 'NA', 'NA', '2.Rookie', 'R:-2.Rookie', 'NA']
on the 2010 Patriots's wikipedia page


should Notations be considered valid?
verify that the following information does not correspond to a player: 
[2010, 'New-England', 'Patriots', 'notations', '2010', 'Undrafted Rookie', 'UR:', None, None, 'NA', 'NA', '2.Undrafted Rookie', 'UR:-2.Undrafted Rookie', 'NA']
on the 2010 Patriots's wikipedia page


should Notations be considered valid?
verify that the following information does not correspond to a player: 
[2010, 'New-England', 'Patriots', 'notations', 'players', 'are not', 'Italicized', None, 'on', 'NA', 'NA', 'p.are not', 'Italicized-p.are not', 'NA']
on the 2010 Patriots's wikipedia page

scraping Patriots for year 2011
scraping Patriots for year 2012
scraping Patriots for year 2013
scraping Patriot

scraping Jets for year 2009
scraping Jets for year 2010
scraping Jets for year 2011
scraping Jets for year 2012

verify that the following information does not correspond to a player: 
['Assistant', 'Strength', 'and', 'Conditioning/Strength', 'Assistant', '–', 'Paul', 'Ricci']
on the 2012 Jets's wikipedia page


verify that the following information does not correspond to a player: 
['Assistant', 'Strength', 'and', 'Conditioning/Strength', 'Assistant', '–', 'Paul', 'Ricci']
on the 2012 Jets's wikipedia page

scraping Jets for year 2013
scraping Jets for year 2014
scraping Jets for year 2015

should Special teams Coaches be considered valid?
verify that the following information does not correspond to a player: 
[2015, 'New-York', 'Jets', 'special teams coaches', 'Teams', '– Bobby', 'Special', None, 'April', 'NA', 'NA', 'T.– Bobby', 'Special-T.– Bobby', 'NA']
on the 2015 Jets's wikipedia page


should Special teams Coaches be considered valid?
verify that the following information does 

scraping Chargers for year 2001
scraping Chargers for year 2002
scraping Chargers for year 2003
scraping Chargers for year 2004
scraping Chargers for year 2005
scraping Chargers for year 2006
scraping Chargers for year 2007
scraping Chargers for year 2008
scraping Chargers for year 2009
scraping Chargers for year 2010
scraping Chargers for year 2011
scraping Chargers for year 2012
scraping Chargers for year 2013
scraping Chargers for year 2014
scraping Chargers for year 2015
scraping Chargers for year 2016
scraping Chargers for year 2017
scraping Chargers for year 2018
scraping Chargers for year 2019
scraping 49ers for year 1999
scraping 49ers for year 2000
scraping 49ers for year 2001
scraping 49ers for year 2002
scraping 49ers for year 2003
scraping 49ers for year 2004
scraping 49ers for year 2005
scraping 49ers for year 2006
scraping 49ers for year 2007
scraping 49ers for year 2008
scraping 49ers for year 2009
scraping 49ers for year 2010
scraping 49ers for year 2011
scraping 49ers 

scraping Titans for year 2003

verify that the following information does not correspond to a player: 
['Founder/Owner/Chairman', 'of', 'the', 'Board/CEO', '–', 'Bud', 'Adams']
on the 2003 Titans's wikipedia page


verify that the following information does not correspond to a player: 
['Founder/Owner/Chairman', 'of', 'the', 'Board/CEO', '–', 'Bud', 'Adams']
on the 2003 Titans's wikipedia page

scraping Titans for year 2004

verify that the following information does not correspond to a player: 
['Founder/Owner/Chairman', 'of', 'the', 'Board/President/CEO', '–', 'Bud', 'Adams']
on the 2004 Titans's wikipedia page


verify that the following information does not correspond to a player: 
['Founder/Owner/Chairman', 'of', 'the', 'Board/President/CEO', '–', 'Bud', 'Adams']
on the 2004 Titans's wikipedia page


verify that the following information does not correspond to a player: 
['Founder/Owner/Chairman', 'of', 'the', 'Board/President/CEO', '–', 'Bud', 'Adams']
on the 2004 Titans's wikipe

In [4]:
masterDataFrame.to_csv('1999_2019_NFL_ROSTERS.csv')

In [6]:
masterDataFrame.shape

(50308, 14)

In [19]:
Counter(masterDataFrame[(masterDataFrame.Year == 2009) & (masterDataFrame.TeamName == 'Falcons')].PositionGroup)

Counter({'quarterbacks': 3,
         'running backs': 6,
         'wide receivers': 5,
         'tight ends': 4,
         'offensive linemen': 8,
         'defensive linemen': 9,
         'linebackers': 6,
         'defensive backs': 8,
         'special teams': 4,
         'reserve lists': 11,
         'practice squad': 11})

In [66]:
wilson = masterDataFrame.LastName[1]

In [17]:
testList = [1,2,3,4]
for number in testList:
    if number == 3:
        continue
    print(number)

1
2
4


In [45]:
masterDataFrame[(masterDataFrame.TeamName == 'Ravens') &
               (masterDataFrame.Year == 2009)]

Unnamed: 0,Year,City,TeamName,PositionGroup,FirstName,LastName,JerseyNumber,RosterStatus,PositionOne,PositionTwo,PositionThree
0,2009,Baltimore,Ravens,quarterbacks,John,Beck,12,QB,QB,,
1,2009,Baltimore,Ravens,quarterbacks,Joe,Flacco,5,QB,QB,,
2,2009,Baltimore,Ravens,quarterbacks,Troy,Smith,10,QB,QB,,
3,2009,Baltimore,Ravens,running backs,Jason,Cook,39,RB,FB,,
4,2009,Baltimore,Ravens,running backs,Matt,Lawrence,32,RB,RB,,
5,2009,Baltimore,Ravens,running backs,Le'Ron,McClain,33,RB,FB,,
6,2009,Baltimore,Ravens,running backs,Jalen,Parmele,34,RB,RB,,
7,2009,Baltimore,Ravens,running backs,Cedric,Peerman,38,RB,RB,,
8,2009,Baltimore,Ravens,running backs,Ray,Rice,27,RB,RB,,
9,2009,Baltimore,Ravens,wide receivers,Drew,Bennett,18,WR,WR,,


In [35]:
lbString = 'LB'
lbString[-2:]

'LB'

In [13]:
testSoup = getSoup(city='Seattle', teamName='Seahawks', year = 2013)

In [14]:
testSoup

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en"><head>
<meta charset="utf-8"/>
<title>2013 Seattle Seahawks season - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"24129ca0-5b4c-468b-9d57-6b732cecac47","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"2013_Seattle_Seahawks_season","wgTitle":"2013 Seattle Seahawks season","wgCurRevisionId":959355204,"wgRevisionId":959355204,"wgArticleId":37982259,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Pages using deprecated image syntax","Articles that may contain original research from Jan

In [35]:
markup = r"""<li><span style="font-family: 'Octin Sports','Octin College','Octin Stencil','Octin Vintage','Octin Prison', 'Courier New','Courier', monospace;"><b>18</b></span><b> <a href="/wiki/Sidney_Rice" title="Sidney Rice">Sidney Rice</a> WR</b> <small>(IR)</small> <a class="image" href="/wiki/File:Injury_icon_2.svg"><img alt="Injury icon 2.svg" data-file-height="300" data-file-width="300" decoding="async" height="10" src="//upload.wikimedia.org/wikipedia/commons/thumb/6/61/Injury_icon_2.svg/10px-Injury_icon_2.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/6/61/Injury_icon_2.svg/15px-Injury_icon_2.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/6/61/Injury_icon_2.svg/20px-Injury_icon_2.svg.png 2x" width="10"/></a></li>'"""
rice = BeautifulSoup(markup, 'html5lib')
rice

<html><head></head><body><li><span style="font-family: 'Octin Sports','Octin College','Octin Stencil','Octin Vintage','Octin Prison', 'Courier New','Courier', monospace;"><b>18</b></span><b> <a href="/wiki/Sidney_Rice" title="Sidney Rice">Sidney Rice</a> WR</b> <small>(IR)</small> <a class="image" href="/wiki/File:Injury_icon_2.svg"><img alt="Injury icon 2.svg" data-file-height="300" data-file-width="300" decoding="async" height="10" src="//upload.wikimedia.org/wikipedia/commons/thumb/6/61/Injury_icon_2.svg/10px-Injury_icon_2.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/6/61/Injury_icon_2.svg/15px-Injury_icon_2.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/6/61/Injury_icon_2.svg/20px-Injury_icon_2.svg.png 2x" width="10"/></a></li>'</body></html>

In [41]:
rice = rice.next
rice

<b>18</b>

In [45]:
rice.parent.parent.name

'li'

In [84]:
wilson = 'Wilson *	'

In [87]:
wilson.replace('\t', '').rstrip()

'Wilson *'

In [78]:
'Wilson *	'

'Wilson *\t'

'*'