In [2]:
from bs4 import BeautifulSoup
import requests
import time, os

import pandas as pd
import numpy as np

In [3]:
#Column labels for the scraped combine table, in the same order as the values
#that scrape_table puts into each player row.
columns = [
    'year',
    'name',
    'position',
    'ht',
    'wt',
    'fourty',
    'vert',
    'bench',
    'broad',
    'three_cone',
    'shuttle',
]

def scrape_table(year):
    '''Scrape one year of combine data from pro-football-reference.com.

    Parameters
    ----------
    year : int or str
        Year substituted into the combine page url. It is stored unchanged in
        the 'year' column of every returned row.

    Returns
    -------
    pandas.DataFrame
        One row per player, with the columns listed in the module-level
        `columns` list. Cells that are blank on the page become np.nan; every
        other value is kept as the text shown on the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Without this check an
        error page would be parsed as "no players" and the year would be
        silently missing from the data.
    requests.Timeout
        If the server does not answer within the timeout.
    '''

    year_url = 'https://www.pro-football-reference.com/draft/{}-combine.htm'.format(year)
    #A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(year_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    #Positions of the <td> cells we keep, in output order: position, ht, wt,
    #fourty, vert, bench, broad, three_cone, shuttle. Cells 1, 2 and 11
    #(school, college link, drafted) are not needed.
    wanted_cells = [0, 3, 4, 5, 6, 7, 8, 9, 10]
    min_cells = max(wanted_cells) + 1

    player_list = []

    #The first <tr> is the table heading, so start from the second one.
    for row in soup.find_all('tr')[1:]:
        name_cell = row.find('th')
        cells = row.find_all('td')    #look the cells up once per row

        #Every fifty or so rows the headings reappear. Those rows (and any other
        #row without a full set of data cells) have too few <td> cells, so skip them.
        if name_cell is None or len(cells) < min_cells:
            continue

        player = [year, name_cell.text] + [cells[i].text for i in wanted_cells]
        #Blank (unreported) show up as "" so we are making those np.nan
        player = [np.nan if element == "" else element for element in player]
        player_list.append(player)

    return pd.DataFrame(data=player_list, columns=columns)

In [4]:
#Scrape every combine year from 2000 through 2021. The years are kept as ints so
#the 'year' column has one consistent type (previously 2000 was an int while
#2001-2021 were strings).
years = list(range(2000, 2022))

#Collect one data frame per year, then combine them once at the end.
year_frames = []
for year in years:
    year_frames.append(scrape_table(year))
    #Pause between requests so we do not hammer the site. The delay is a
    #conservative guess -- TODO confirm against the site's current rate limit.
    time.sleep(4)

#DataFrame.append was removed in pandas 2.0; a single pd.concat also avoids
#re-copying the growing frame on every iteration.
all_data = pd.concat(year_frames, ignore_index=True)

all_data

Unnamed: 0,year,name,position,ht,wt,fourty,vert,bench,broad,three_cone,shuttle
0,2000,John Abraham,OLB,6-4,252,4.55,,,,,
1,2000,Shaun Alexander,RB,6-0,218,4.58,,,,,
2,2000,Darnell Alford,OT,6-4,334,5.56,25.0,23,94,8.48,4.98
3,2000,Kyle Allamon,TE,6-2,253,4.97,29.0,,104,7.29,4.49
4,2000,Rashard Anderson,CB,6-2,206,4.55,34.0,,123,7.18,4.15
...,...,...,...,...,...,...,...,...,...,...,...
7351,2021,Brock Wright,TE,6-4,257,4.62,31.5,26,118,7.27,4.25
7352,2021,Nahshon Wright,DB,6-4,183,4.46,31.0,6,126,7.24,4.57
7353,2021,Kenny Yeboah,TE,6-3,250,,34.0,,116,,
7354,2021,Devonte Young,DB,5-11,195,4.70,32.0,,118,7.41,4.38


In [5]:
#Save the combined combine data to disk so it can be reloaded without scraping again.
pd.to_pickle(all_data, 'all_data.pkl')