# Loading Libraries

In [1]:
import requests
import re
from bs4 import BeautifulSoup, Comment
import pandas as pd
import numpy as np

# Defining get_stats function

In [2]:
def get_stats(table, columns):
  # this function takes the stats table and returns a dataframe of the stats
  
  # define the DataFrame
  df = pd.DataFrame(columns=columns)  

  # finds all rows in the table
  rows = table[0].find_all('tr')

  # this counter will be used along with total_games to get the total games played in the final row of the dataframe.
  row_counter = 3
  total_games = 0

  # loop over each row in the table, skipping over header rows (0,1)
  for row in rows[2:]:
    # creating a list where we will store the stats for each row.  We will then append this list to our df
    stat_list = []
    # this part is for the first rows of our table.  these rows contain the year in an 'a' tag
    if row_counter<(len(rows)):
      # creating cells which contains all the relevant stats inside td tags, which we pull using text.
      cells = row.find_all('td')
      # this variable will be used to find cell 5.  cell 5 contains the total games.
      cell_counter=0
      # appending the year to our list... the year is contained in an 'a' tag
      stat_list.append(row.find('a').text)
      # loop for pulling all the relevant stats from cells
      for cell in cells:
        # adding a section so the last line, career will also have the conf, class and pos
        if cell_counter == 1:
          stat_list.append(cell.text)
          conf = cell.text
          cell_counter +=1
        elif cell_counter == 2:
          stat_list.append(cell.text)
          clas = cell.text
          cell_counter +=1
        elif cell_counter == 3:
          stat_list.append(cell.text)
          position = cell.text
          cell_counter +=1

        # when the cell count is 4, that cell contains the games played... we want to total this for use in the last column 
        elif cell_counter == 4:
          games = cell.text
          # try except for catch nulls error
          try:
            games = int(games)
            total_games += games
          except ValueError:
            total_games += 0
          stat_list.append(games)
          cell_counter +=1
        # this is for all the other cells
        else:
          stat_list.append(cell.text)
          cell_counter +=1
        
    # this part is for the last row of our table we are scrapping. it contains career stats
    else:
      cells = row.find_all('td')
      cell_counter=0
      # this last column does not contain year.
      stat_list.append('Career')
      for cell in cells:
        if cell_counter == 1:
          stat_list.append(conf)
          cell_counter +=1
        elif cell_counter == 2:
          stat_list.append(clas)
          cell_counter +=1
        elif cell_counter == 3:
          stat_list.append(position)
          cell_counter +=1
        # cell 4 contains total games
        elif cell_counter == 4:
          stat_list.append(total_games)
          cell_counter +=1
        else:
          stat_list.append(cell.text)
          cell_counter +=1
    df = df.append(pd.DataFrame([stat_list], columns=columns))
    row_counter+=1
  return df

# Defining Offensive Columns

In [3]:
pass_columns = ['year','school','conference','class','position','games','completions','attempts','comp_pct','passing_yds','yds_per_att','adj_yds_per_att','passing_td','interceptions','pass_eff_rate']
rush_columns = ['year','school','conference','class','position','games','rush_attempts','rush_yds','rush_avg','rush_td','receptions','rec_yds','rec_avg','rec_td','total_plays','total_yds','total_avg','total_td']
rec_columns = ['year','school','conference','class','position','games','receptions','rec_yds','rec_avg','rec_td','rush_attempts','rush_yds','rush_avg','rush_td','total_plays','total_yds','total_avg','total_td']

# looking at WR

In [4]:
url = 'https://www.sports-reference.com/cfb/players/jaylen-waddle-1.html'

In [5]:
r = requests.get(url)
page = BeautifulSoup(r.content, 'html.parser')

In [6]:
tables = page.find_all('table')

In [7]:
len(tables)

1

In [8]:
type(tables)

bs4.element.ResultSet

## testing function

In [9]:
rows = tables[0].find_all('tr')

In [10]:
rows[2].find('a')

<a href="/cfb/years/2018.html">2018</a>

In [11]:
rec_df = get_stats(tables, rec_columns)

  df = df.append(pd.DataFrame([stat_list], columns=columns))
  df = df.append(pd.DataFrame([stat_list], columns=columns))
  df = df.append(pd.DataFrame([stat_list], columns=columns))
  df = df.append(pd.DataFrame([stat_list], columns=columns))


In [12]:
rec_df

Unnamed: 0,year,school,conference,class,position,games,receptions,rec_yds,rec_avg,rec_td,rush_attempts,rush_yds,rush_avg,rush_td,total_plays,total_yds,total_avg,total_td
0,2018,Alabama,SEC,FR,WR,15,45,848,18.8,7,0,0,,0,45,848,18.8,7
0,2019,Alabama,SEC,SO,WR,13,33,560,17.0,6,1,5,5.0,0,34,565,16.6,6
0,2020,Alabama,SEC,JR,WR,6,28,591,21.1,4,3,12,4.0,0,31,603,19.5,4
0,Career,Alabama,SEC,JR,WR,34,106,1999,18.9,17,4,17,4.3,0,110,2016,18.3,17


In [13]:
print(page.prettify())

<!DOCTYPE html>
<html class="no-js" data-root="/home/cfb/build" data-version="klecko-" lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="ie=edge" http-equiv="x-ua-compatible"/>
  <meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport">
   <link href="https://cdn.ssref.net/req/202303021" rel="dns-prefetch"/>
   <!-- Quantcast Choice. Consent Manager Tag v2.0 (for TCF 2.0) -->
   <script async="true" type="text/javascript">
    (function() {
	var host = window.location.hostname;
	var element = document.createElement('script');
	var firstScript = document.getElementsByTagName('script')[0];
	var url = 'https://cmp.quantcast.com'
	    .concat('/choice/', 'XwNYEpNeFfhfr', '/', host, 
		    '/choice.js?tag_version=V2');
	var uspTries = 0;
	var uspTriesLimit = 3;
	element.async = true;
	element.type = 'text/javascript';
	element.src = url;
	
	firstScript.parentNode.insertBefore(element, firstScript);
	
	function makeStub() {
	    var TCF_LOCATOR_NAM

## Results:
- Ok so looking at a WR table, we only get only table using find all for table tag. this is somewhat to be expected.
- BUT the table object we are getting is not a bs4 object, but a ResultSet... this required changing our function code and adding a [0]
- Should be noted that the table id = receiving
-- will have to check RB's to see what their table is named

## Testing Punt RET with offensive players

In [14]:
all_punt_ret = page.find(id="all_punt_ret")

punt_ret = all_punt_ret(text=lambda text: isinstance(text, Comment))[0]

In [15]:
soup=BeautifulSoup(punt_ret, 'html.parser')

In [16]:
table = soup.find('table', {'id': 'punt_ret'})
#columns
columns2= ['Year','School','Conf','Class','Pos','G','PuntRet','PuntYds','PuntAvg','PuntTD','KickRet','KickYds','KickAvg','KickTD']

# Create empty DataFrame with columns
df2 = pd.DataFrame(columns=columns2)

# Loop through each row of the table
rows= table.find_all('tr')

# rows 0 and 1 have useless text in them
# so we are creating a counter to start at 3, we use this counter to determine when we are on the last row and then can enter 'Career' for the year.
# we will also use it to get the total games played.
row_counter= 3 
total_games=0
# starting our loop
for row in rows[2:]:
  # creating a list where we will store the stats for each row.  We will then append this list to our df
  stat_list = []
  # this part is for the first rows of our table.  these rows contain the year in an 'a' tag
  if row_counter<(len(rows)):
    # creating cells which contains all the relevant stats inside td tags, which we pull using text.
    cells = row.find_all('td')
    # this variable will be used to find cell 5.  cell 5 contains the total games.
    cell_counter=0
    # appending the year to our list... the year is contained in an 'a' tag
    stat_list.append(row.find('a').text)
    # loop for pulling all the relevant stats from cells
    for cell in cells:
      # when the cell count is 4, that cell contains the games played... we want to total this for use in the last column 
      if cell_counter == 4:
        games = cell.text
        # try except for catch nulls error
        try:
          games = int(games)
          total_games += games
        except ValueError:
          total_games += 0
        stat_list.append(games)
        cell_counter +=1
      # this is for all the other cells
      else:
        stat_list.append(cell.text)
        cell_counter +=1
      
  # this part is for the last row of our table we are scrapping. it contains career stats
  else:
    cells = row.find_all('td')
    cell_counter=0
    # this last column does not contain year.
    stat_list.append('Career')
    for cell in cells:
      # cell 4 contains total games
      if cell_counter == 4:
        stat_list.append(total_games)
        cell_counter +=1
      else:
        stat_list.append(cell.text)
        cell_counter +=1
  df2 = df2.append(pd.DataFrame([stat_list], columns=columns2))
  row_counter+=1

  df2 = df2.append(pd.DataFrame([stat_list], columns=columns2))
  df2 = df2.append(pd.DataFrame([stat_list], columns=columns2))
  df2 = df2.append(pd.DataFrame([stat_list], columns=columns2))
  df2 = df2.append(pd.DataFrame([stat_list], columns=columns2))


In [17]:
df2

Unnamed: 0,Year,School,Conf,Class,Pos,G,PuntRet,PuntYds,PuntAvg,PuntTD,KickRet,KickYds,KickAvg,KickTD
0,2018,Alabama,SEC,FR,WR,15,16,233,14.6,1,0,0,,0
0,2019,Alabama,SEC,SO,WR,13,20,487,24.4,1,5,175,35.0,1
0,2020,Alabama,SEC,JR,WR,6,2,13,6.5,0,4,39,9.8,0
0,Career,Alabama,,,,34,38,733,19.3,2,9,214,23.8,1


### Results:
awesome!  Punt return works the same for offense and defense... now time to check other positions

# Looking at RB:

In [18]:
url = 'https://www.sports-reference.com/cfb/players/najee-harris-1.html'

In [19]:
r = requests.get(url)
page = BeautifulSoup(r.content, 'html.parser')

In [20]:
tables = page.find_all('table')

In [21]:
len(tables)

1

In [22]:
tables

[<table class="sortable stats_table" data-cols-to-freeze=",1" id="rushing">
 <caption>Rushing &amp; Receiving Table</caption>
 <colgroup><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/></colgroup>
 <thead>
 <tr class="over_header">
 <th aria-label="" class="over_header center" colspan="5" data-stat=""></th><th></th>
 <th aria-label="" class="over_header center" colspan="4" data-stat="header_rushing">Rushing</th>
 <th aria-label="" class="over_header center" colspan="4" data-stat="header_receiving">Receiving</th>
 <th aria-label="" class="over_header center" colspan="4" data-stat="header_scrimmage">Scrimmage</th>
 </tr>
 <tr>
 <th aria-label="Year" class="poptip sort_default_asc right" data-stat="year_id" scope="col">Year</th>
 <th aria-label="School" class="poptip sort_default_asc left" data-stat="school_name" data-tip="School Name" scope="col">School</th>
 <th aria-label="Conf" class="poptip sort_default_asc left" data-stat="

## testing function

In [23]:
rush_df = get_stats(tables, rush_columns)

  df = df.append(pd.DataFrame([stat_list], columns=columns))
  df = df.append(pd.DataFrame([stat_list], columns=columns))
  df = df.append(pd.DataFrame([stat_list], columns=columns))
  df = df.append(pd.DataFrame([stat_list], columns=columns))
  df = df.append(pd.DataFrame([stat_list], columns=columns))


In [24]:
rush_df

Unnamed: 0,year,school,conference,class,position,games,rush_attempts,rush_yds,rush_avg,rush_td,receptions,rec_yds,rec_avg,rec_td,total_plays,total_yds,total_avg,total_td
0,2017,Alabama,SEC,FR,RB,10,61,370,6.1,3,6,45,7.5,0,67,415,6.2,3
0,2018,Alabama,SEC,SO,RB,15,117,783,6.7,4,4,7,1.8,0,121,790,6.5,4
0,2019,Alabama,SEC,JR,RB,13,209,1224,5.9,13,27,304,11.3,7,236,1528,6.5,20
0,2020,Alabama,SEC,SR,RB,13,251,1466,5.8,26,43,425,9.9,4,294,1891,6.4,30
0,Career,Alabama,SEC,SR,RB,51,638,3843,6.0,46,80,781,9.8,11,718,4624,6.4,57


## Results:
- Ok so this table... I just realized is listed as Rushing and Receiving... wheras a WR is listed as Receiving and Rushing.... seems kinda interesting they did that and unncessary but whatever.
-- the table id = rushing

# looking at QB:

In [25]:
url='https://www.sports-reference.com/cfb/players/justin-fields-2.html'

In [26]:
r = requests.get(url)
page = BeautifulSoup(r.content, 'html.parser')

In [27]:
tables = page.find_all('table')

In [28]:
len(tables)

1

In [29]:
type(tables)

bs4.element.ResultSet

## Testing Function

In [30]:
pass_df = get_stats(tables, pass_columns)

  df = df.append(pd.DataFrame([stat_list], columns=columns))
  df = df.append(pd.DataFrame([stat_list], columns=columns))
  df = df.append(pd.DataFrame([stat_list], columns=columns))


AttributeError: ignored

In [31]:
type(tables[0])

bs4.element.Tag

In [32]:
rows = tables[0].find_all('tr')

In [33]:
type(rows)

bs4.element.ResultSet

In [34]:
rows[2].find('a').text

'2018'

In [35]:
 for row in rows[2:]:
   print(row.find('a').text)

2018
2019
2020


AttributeError: ignored

In [36]:
rows[6]

<tr class="bold"><th class="right iz" data-stat="year_id" scope="row"></th><td class="left" data-stat="school_name"><a href="/cfb/schools/georgia/">Georgia</a></td><td class="left iz" data-stat="conf_abbr"></td><td class="center iz" data-stat="class"></td><td class="center iz" data-stat="pos"></td><td class="right iz" data-stat="g"></td><td class="right" data-stat="pass_cmp">27</td><td class="right" data-stat="pass_att">39</td><td class="right" data-stat="pass_cmp_pct">69.2</td><td class="right" data-stat="pass_yds">328</td><td class="right" data-stat="pass_yds_per_att">8.4</td><td class="right" data-stat="adj_pass_yds_per_att">10.5</td><td class="right" data-stat="pass_td">4</td><td class="right iz" data-stat="pass_int">0</td><td class="right" data-stat="pass_rating">173.7</td></tr>

## Results:
- Once again... as expected.  We only get one table.
-- The table id is 'passing'

- You can't see the in the code above, but the function works with a normal qb...  but luckily I tried it with my favorite QB, justin fields... who played for multiple teams... and this caused an error.  My function makes some incorrect assumptions that the last row will always be the career row. so I must fix my function now.

## Finding 'rushing' inside the qb page

In [37]:
print(page.prettify())

<!DOCTYPE html>
<html class="no-js" data-root="/home/cfb/build" data-version="klecko-" lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="ie=edge" http-equiv="x-ua-compatible"/>
  <meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport">
   <link href="https://cdn.ssref.net/req/202303021" rel="dns-prefetch"/>
   <!-- Quantcast Choice. Consent Manager Tag v2.0 (for TCF 2.0) -->
   <script async="true" type="text/javascript">
    (function() {
	var host = window.location.hostname;
	var element = document.createElement('script');
	var firstScript = document.getElementsByTagName('script')[0];
	var url = 'https://cmp.quantcast.com'
	    .concat('/choice/', 'XwNYEpNeFfhfr', '/', host, 
		    '/choice.js?tag_version=V2');
	var uspTries = 0;
	var uspTriesLimit = 3;
	element.async = true;
	element.type = 'text/javascript';
	element.src = url;
	
	firstScript.parentNode.insertBefore(element, firstScript);
	
	function makeStub() {
	    var TCF_LOCATOR_NAM

In [38]:
# ok, I think what we are looking for is 'id="all_rushing"'

In [39]:
all_rush = page.find(id="all_rushing")
rush = all_rush(text=lambda text: isinstance(text, Comment))[0]
print(rush)



<div class="table_container" id="div_rushing">
    
    <table class="sortable stats_table" id="rushing" data-cols-to-freeze=",1">
    <caption>Rushing &amp; Receiving Table</caption>
    

   <colgroup><col><col><col><col><col><col><col><col><col><col><col><col><col><col><col><col><col><col></colgroup>
   <thead>

      
      <tr class="over_header">
         <th aria-label="" data-stat="" colspan="5" class=" over_header center" ></th><th></th>
         <th aria-label="" data-stat="header_rushing" colspan="4" class=" over_header center" >Rushing</th>
         <th aria-label="" data-stat="header_receiving" colspan="4" class=" over_header center" >Receiving</th>
         <th aria-label="" data-stat="header_scrimmage" colspan="4" class=" over_header center" >Scrimmage</th>
      </tr>
            
      <tr>
         <th aria-label="Year" data-stat="year_id" scope="col" class=" poptip sort_default_asc right" >Year</th>
         <th aria-label="School" data-stat="school_name" scope="co

In [40]:
all_rush

<div class="table_wrapper setup_commented commented" id="all_rushing">
<div class="section_heading assoc_rushing" id="rushing_sh">
<span class="section_anchor" data-label="Rushing &amp; Receiving" id="rushing_link"></span><h2>Rushing &amp; Receiving</h2> <div class="section_heading_text">
<ul><li>* indicates bowl stats included</li>
</ul>
</div>
</div><div class="placeholder"></div>
<!--

<div class="table_container" id="div_rushing">
    
    <table class="sortable stats_table" id="rushing" data-cols-to-freeze=",1">
    <caption>Rushing &amp; Receiving Table</caption>
    

   <colgroup><col><col><col><col><col><col><col><col><col><col><col><col><col><col><col><col><col><col></colgroup>
   <thead>

      
      <tr class="over_header">
         <th aria-label="" data-stat="" colspan="5" class=" over_header center" ></th><th></th>
         <th aria-label="" data-stat="header_rushing" colspan="4" class=" over_header center" >Rushing</th>
         <th aria-label="" data-stat="header_rece

In [41]:
soup=BeautifulSoup(rush, 'html.parser')

In [42]:
table = soup.find('table', {'id': 'rushing'})

In [43]:
table

<table class="sortable stats_table" data-cols-to-freeze=",1" id="rushing">
<caption>Rushing &amp; Receiving Table</caption>
<colgroup><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/></colgroup>
<thead>
<tr class="over_header">
<th aria-label="" class="over_header center" colspan="5" data-stat=""></th><th></th>
<th aria-label="" class="over_header center" colspan="4" data-stat="header_rushing">Rushing</th>
<th aria-label="" class="over_header center" colspan="4" data-stat="header_receiving">Receiving</th>
<th aria-label="" class="over_header center" colspan="4" data-stat="header_scrimmage">Scrimmage</th>
</tr>
<tr>
<th aria-label="Year" class="poptip sort_default_asc right" data-stat="year_id" scope="col">Year</th>
<th aria-label="School" class="poptip sort_default_asc left" data-stat="school_name" data-tip="School Name" scope="col">School</th>
<th aria-label="Conf" class="poptip sort_default_asc left" data-stat="conf_abbr" sco

## Results:
- Finding Rushing inside the QB page is just like finding Punt Ret in the other pages
-- It's commented out so you have to work around that
- just need to create columns and code for dealing with all different types of tables and we are good.

# Fixing our Function

In [44]:
import requests
import re
from bs4 import BeautifulSoup, Comment
import pandas as pd
import numpy as np

In [45]:
url='https://www.sports-reference.com/cfb/players/justin-fields-2.html'
r = requests.get(url)
page = BeautifulSoup(r.content, 'html.parser')
tables = page.find_all('table')

In [46]:
rows = tables[0].find_all('tr')

In [47]:
rows[5].find('th').text

'Career'

In [48]:
rows[6].find('th').text

''

In [49]:
def get_stats(table, columns):
  # this function takes the stats table and returns a dataframe of the stats
  
  # define the DataFrame
  df = pd.DataFrame(columns=columns)  

  # finds all rows in the table
  rows = table[0].find_all('tr')

  # this counter will be used along with total_games to get the total games played in the final row of the dataframe and to index the dataframe to exit the loop.
  row_counter = 0
  total_games = 0

  # loop over each row in the table, skipping over header rows (0,1)
  for row in rows[2:]:
    # creating a list where we will store the stats for each row.  We will then append this list to our df
    stat_list = []
    
    if row.find('th').text == 'Career':
      stat_list.append(row.find('th').text)
    else:
    # this part is for the first rows of our table.  these rows contain the year in an 'a' tag
      try: 
        stat_list.append(row.find('a').text)
      except AttributeError:
        # after year values, we find the text 'Career' inside a th tag.
        try:
          stat_list.append(row.find('th').text)
        # if neither 'a' nor 'th' tags are present
        except AttributeError:
          break
        
    # cells will contain all the data inside our table td tags
    cells = row.find_all('td')
    
    # counter to be used for total games
    cell_counter=0
      
    # loop for pulling all the relevant stats from cells
    for cell in cells:
      # when the cell count is 4, that cell contains the games played... we want to total this for use in the last row
      if cell_counter == 4:
        games = cell.text
        # try/except for catch nulls error
        try:
          games = int(games)
          total_games += games
        except ValueError:
          total_games += 0
        stat_list.append(games)
        cell_counter +=1
      
      # this is for all the other cells
      else:
        stat_list.append(cell.text)
        cell_counter +=1
    
    df = df.append(pd.DataFrame([stat_list], columns=columns))
    # this is our break to stop the issue when a player has multiple colleges and they are tallied AFTER career
    if df.iloc[row_counter,0]=='Career':
      for i in range(2,5):
        df.iloc[row_counter,i]= df.iloc[row_counter-1,i]
      df.iloc[row_counter,5] = total_games
      break
    row_counter+=1
  return df

In [50]:
pass_df = get_stats(tables, pass_columns)

  df = df.append(pd.DataFrame([stat_list], columns=columns))
  df = df.append(pd.DataFrame([stat_list], columns=columns))
  df = df.append(pd.DataFrame([stat_list], columns=columns))
  df = df.append(pd.DataFrame([stat_list], columns=columns))


In [51]:
pass_df

Unnamed: 0,year,school,conference,class,position,games,completions,attempts,comp_pct,passing_yds,yds_per_att,adj_yds_per_att,passing_td,interceptions,pass_eff_rate
0,2018,Georgia,SEC,FR,QB,12,27,39,69.2,328,8.4,10.5,4,0,173.7
0,2019,Ohio State,Big Ten,SO,QB,14,238,354,67.2,3273,9.2,11.2,41,3,181.4
0,2020,Ohio State,Big Ten,JR,QB,8,158,225,70.2,2100,9.3,10.1,22,6,175.6
0,Career,Overall,Big Ten,JR,QB,34,423,618,68.4,5701,9.2,10.7,67,9,178.8


In [52]:
url = 'https://www.sports-reference.com/cfb/players/jaylen-waddle-1.html'
r = requests.get(url)
page = BeautifulSoup(r.content, 'html.parser')
tables = page.find_all('table')

In [53]:
rec_df= get_stats(tables, rec_columns)

  df = df.append(pd.DataFrame([stat_list], columns=columns))
  df = df.append(pd.DataFrame([stat_list], columns=columns))
  df = df.append(pd.DataFrame([stat_list], columns=columns))
  df = df.append(pd.DataFrame([stat_list], columns=columns))


In [54]:
rec_df

Unnamed: 0,year,school,conference,class,position,games,receptions,rec_yds,rec_avg,rec_td,rush_attempts,rush_yds,rush_avg,rush_td,total_plays,total_yds,total_avg,total_td
0,2018,Alabama,SEC,FR,WR,15,45,848,18.8,7,0,0,,0,45,848,18.8,7
0,2019,Alabama,SEC,SO,WR,13,33,560,17.0,6,1,5,5.0,0,34,565,16.6,6
0,2020,Alabama,SEC,JR,WR,6,28,591,21.1,4,3,12,4.0,0,31,603,19.5,4
0,Career,Alabama,SEC,JR,WR,34,106,1999,18.9,17,4,17,4.3,0,110,2016,18.3,17


In [55]:
rows = tables[0].find_all('tr')

In [56]:
rows[5]

<tr class="bold"><th class="right" data-stat="year_id" scope="row">Career</th><td class="left" data-stat="school_name"><a href="/cfb/schools/alabama/">Alabama</a></td><td class="left iz" data-stat="conf_abbr"></td><td class="center iz" data-stat="class"></td><td class="center iz" data-stat="pos"></td><td class="right iz" data-stat="g"></td><td class="right" data-stat="rec">106</td><td class="right" data-stat="rec_yds">1999</td><td class="right" data-stat="rec_yds_per_rec">18.9</td><td class="right" data-stat="rec_td">17</td><td class="right" data-stat="rush_att">4</td><td class="right" data-stat="rush_yds">17</td><td class="right" data-stat="rush_yds_per_att">4.3</td><td class="right iz" data-stat="rush_td">0</td><td class="right" data-stat="scrim_att">110</td><td class="right" data-stat="scrim_yds">2016</td><td class="right" data-stat="scrim_yds_per_att">18.3</td><td class="right" data-stat="scrim_td">17</td></tr>

In [57]:
url = 'http://www.sports-reference.com/cfb/players/kayvon-thibodeaux-1.html'
r = requests.get(url)
page = BeautifulSoup(r.content, 'html.parser')
tables = page.find_all('table')

In [58]:
def_columns = ['year', 'school', 'conference', 'class', 'position', 'games', 'solo_tackles', 'assisted_tackles', 'total_tackles', 'tackles_loss', 'sacks', 'interceptions', 'int_yards', 'int_yards_per_int', 'int_td', 'passes_defended', 'fumbles_recovered', 'fumble_yards', 'fumble_td', 'forced_fumbles']
punt_columns = ['year','school','conference','class','position','games','punt_returns','punt_ret_yds','punt_ret_avg','punt_td','kick_returns','kick_ret_yds','kick_ret_avg','kick_ret_td']
pass_columns = ['year','school','conference','class','position','games','completions','attempts','comp_pct','passing_yds','yds_per_att','adj_yds_per_att','passing_td','interceptions','pass_eff_rate']
rush_columns = ['year','school','conference','class','position','games','rush_attempts','rush_yds','rush_avg','rush_td','receptions','rec_yds','rec_avg','rec_td','total_plays','total_yds','total_avg','total_td']
rec_columns = ['year','school','conference','class','position','games','receptions','rec_yds','rec_avg','rec_td','rush_attempts','rush_yds','rush_avg','rush_td','total_plays','total_yds','total_avg','total_td']

In [59]:
def_df = get_stats(tables, def_columns)

  df = df.append(pd.DataFrame([stat_list], columns=columns))
  df = df.append(pd.DataFrame([stat_list], columns=columns))
  df = df.append(pd.DataFrame([stat_list], columns=columns))
  df = df.append(pd.DataFrame([stat_list], columns=columns))


In [60]:
def_df

Unnamed: 0,year,school,conference,class,position,games,solo_tackles,assisted_tackles,total_tackles,tackles_loss,sacks,interceptions,int_yards,int_yards_per_int,int_td,passes_defended,fumbles_recovered,fumble_yards,fumble_td,forced_fumbles
0,2019,Oregon,Pac-12,FR,DE,13,24,11,35,14.0,9.0,0,0,,0,3,0,,,1
0,2020,Oregon,Pac-12,SO,DE,7,25,17,42,9.5,3.0,0,0,,0,3,0,,,0
0,2021,Oregon,Pac-12,SO,DE,10,35,14,49,12.0,7.0,0,0,,0,1,0,,,2
0,Career,Oregon,Pac-12,SO,DE,30,84,42,126,35.5,19.0,0,0,,0,7,0,,,3
