## Imports

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from time import sleep
from datetime import datetime

from selenium import webdriver

# Sports Reference College Football

# Selenium Solution

In [2]:
# Link to the main page: the Sports Reference college football (cfb) index of years.
# The scraping cell further down appends "<year>-standings.html" and similar
# suffixes to this same prefix to reach the per-year pages.
url = 'https://www.sports-reference.com/cfb/years/'

In [3]:
# Start a Chrome session driven by the chromedriver binary at this relative path.
# NOTE(review): the path points at a "macos" folder, so it presumably only works on macOS — confirm.
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a Service object —
# confirm which Selenium version this notebook is pinned to before upgrading.
driver = webdriver.Chrome(executable_path="./chromedriver/macos/chromedriver")

In [4]:
# Load the years index page in the Selenium-controlled browser.
driver.get(url)

In [5]:
# Creating soup for the initial page stored in the "url" variable, using the HTML
# that the browser currently holds for it.
# No parser is named here, so BeautifulSoup picks one from those installed.
soup = BeautifulSoup(driver.page_source)

### Finding Years

In [6]:
# Finding the table on the page (id "years") that lists the college football seasons.
years_table = soup.find('table', {'id': 'years'})

In [7]:
# Obtaining each numerical year.
# Only rows 1 through 36 of the table are considered; any row whose leading
# header cell reads 'Year' is a repeated column header and is left out.
years_list = [
    header_cell.text
    for header_cell in (tr.find('th') for tr in years_table.find_all('tr')[1:37])
    if header_cell.text != 'Year'
]

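This loop collects all of the CFB seasons I would like to collect data from. The intent is to go back to 1985; note that the slice keeps 36 seasons and index 34 is 1985, so the oldest season actually collected is 1984 (see the standings output below).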

In [8]:
# Sanity check: how many seasons were collected.
len(years_list)

36

In [9]:
# Spot-check which season sits at index 34 of the collected list.
years_list[34]

'1985'

In [10]:

# initializing list container for stand table
stand_list = []
# initializing list container for offense table
offense_list = []
# initializing list container for defense table
defense_list = []

# Header names for each table. They stay None when a table is never found,
# in which case the DataFrame is built without explicit column labels.
cols = None
o_cols = None
d_cols = None


def _load_table(page_url, table_id):
    """Open ``page_url`` in the Selenium driver and return the ``<table>`` with id ``table_id``.

    Returns None when the page held by the browser contains no such table.
    """
    driver.get(page_url)
    page_soup = BeautifulSoup(driver.page_source)
    return page_soup.find('table', {'id': table_id})


def _header_names(table):
    """Return the cell texts of the table's second ``<tr>``, followed by "year".

    The second row is used because that is where the column labels are read
    from; one label is produced per direct header cell, so whitespace between
    cells or an empty cell cannot shift the labels out of line with the data.
    Raises IndexError when the table has fewer than two rows.
    """
    header_row = table.find_all('tr')[1]
    names = [cell.get_text() for cell in header_row.find_all(['th', 'td'], recursive=False)]
    names.append("year")
    return names


def _team_rows(table, year):
    """Return one list per data row: leading ``<th>`` text, each ``<td>`` text, then ``year``."""
    records = []
    for tr in table.find_all('tr'):
        cells = tr.find_all('td')
        if not cells:
            # Rows without any <td> (e.g. header rows) carry no team data.
            continue
        rank = tr.find('th')
        # A data row lacking a <th> gets an empty rank so its width still matches the header.
        record = [rank.text if rank is not None else '']
        record.extend(cell.text for cell in cells)
        record.append(year)
        records.append(record)
    return records


start = datetime.now()

try:
    for year in years_list:  # Looping through every collected year, in list order.
        sleep(10)  # pause between years to keep the request rate low
        print(f"scrapping year: {year}")

        # Page addresses for this year: the original url combined with the year.
        standings_link = (url + year + '-standings.html')
        offense_link = (url + year + '-team-offense.html')
        defense_link = (url + year + '-team-defense.html')

        # Standings: header names are re-read every year, so the most recently
        # found header is the one that labels the final DataFrame.
        stand_table = _load_table(standings_link, 'standings')
        if stand_table is not None:
            cols = _header_names(stand_table)
            stand_list.extend(_team_rows(stand_table, year))
            print('got standings data')
        else:
            print("There is no standings table this year")
        sleep(2)

        # Offense: a missing table is reported and the loop carries on, so the
        # defense page for the same year is still visited.
        offense_table = _load_table(offense_link, 'offense')
        if offense_table is not None:
            o_cols = _header_names(offense_table)
            offense_list.extend(_team_rows(offense_table, year))
            print('got offense data')
        else:
            print("There is no offensive table this year")
        sleep(2)

        # Defense
        defense_table = _load_table(defense_link, 'defense')
        if defense_table is not None:
            d_cols = _header_names(defense_table)
            defense_list.extend(_team_rows(defense_table, year))
            print('got defense data')
        else:
            print("There is no defensive table this year")
        sleep(2)
finally:
    # Always close the browser, even when a page fails part-way through the scrape.
    driver.quit()

# creating DataFrames
stand_df = pd.DataFrame(stand_list, columns=cols)
offense_df = pd.DataFrame(offense_list, columns=o_cols)
defense_df = pd.DataFrame(defense_list, columns=d_cols)

stop = datetime.now()
print(f'The total time passed during scrape (hh:mm:ss:ms) is: {stop - start}')

scrapping year: 2019
got standings data
got offense data
got defense data
scrapping year: 2018
got standings data
got offense data
got defense data
scrapping year: 2017
got standings data
got offense data
got defense data
scrapping year: 2016
got standings data
got offense data
got defense data
scrapping year: 2015
got standings data
got offense data
got defense data
scrapping year: 2014
got standings data
got offense data
got defense data
scrapping year: 2013
got standings data
got offense data
got defense data
scrapping year: 2012
got standings data
got offense data
got defense data
scrapping year: 2011
got standings data
got offense data
got defense data
scrapping year: 2010
got standings data
got offense data
got defense data
scrapping year: 2009
got standings data
got offense data
got defense data
scrapping year: 2008
got standings data
got offense data
got defense data
scrapping year: 2007
got standings data
got offense data
got defense data
scrapping year: 2006
got standings dat

In [11]:
# Show the standings frame's (rows, columns), then its last rows as the cell output.
print(stand_df.shape)
stand_df.tail()

(4179, 20)


Unnamed: 0,Rk,School,Conf,W,L,T,Pct,W.1,L.1,T.1,Pct.1,Off,Def,SRS,SOS,AP Pre,AP High,AP Rank,Notes,year
4174,106,Wyoming,WAC,6,6,0,0.5,4,4,0,0.5,27.8,28.5,-2.35,-1.1,,,,,1984
4175,107,San Diego State,WAC,4,7,1,0.375,4,3,1,0.563,22.1,20.8,0.0,1.41,,,,record adjusted to 5-6-1 by NCAA,1984
4176,108,Colorado State,WAC,3,8,0,0.273,3,5,0,0.375,20.9,32.7,-6.65,1.62,,,,,1984
4177,109,New Mexico,WAC,4,8,0,0.333,1,7,0,0.125,20.9,29.9,-8.41,-2.66,,,,,1984
4178,110,UTEP,WAC,2,9,0,0.182,1,7,0,0.125,16.4,34.0,-15.04,-1.5,,,,,1984


In [12]:
# Show the offense frame's (rows, columns), then its last five rows as the cell output.
print(offense_df.shape)
offense_df.tail(5)

(2448, 26)


Unnamed: 0,Rk,School,G,Pts,Cmp,Att,Pct,Yds,TD,Att.1,...,Pass,Rush,Pen,Tot,No.,Yds.1,Fum,Int,Tot.1,year
2443,112,Duke,11,14.1,17.7,36.5,48.6,198.5,0.7,34.1,...,8.7,5.5,2.2,16.5,9.5,74.3,0.5,2.0,2.5,2000
2444,113,Baylor,11,12.6,13.2,32.6,40.4,148.9,0.9,32.7,...,6.3,4.7,3.1,14.1,6.4,56.1,1.2,1.8,3.0,2000
2445,114,Central Michigan,11,12.5,15.3,29.8,51.2,179.1,0.9,35.4,...,7.7,5.3,1.3,14.3,4.9,37.6,1.3,0.6,1.9,2000
2446,115,Kent State,11,11.6,15.8,30.9,51.2,156.4,0.7,36.7,...,7.3,7.0,1.4,15.6,8.0,57.2,0.7,1.1,1.8,2000
2447,116,Louisiana-Monroe,11,8.7,19.7,36.7,53.7,184.8,0.5,29.4,...,7.7,4.5,2.0,14.3,6.5,54.3,1.4,1.3,2.6,2000


In [13]:
# Show the defense frame's (rows, columns), then its last five rows as the cell output.
print(defense_df.shape)
defense_df.tail(5)

(2448, 26)


Unnamed: 0,Rk,School,G,Pts,Cmp,Att,Pct,Yds,TD,Att.1,...,Pass,Rush,Pen,Tot,No.,Yds.1,Fum,Int,TO,year
2443,112,Louisiana-Monroe,11,37.7,14.4,25.4,56.6,189.9,2.1,40.9,...,8.5,8.2,1.3,17.9,7.1,61.5,0.6,0.5,1.1,2000
2444,113,Nevada,12,38.7,15.7,27.8,56.5,201.6,1.3,46.1,...,8.8,12.1,1.9,22.8,7.8,75.5,1.1,1.1,2.2,2000
2445,114,Indiana,11,38.8,21.8,33.7,64.7,270.9,2.3,39.3,...,11.8,11.2,1.5,24.5,7.8,63.5,0.9,0.4,1.3,2000
2446,115,Duke,11,39.1,19.9,31.3,63.7,272.4,1.9,39.9,...,11.9,9.0,1.4,22.3,9.5,83.3,0.6,0.7,1.4,2000
2447,116,Buffalo,11,41.1,14.1,24.6,57.2,216.9,2.2,47.6,...,8.8,11.8,1.5,22.1,7.2,60.8,0.5,1.0,1.5,2000


## Exports

In [14]:
# Write the standings to CSV in the sibling data folder; index=False leaves the row index out of the file.
stand_df.to_csv('../data/college_standings.csv', index=False)

In [15]:
# Write the offense stats to CSV in the sibling data folder; index=False leaves the row index out of the file.
offense_df.to_csv('../data/college_offense_stats.csv', index=False)

In [16]:
# Write the defense stats to CSV in the sibling data folder; index=False leaves the row index out of the file.
defense_df.to_csv('../data/college_defense_stats.csv', index=False)

In [17]:
# container = []

# # Iterating through rows
# for i in stand_table.find_all('tr')[:50]:
#     # Empty team container
#     team = []
    
#     # finding and adding the team ranking to the list
#     rank = i.find('th')
#     team.append(rank.text)
    
#     # Finding rest of the team data with 'td' attribute
#     row = i.find_all('td')
    
#     # Iterating through columns
#     if len(row) == 0:
#         continue
#     else:
#         for a in row:
#             team.append(a.text)
#     container.append(team)