## Imports

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from time import sleep
from datetime import datetime

from selenium import webdriver

# Sports Reference College Football

# Selenium Solution

In [None]:
# Index page of the site; the per-season page URLs used later are built by
# appending the season and a page suffix to this string.
url = 'https://www.sports-reference.com/cfb/years/'

In [None]:
# Start a Chrome session driven by the chromedriver binary at this relative path.
# NOTE(review): the path points at a macOS build of chromedriver, and the
# `executable_path` argument is deprecated in Selenium 4 in favour of a
# Service object - confirm the installed Selenium version before changing it.
driver = webdriver.Chrome(executable_path="./chromedriver/macos/chromedriver")

In [None]:
# Load the index page in the Selenium-controlled browser.
driver.get(url)

In [None]:
# Parse the HTML of the initial page (the one stored in the "url" variable).
# NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
# installed - consider passing one explicitly for reproducible results.
soup = BeautifulSoup(driver.page_source)

### Finding Years

In [None]:
# Find the table on the page (id="years") that holds the college football
# (CFB) seasons.
years_table = soup.find('table', {'id': 'years'})

In [None]:
# Build the list of season labels from the header cell (<th>) of table rows
# 1 through 36, leaving out any row whose header cell reads 'Year'.
year_cells = (tr.find('th') for tr in years_table.find_all('tr')[1:37])
years_list = [cell.text for cell in year_cells if cell.text != 'Year']

This loop collects every college football (CFB) season that I want to gather statistics for. The list goes back to 1985.
* Seasons before 1996 include ties in the statistics. The overtime (OT) rules changed in 1996: multiple OT periods are now played until a winner is decided, which eliminated ties as an outcome.
* Because of this, there are 2 extra columns for 1995 and earlier seasons. I decided to scrape 2 separate sets of data to keep stats from ending up in the wrong columns.

In [None]:
# Sanity check: how many season labels were collected.
len(years_list)

In [None]:
# Spot-check which season sits at index 24 of the list.
years_list[24]

In [None]:

# Accumulators for the parsed rows of each table type, across all seasons.
stand_list = []
offense_list = []
defense_list = []

# Header names taken from the most recently scraped table of each type.
# Bound up front so building the DataFrames below cannot raise NameError
# when a table type was never found.
cols = None
o_cols = None
d_cols = None


def _scrape_table(soup, table_id, year):
    """Extract the header names and data rows of one stats table.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page that may contain the table.
    table_id : str
        Value of the ``id`` attribute of the ``<table>`` to look for.
    year : str
        Season label appended as the last value of every data row.

    Returns
    -------
    tuple or None
        ``(header, rows)``: ``header`` is the list of column names followed
        by ``"year"``; ``rows`` is a list of lists, one per table row that
        has ``<td>`` cells. ``None`` when the page has no table with that
        id or the table has fewer than two ``<tr>`` rows.
    """
    table = soup.find('table', {'id': table_id})
    if table is None:
        return None

    table_rows = table.find_all('tr')
    if len(table_rows) < 2:
        return None

    # Column names come from the second <tr> (presumably the first one is a
    # grouping header - verify against the page). One name is taken per
    # header cell so an empty cell still yields a column, and get_text()
    # returns plain strings rather than objects tied to the parse tree.
    header = [
        cell.get_text()
        for cell in table_rows[1].find_all(['th', 'td'], recursive=False)
    ]
    header.append("year")

    rows = []
    for tr in table_rows:
        cells = tr.find_all('td')
        if not cells:
            # Rows without <td> cells (e.g. header rows) carry no team data.
            continue

        # The leading <th> holds the rank; fall back to an empty string so a
        # row without one keeps its columns aligned instead of crashing.
        rank = tr.find('th')
        record = [rank.text if rank is not None else '']
        record.extend(cell.text for cell in cells)
        record.append(year)
        rows.append(record)

    return header, rows


start = datetime.now()

try:
    # years_list[19:] skips the first 19 collected seasons and scrapes the rest.
    # NOTE(review): the export file names say 1995-1985 and index 24 is
    # inspected above - confirm that 19 is the intended starting index.
    for year in years_list[19:]:
        sleep(10)  # pause before starting each season's requests
        print(f"scrapping year: {year}")

        # Season-specific pages: the index url plus the year and a page suffix.
        standings_link = url + year + '-standings.html'
        offense_link = url + year + '-team-offense.html'
        defense_link = url + year + '-team-defense.html'

        # Standings
        driver.get(standings_link)
        stand_soup = BeautifulSoup(driver.page_source)
        scraped = _scrape_table(stand_soup, 'standings', year)
        if scraped is not None:
            cols, season_rows = scraped
            stand_list.extend(season_rows)
            print('got standings data')
        else:
            print("There is no standings table this year")
        sleep(2)

        # Offense. A missing table no longer skips the defense scrape below.
        driver.get(offense_link)
        offense_soup = BeautifulSoup(driver.page_source)
        scraped = _scrape_table(offense_soup, 'offense', year)
        if scraped is not None:
            o_cols, season_rows = scraped
            offense_list.extend(season_rows)
            print('got offense data')
        else:
            print("There is no offensive table this year")
        sleep(2)

        # Defense
        driver.get(defense_link)
        defense_soup = BeautifulSoup(driver.page_source)
        scraped = _scrape_table(defense_soup, 'defense', year)
        if scraped is not None:
            d_cols, season_rows = scraped
            defense_list.extend(season_rows)
            print('got defense data')
        else:
            print("There is no defensive table this year")
        sleep(2)

    # creating DataFrames
    stand_df = pd.DataFrame(stand_list, columns=cols)
    offense_df = pd.DataFrame(offense_list, columns=o_cols)
    defense_df = pd.DataFrame(defense_list, columns=d_cols)

    stop = datetime.now()
    print(f'The total time passed during scrape (hh:mm:ss:ms) is: {stop - start}')
finally:
    # Always close the browser, even when a request or parse step fails.
    driver.quit()

In [None]:
# Show the (rows, columns) shape of the standings data, then display it.
print(stand_df.shape)
stand_df

In [None]:
# Show the (rows, columns) shape of the offense data, then display it.
print(offense_df.shape)
offense_df

In [None]:
# Display the scraped offense column names before they are renamed.
offense_df.columns

In [None]:
# Replacement column names for the offense data, in table order. The scraped
# header repeats short labels, so each name is qualified with a prefix.
# Lines are grouped by apparent stat category.
o_columns = [
    'off_Rk', 'School', 'G', 'Pts',
    'pass_cmp', 'pass_Att', 'cmp_Pct', 'pass_Yds', 'pass_TD',
    'rush_Att', 'rush_Yds', 'rush_Avg', 'rush_TD',
    'Plays', 'Yds', 'Avg_Yds',
    '1st_down_Pass', '1st_down_Rush', '1st_down_Pen', '1st_down_Tot',
    'pen_No.', 'pen_Yds',
    'Fum', 'Int', 'TO_Tot',
    'year',
]

# Positional rename: the list length must match the DataFrame's column count.
offense_df.columns = o_columns

# Display the renamed columns to confirm the assignment.
offense_df.columns

In [None]:
# Show the (rows, columns) shape of the defense data, then display it.
print(defense_df.shape)
defense_df

In [None]:
# Display the scraped defense column names before they are renamed.
defense_df.columns

In [None]:
# Replacement column names for the defense data, in table order. Opponent
# stats carry an `opp_` prefix so they stay distinct from the offense names.
# Fix: the passing-yards column was misspelled 'opp_ass_Yds'; it is now
# 'opp_pass_Yds', matching 'opp_pass_Att' / 'opp_pass_TD'. Any downstream
# code that read the old spelling needs the same rename.
d_columns = [
    'def_Rk', 'School', 'G', 'opp_Pts',
    'opp_cmp', 'opp_pass_Att', 'opp_cmp_Pct', 'opp_pass_Yds', 'opp_pass_TD',
    'opp_rush_Att', 'opp_rush_Yds', 'opp_rush_Avg', 'opp_rush_TD',
    'opp_Plays', 'opp_Yds', 'opp_Avg_Yds',
    'opp_1st_down_Pass', 'opp_1st_down_Rush', 'opp_1st_down_Pen', 'opp_1st_down_Tot',
    'opp_pen_No.', 'opp_pen_Yds',
    'opp_Fum', 'opp_Int', 'opp_TO_Tot',
    'year',
]

# Positional rename: the list length must match the DataFrame's column count.
defense_df.columns = d_columns

# Display the renamed columns to confirm the assignment.
defense_df.columns

## Exports

In [None]:
# Write the standings data to CSV, leaving out the DataFrame index.
stand_df.to_csv('../data/college_standings_1995_1985.csv', index=False)

In [None]:
# Write the offense data to CSV, leaving out the DataFrame index.
offense_df.to_csv('../data/college_offense_stats_1995_1985.csv', index=False)

In [None]:
# Write the defense data to CSV, leaving out the DataFrame index.
defense_df.to_csv('../data/college_defense_stats_1995_1985.csv', index=False)