In [25]:
from google.colab import files, drive
import pandas as pd
import io
import os

In [26]:
# Mount Google Drive
drive.mount('/content/drive')

# Define the folder path within Google Drive
folder_path = '/content/drive/My Drive/CFB_Model/Raw Data/schedule/'

# List all files in the folder
files = os.listdir(folder_path)

# Load all CSV files within the folder
dataframes = {}

for file_name in files:
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        dataframes[file_name] = pd.read_csv(file_path)

# Show the keys of the loaded DataFrames
print("Loaded dataframes:", dataframes.keys())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded dataframes: dict_keys(['2005_schedule.csv', '2006_schedule.csv', '2007_schedule.csv', '2008_schedule.csv', '2009_schedule.csv', '2010_schedule.csv', '2011_schedule.csv', '2012_schedule.csv', '2013_schedule.csv', '2014_schedule.csv', '2015_schedule.csv', '2016_schedule.csv', '2017_schedule.csv', '2018_schedule.csv', '2019_schedule.csv', '2020_schedule.csv', '2021_schedule.csv', '2022_schedule.csv', '2023_schedule.csv', '2024_schedule.csv'])


In [27]:
# prompt: do all dataframes have matching columns? If not, which ones have different columns and what are the missing/extra columns?

# Get a set of all unique columns across all DataFrames
all_columns = set()
for df in dataframes.values():
  all_columns.update(df.columns)

# Check each DataFrame for missing or extra columns
for year, df in dataframes.items():
  missing_cols = all_columns - set(df.columns)
  extra_cols = set(df.columns) - all_columns
  if missing_cols or extra_cols:
    print(f"DataFrame for year {year} has discrepancies:")
    if missing_cols:
      print(f"  Missing columns: {', '.join(missing_cols)}")
    if extra_cols:
      print(f"  Extra columns: {', '.join(extra_cols)}")
  else:
    print(f"DataFrame for year {year} has all expected columns.")


DataFrame for year 2005_schedule.csv has discrepancies:
  Missing columns: Away Line Scores, Home Line Scores
DataFrame for year 2006_schedule.csv has discrepancies:
  Missing columns: Away Line Scores, Home Line Scores
DataFrame for year 2007_schedule.csv has discrepancies:
  Missing columns: Away Line Scores, Home Line Scores
DataFrame for year 2008_schedule.csv has discrepancies:
  Missing columns: Away Line Scores, Home Line Scores
DataFrame for year 2009_schedule.csv has discrepancies:
  Missing columns: Away Line Scores, Home Line Scores
DataFrame for year 2010_schedule.csv has discrepancies:
  Missing columns: Away Line Scores, Home Line Scores
DataFrame for year 2011_schedule.csv has discrepancies:
  Missing columns: Away Line Scores, Home Line Scores
DataFrame for year 2012_schedule.csv has discrepancies:
  Missing columns: Away Line Scores, Home Line Scores
DataFrame for year 2013_schedule.csv has discrepancies:
  Missing columns: Away Line Scores, Home Line Scores
DataFrame 

In [28]:
# Columns to remove
columns_to_remove = ["Home Line Scores[3]", "Home Line Scores[2]", "Home Line Scores[1]",
                     "Home Line Scores[4]", "Home Line Scores[0]", "Away Line Scores[3]",
                     "Away Line Scores[1]", "Away Line Scores[4]", "Away Line Scores[0]",
                     "Away Line Scores[2]", "Away Line Scores", "Home Line Scores"]

# Remove columns from each DataFrame
for year, df in dataframes.items():
  for col in columns_to_remove:
    if col in df.columns:
      df.drop(col, axis=1, inplace=True)


In [29]:
# prompt: do all dataframes have matching columns? If not, which ones have different columns and what are the missing/extra columns?

# Get a set of all unique columns across all DataFrames
all_columns = set()
for df in dataframes.values():
  all_columns.update(df.columns)

# Check each DataFrame for missing or extra columns
for year, df in dataframes.items():
  missing_cols = all_columns - set(df.columns)
  extra_cols = set(df.columns) - all_columns
  if missing_cols or extra_cols:
    print(f"DataFrame for year {year} has discrepancies:")
    if missing_cols:
      print(f"  Missing columns: {', '.join(missing_cols)}")
    if extra_cols:
      print(f"  Extra columns: {', '.join(extra_cols)}")
  else:
    print(f"DataFrame for year {year} has all expected columns.")


DataFrame for year 2005_schedule.csv has all expected columns.
DataFrame for year 2006_schedule.csv has all expected columns.
DataFrame for year 2007_schedule.csv has all expected columns.
DataFrame for year 2008_schedule.csv has all expected columns.
DataFrame for year 2009_schedule.csv has all expected columns.
DataFrame for year 2010_schedule.csv has all expected columns.
DataFrame for year 2011_schedule.csv has all expected columns.
DataFrame for year 2012_schedule.csv has all expected columns.
DataFrame for year 2013_schedule.csv has all expected columns.
DataFrame for year 2014_schedule.csv has all expected columns.
DataFrame for year 2015_schedule.csv has all expected columns.
DataFrame for year 2016_schedule.csv has all expected columns.
DataFrame for year 2017_schedule.csv has all expected columns.
DataFrame for year 2018_schedule.csv has all expected columns.
DataFrame for year 2019_schedule.csv has all expected columns.
DataFrame for year 2020_schedule.csv has all expected c

In [30]:
# prompt: please concatenate all dataframes into a single dataframe

# Concatenate all DataFrames into a single DataFrame
all_years_df = pd.concat(dataframes.values(), ignore_index=True)
print(all_years_df.head())


          Id  Season  Week Season Type                Start Date  \
0  252440276    2005     1     regular  2005-09-01T23:00:00.000Z   
1  252442649    2005     1     regular  2005-09-01T23:00:00.000Z   
2  252440248    2005     1     regular  2005-09-01T23:00:00.000Z   
3  252442132    2005     1     regular  2005-09-01T23:00:00.000Z   
4  252440154    2005     1     regular  2005-09-01T23:00:00.000Z   

  Start Time Tbd  Completed  Neutral Site Conference Game  Attendance  ...  \
0          False       True         False           False     25102.0  ...   
1          False       True         False           False     20092.0  ...   
2            NaN       True         False           False         0.0  ...   
3            NaN       True         False           False         0.0  ...   
4            NaN       True         False           False         0.0  ...   

          Away Team Away Conference  Away Division Away Points  \
0    William & Mary     Atlantic 10            fcs      

In [31]:
# prompt: please list all the columns in the dataframe

# List all columns in the concatenated DataFrame
print(all_years_df.columns)


Index(['Id', 'Season', 'Week', 'Season Type', 'Start Date', 'Start Time Tbd',
       'Completed', 'Neutral Site', 'Conference Game', 'Attendance',
       'Venue Id', 'Venue', 'Home Id', 'Home Team', 'Home Conference',
       'Home Division', 'Home Points', 'Home Post Win Prob',
       'Home Pregame Elo', 'Home Postgame Elo', 'Away Id', 'Away Team',
       'Away Conference', 'Away Division', 'Away Points', 'Away Post Win Prob',
       'Away Pregame Elo', 'Away Postgame Elo', 'Excitement Index',
       'Highlights', 'Notes'],
      dtype='object')


In [32]:
# prompt: please drop the following columns: ['Id',  'Season Type', 'Start Date', 'Start Time Tbd',
#        'Completed', 'Attendance',
#        'Venue Id', 'Venue', 'Home Id',
#        'Home Division', 'Home Post Win Prob', 'Home Postgame Elo', 'Away Id', 'Away Division', 'Away Post Win Prob', 'Away Postgame Elo', 'Excitement Index',
#        'Highlights', 'Notes'

# Columns to drop
columns_to_drop = ['Id',  'Season Type', 'Start Date', 'Start Time Tbd',
       'Completed', 'Attendance',
       'Venue Id', 'Venue', 'Home Id',
       'Home Division', 'Home Post Win Prob', 'Home Postgame Elo', 'Away Id', 'Away Division', 'Away Post Win Prob', 'Away Postgame Elo', 'Excitement Index',
       'Highlights', 'Notes']

# Drop the columns from the concatenated DataFrame
all_years_df.drop(columns_to_drop, axis=1, inplace=True)

# Verify that the columns have been dropped
print(all_years_df.columns)


Index(['Season', 'Week', 'Neutral Site', 'Conference Game', 'Home Team',
       'Home Conference', 'Home Points', 'Home Pregame Elo', 'Away Team',
       'Away Conference', 'Away Points', 'Away Pregame Elo'],
      dtype='object')


In [33]:
# prompt: which columns have empty cells in them

# Check for empty cells in each column
empty_cells_by_column = all_years_df.isnull().sum()

# Print the results
print(empty_cells_by_column)


Season                 0
Week                   0
Neutral Site           0
Conference Game        2
Home Team              0
Home Conference        0
Home Points          865
Home Pregame Elo    2279
Away Team              0
Away Conference       10
Away Points          865
Away Pregame Elo    2578
dtype: int64


In [34]:
# prompt: which games are missing the conference game data? which games are missing the away conference? tell me the year, home team, away team, and week

# Filter the DataFrame for rows where 'Home Conference' is missing
missing_home_conf = all_years_df[all_years_df['Home Conference'].isnull()]

# Print the relevant columns for the games with missing 'Home Conference'
print(missing_home_conf[['Season', 'Week', 'Home Team', 'Away Team']])

# Filter the DataFrame for rows where 'Away Conference' is missing
missing_away_conf = all_years_df[all_years_df['Away Conference'].isnull()]

# Print the relevant columns for the games with missing 'Away Conference'
print(missing_away_conf[['Season', 'Week', 'Home Team', 'Away Team']])


Empty DataFrame
Columns: [Season, Week, Home Team, Away Team]
Index: []
      Season  Week         Home Team                    Away Team
1546    2007     2  Western Kentucky           West Virginia Tech
4711    2011     3          NC State                South Alabama
4772    2011     4        Kent State                South Alabama
5403    2012     2    UT San Antonio           Texas A&M-Commerce
5541    2012     4    UT San Antonio  Northwestern Oklahoma State
6173    2013     1     East Carolina                 Old Dominion
6226    2013     2          Maryland                 Old Dominion
6575    2013     8        Pittsburgh                 Old Dominion
6732    2013    11             Idaho                 Old Dominion
6811    2013    13    North Carolina                 Old Dominion


In [35]:
# prompt: for any team that is missing an away conference, please insert "FCS"

# Fill missing 'Away Conference' values with "FCS"
all_years_df['Away Conference'].fillna("FCS", inplace=True)

# Verify the changes
missing_away_conf = all_years_df[all_years_df['Away Conference'].isnull()]
print(missing_away_conf[['Season', 'Week', 'Home Team', 'Away Team']])


Empty DataFrame
Columns: [Season, Week, Home Team, Away Team]
Index: []


In [36]:
# prompt: which columns are missing data?

# Check for empty cells in each column
empty_cells_by_column = all_years_df.isnull().sum()

# Print the results
print(empty_cells_by_column[empty_cells_by_column > 0])


Conference Game        2
Home Points          865
Home Pregame Elo    2279
Away Points          865
Away Pregame Elo    2578
dtype: int64


In [37]:
# prompt: for any game missing the 'Conference Game', insert false

# Fill missing 'Conference Game' values with False
all_years_df['Conference Game'].fillna(False, inplace=True)

# Verify the changes
missing_conf_game = all_years_df[all_years_df['Conference Game'].isnull()]
print(missing_conf_game[['Season', 'Week', 'Home Team', 'Away Team']])


Empty DataFrame
Columns: [Season, Week, Home Team, Away Team]
Index: []


In [38]:
# prompt: which columns are missing data?

# Check for empty cells in each column
empty_cells_by_column = all_years_df.isnull().sum()

# Print the results
print(empty_cells_by_column[empty_cells_by_column > 0])

Home Points          865
Home Pregame Elo    2279
Away Points          865
Away Pregame Elo    2578
dtype: int64


In [39]:
# prompt: how many games are missing home and away points that arent from the 2024 year, and what games are they

# Filter the DataFrame for games missing points and not from 2024
missing_points_not_2024 = all_years_df[
    (all_years_df['Home Points'].isnull() | all_years_df['Away Points'].isnull()) &
    (all_years_df['Season'] != 2024)
]

# Count the number of such games
num_missing_points = missing_points_not_2024.shape[0]

# Print the number of games
print(f"Number of games missing points (excluding 2024): {num_missing_points}")

# Print the details of the games
print(missing_points_not_2024[['Season', 'Week', 'Home Team', 'Away Team']])


Number of games missing points (excluding 2024): 0
Empty DataFrame
Columns: [Season, Week, Home Team, Away Team]
Index: []


In [40]:
# prompt: please list any team that is missing a pregame elo and what seasons they have missing pregame elo stats in

# Filter for rows where either Home Pregame Elo or Away Pregame Elo is missing
missing_elo_df = all_years_df[all_years_df['Home Pregame Elo'].isnull() | all_years_df['Away Pregame Elo'].isnull()]

# Get unique team names with missing Elo (both home and away)
teams_missing_elo = set(missing_elo_df['Home Team'].unique()) | set(missing_elo_df['Away Team'].unique())

# For each team, find the seasons with missing Elo
for team in teams_missing_elo:
  seasons_missing_elo = missing_elo_df[(missing_elo_df['Home Team'] == team) | (missing_elo_df['Away Team'] == team)]['Season'].unique()
  print(f"Team: {team}, Seasons with missing Elo: {seasons_missing_elo}")


Team: Louisiana Tech, Seasons with missing Elo: [2006 2007 2008 2009 2010 2011 2013 2014 2015 2016 2017 2018 2019 2020
 2021 2022 2023 2024]
Team: UAB, Seasons with missing Elo: [2005 2007 2008 2012 2013 2014 2017 2018 2019 2020 2021 2022 2023 2024]
Team: Colorado State, Seasons with missing Elo: [2006 2008 2009 2011 2012 2013 2014 2015 2016 2017 2018 2019 2021 2022
 2023 2024]
Team: Central Michigan, Seasons with missing Elo: [2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2021
 2022 2023 2024]
Team: North Dakota, Seasons with missing Elo: [2009 2010 2011 2012 2014 2015 2016 2017 2018 2021 2022 2023 2024]
Team: Northern Illinois, Seasons with missing Elo: [2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2019
 2021 2022 2023 2024]
Team: San Diego State, Seasons with missing Elo: [2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019
 2021 2022 2023 2024]
Team: Nevada, Seasons with missing Elo: [2007 2008 2010 2012 2013 2014 2015 2016 2

In [41]:
# prompt: drop the home pregame elo and the away pregame elo columns from the dataframe

# Drop the 'Home Pregame Elo' and 'Away Pregame Elo' columns
all_years_df.drop(['Home Pregame Elo', 'Away Pregame Elo'], axis=1, inplace=True)

# Verify that the columns have been dropped
print(all_years_df.columns)


Index(['Season', 'Week', 'Neutral Site', 'Conference Game', 'Home Team',
       'Home Conference', 'Home Points', 'Away Team', 'Away Conference',
       'Away Points'],
      dtype='object')


In [42]:
# prompt: which columns are missing data?

# Check for empty cells in each column
empty_cells_by_column = all_years_df.isnull().sum()

# Print the results
print(empty_cells_by_column[empty_cells_by_column > 0])

Home Points    865
Away Points    865
dtype: int64


In [43]:
# prompt: rename the Season column to Year

all_years_df.rename(columns={'Season': 'Year'}, inplace=True)

# Verify the change
print(all_years_df.columns)


Index(['Year', 'Week', 'Neutral Site', 'Conference Game', 'Home Team',
       'Home Conference', 'Home Points', 'Away Team', 'Away Conference',
       'Away Points'],
      dtype='object')


In [44]:
# prompt: Does "San José State" exist anywhere in the dataframe?

# Check if "San José State" exists in any column
san_jose_exists = all_years_df.apply(lambda x: x.astype(str).str.contains('San José State')).any().any()

# Print the result
print(f"'San José State' exists in the DataFrame: {san_jose_exists}")


'San José State' exists in the DataFrame: True


In [45]:
# prompt: Change every instance of "San José State" to "San Jose State"

# Replace "San José State" with "San Jose State" in all columns
all_years_df.replace("San José State", "San Jose State", inplace=True)

# Check if "San José State" still exists
san_jose_exists = all_years_df.apply(lambda x: x.astype(str).str.contains('San José State')).any().any()

# Print the result
print(f"'San José State' exists in the DataFrame: {san_jose_exists}")


'San José State' exists in the DataFrame: False


In [46]:
def add_bye_and_consecutive_columns(df):
    # Sort by Team, Year, and Week for correct ordering
    df = df.sort_values(by=['Home Team', 'Year', 'Week']).reset_index(drop=True)

    # Initialize columns
    df['Home Coming Off Bye'] = False
    df['Away Coming Off Bye'] = False
    df['Home Consecutive Games'] = 0
    df['Away Consecutive Games'] = 0
    df['Home Consecutive Away Games'] = 0
    df['Away Consecutive Away Games'] = 0

    # For each team, calculate the number of consecutive games played and bye week status
    for team_col in ['Home Team', 'Away Team']:
        bye_col = f'{team_col.split()[0]} Coming Off Bye'
        cons_games_col = f'{team_col.split()[0]} Consecutive Games'
        cons_away_col = f'{team_col.split()[0]} Consecutive Away Games'

        # Track the last game week and count consecutive games
        last_week = {}
        consecutive_games = {}
        consecutive_away_games = {}

        for idx, row in df.iterrows():
            team = row[team_col]
            year = row['Year']
            week = row['Week']
            is_away = (team_col == 'Away Team')

            # Initialize if first encounter
            if team not in last_week:
                last_week[team] = week - 1  # To start counting from week 1
                consecutive_games[team] = 0
                consecutive_away_games[team] = 0

            # Check if the team had a bye week
            if week - last_week[team] > 1:
                df.at[idx, bye_col] = True
                consecutive_games[team] = 0  # Reset consecutive games count

            # Increment consecutive games
            consecutive_games[team] += 1
            df.at[idx, cons_games_col] = consecutive_games[team]

            # Handle consecutive away games
            if is_away:
                consecutive_away_games[team] += 1
            else:
                consecutive_away_games[team] = 0  # Reset if not an away game

            df.at[idx, cons_away_col] = consecutive_away_games[team]

            # Update the last week
            last_week[team] = week

    return df

# Apply the function to your dataframe
all_years_df = add_bye_and_consecutive_columns(all_years_df)

In [48]:
# prompt: are there any NaN values in all_years_df? if so, what columns have NaN values?

# Check for NaN values in the DataFrame
nan_values_exist = all_years_df.isnull().values.any()

if nan_values_exist:
  # Get columns with NaN values
  nan_columns = all_years_df.columns[all_years_df.isnull().any()].tolist()
  print("Columns with NaN values:", nan_columns)
else:
  print("No NaN values found in the DataFrame.")


Columns with NaN values: ['Home Points', 'Away Points']


In [47]:
# Define the folder path in Google Drive where the file will be saved
folder_path = '/content/drive/My Drive/CFB_Model/PreProcessed Data/'

# Create the folder if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Save the CSV file directly to Google Drive
csv_file_path = os.path.join(folder_path, 'schedule_2005_2024.csv')
all_years_df.to_csv(csv_file_path, index=False)

print(f"File saved to: {csv_file_path}")

File saved to: /content/drive/My Drive/CFB_Model/PreProcessed Data/schedule_2005_2024.csv
