In [1]:
from google.colab import files, drive
import pandas as pd
import io
import os

In [2]:
# Mount Google Drive so the raw CSV folder is reachable under /content/drive
drive.mount('/content/drive')

# Folder on the mounted Drive that holds the raw CSV files
folder_path = '/content/drive/My Drive/CFB_Model/Raw Data/sp+/'

# Collect the CSV file names in sorted order. os.listdir() returns entries in
# arbitrary order, and the insertion order of `dataframes` determines the row
# order when the frames are concatenated later. A dedicated name is used so the
# `files` module imported from google.colab is not overwritten by a list.
csv_file_names = sorted(
    name for name in os.listdir(folder_path) if name.endswith('.csv')
)

# Load every CSV into a dict keyed by its file name
dataframes = {
    name: pd.read_csv(os.path.join(folder_path, name))
    for name in csv_file_names
}

# Show the keys of the loaded DataFrames
print("Loaded dataframes:", dataframes.keys())

Mounted at /content/drive
Loaded dataframes: dict_keys(['2005_sp.csv', '2006_sp.csv', '2007_sp.csv', '2008_sp.csv', '2009_sp.csv', '2010_sp.csv', '2011_sp.csv', '2012_sp.csv', '2013_sp.csv', '2014_sp.csv', '2015_sp.csv', '2016_sp.csv', '2017_sp.csv', '2018_sp.csv', '2019_sp.csv', '2020_sp.csv', '2021_sp.csv', '2022_sp.csv', '2023_sp.csv', '2024_sp.csv'])


In [3]:
# prompt: do all of the dataframes have matching columns, if not which ones are missing/extra in each dataframe

# Column set of each loaded file, plus the two reference sets used below:
#   all_columns    - columns seen in at least one dataframe (union)
#   common_columns - columns present in every dataframe (intersection)
column_sets = {name: set(df.columns) for name, df in dataframes.items()}
all_columns = set().union(*column_sets.values())
common_columns = set.intersection(*column_sets.values()) if column_sets else set()

# NOTE: `year` is the dict key, i.e. the CSV file name (e.g. '2005_sp.csv').
for year, columns in column_sets.items():
  # Missing: columns some other dataframe has that this one lacks.
  missing_cols = all_columns - columns
  # Extra: columns this dataframe has that are not shared by every dataframe.
  # Subtracting the union here would always give an empty set, so the
  # intersection is the reference for "extra".
  extra_cols = columns - common_columns
  if missing_cols:
    print(f"Dataframe for year {year} is missing columns: {missing_cols}")
  if extra_cols:
    print(f"Dataframe for year {year} has extra columns: {extra_cols}")
  if not missing_cols and not extra_cols:
    print(f"Dataframe for year {year} has all the expected columns.")


Dataframe for year 2005_sp.csv has all the expected columns.
Dataframe for year 2006_sp.csv has all the expected columns.
Dataframe for year 2007_sp.csv has all the expected columns.
Dataframe for year 2008_sp.csv has all the expected columns.
Dataframe for year 2009_sp.csv has all the expected columns.
Dataframe for year 2010_sp.csv has all the expected columns.
Dataframe for year 2011_sp.csv has all the expected columns.
Dataframe for year 2012_sp.csv has all the expected columns.
Dataframe for year 2013_sp.csv has all the expected columns.
Dataframe for year 2014_sp.csv has all the expected columns.
Dataframe for year 2015_sp.csv has all the expected columns.
Dataframe for year 2016_sp.csv has all the expected columns.
Dataframe for year 2017_sp.csv has all the expected columns.
Dataframe for year 2018_sp.csv has all the expected columns.
Dataframe for year 2019_sp.csv has all the expected columns.
Dataframe for year 2020_sp.csv has all the expected columns.
Dataframe for year 2021_

In [4]:
# prompt: concatenate all dataframes

# Stack every loaded frame row-wise in dict order; ignore_index=True gives the
# result a fresh 0..n-1 index instead of repeating each file's own row labels.
frames_to_stack = list(dataframes.values())
combined_df = pd.concat(frames_to_stack, ignore_index=True)

# Print the stacked frame
print(combined_df)


      Year              Team         Conference     Rating  Ranking  \
0     2005             Texas                SEC  35.300000      1.0   
1     2005               USC            Big Ten  34.700000      2.0   
2     2005        Ohio State            Big Ten  31.200000      3.0   
3     2005        Penn State            Big Ten  29.300000      4.0   
4     2005             Miami                ACC  26.000000      5.0   
...    ...               ...                ...        ...      ...   
2519  2024        New Mexico      Mountain West -21.100000    131.0   
2520  2024            Temple  American Athletic -21.800000    132.0   
2521  2024             Akron       Mid-American -22.400000    133.0   
2522  2024  Louisiana Monroe           Sun Belt -23.700000    134.0   
2523  2024  nationalAverages                NaN   0.015672      NaN   

      SecondOrderWins    Sos  Offense Ranking  Offense Rating  \
0                11.9  0.825              2.0       47.600000   
1                

In [6]:
# Keep only the rows whose 'Team' value is not the 'nationalAverages' entry.
# The surviving rows keep their existing index labels.
is_real_team = combined_df['Team'].ne('nationalAverages')
combined_df = combined_df.loc[is_real_team]

# Print the filtered frame
print(combined_df)


      Year              Team         Conference  Rating  Ranking  \
0     2005             Texas                SEC    35.3      1.0   
1     2005               USC            Big Ten    34.7      2.0   
2     2005        Ohio State            Big Ten    31.2      3.0   
3     2005        Penn State            Big Ten    29.3      4.0   
4     2005             Miami                ACC    26.0      5.0   
...    ...               ...                ...     ...      ...   
2518  2024    Kennesaw State     Conference USA   -20.9    130.0   
2519  2024        New Mexico      Mountain West   -21.1    131.0   
2520  2024            Temple  American Athletic   -21.8    132.0   
2521  2024             Akron       Mid-American   -22.4    133.0   
2522  2024  Louisiana Monroe           Sun Belt   -23.7    134.0   

      SecondOrderWins    Sos  Offense Ranking  Offense Rating  \
0                11.9  0.825              2.0            47.6   
1                12.2  0.839              1.0        

In [7]:
# prompt: which columns have empty data and what is the range of rows of empty data for each column?

# Map each column containing nulls to (null count, lowest index label, highest
# index label). Columns without any nulls are left out of the mapping.
empty_data_info = {}
for col in combined_df.columns:
  null_mask = combined_df[col].isnull()
  null_labels = combined_df.index[null_mask].tolist()
  if not null_labels:
    continue
  empty_data_info[col] = (len(null_labels), min(null_labels), max(null_labels))

# One summary line per affected column. The range is only the span between the
# first and last null label; nulls need not fill that span completely.
for col, summary in empty_data_info.items():
  count, min_row, max_row = summary
  print(f"Column '{col}' has {count} empty rows (Range: {min_row} - {max_row})")


Column 'SecondOrderWins' has 788 empty rows (Range: 1730 - 2522)
Column 'Sos' has 788 empty rows (Range: 1730 - 2522)
Column 'Offense Success' has 788 empty rows (Range: 1730 - 2522)
Column 'Offense Explosiveness' has 788 empty rows (Range: 1730 - 2522)
Column 'Offense Rushing' has 788 empty rows (Range: 1730 - 2522)
Column 'Offense Passing' has 788 empty rows (Range: 1730 - 2522)
Column 'Offense StandardDowns' has 788 empty rows (Range: 1730 - 2522)
Column 'Offense PassingDowns' has 788 empty rows (Range: 1730 - 2522)
Column 'Offense RunRate' has 788 empty rows (Range: 1730 - 2522)
Column 'Offense Pace' has 788 empty rows (Range: 1730 - 2522)
Column 'Defense Success' has 788 empty rows (Range: 1730 - 2522)
Column 'Defense Explosiveness' has 788 empty rows (Range: 1730 - 2522)
Column 'Defense Rushing' has 788 empty rows (Range: 1730 - 2522)
Column 'Defense Passing' has 788 empty rows (Range: 1730 - 2522)
Column 'Defense StandardDowns' has 788 empty rows (Range: 1730 - 2522)
Column 'Def

In [8]:
# prompt: drop the 'SpecialTeams Rating' column

# Rebind combined_df to a copy without the 'SpecialTeams Rating' column.
# drop() raises KeyError if the column is absent (e.g. when this cell is re-run).
columns_to_remove = ['SpecialTeams Rating']
combined_df = combined_df.drop(columns=columns_to_remove)

# Print the trimmed frame
print(combined_df)


      Year              Team         Conference  Rating  Ranking  \
0     2005             Texas                SEC    35.3      1.0   
1     2005               USC            Big Ten    34.7      2.0   
2     2005        Ohio State            Big Ten    31.2      3.0   
3     2005        Penn State            Big Ten    29.3      4.0   
4     2005             Miami                ACC    26.0      5.0   
...    ...               ...                ...     ...      ...   
2518  2024    Kennesaw State     Conference USA   -20.9    130.0   
2519  2024        New Mexico      Mountain West   -21.1    131.0   
2520  2024            Temple  American Athletic   -21.8    132.0   
2521  2024             Akron       Mid-American   -22.4    133.0   
2522  2024  Louisiana Monroe           Sun Belt   -23.7    134.0   

      SecondOrderWins    Sos  Offense Ranking  Offense Rating  \
0                11.9  0.825              2.0            47.6   
1                12.2  0.839              1.0        

In [9]:
# Output directory on the mounted Drive; note this rebinds `folder_path`,
# which previously pointed at the raw-data folder.
folder_path = '/content/drive/My Drive/CFB_Model/PreProcessed Data/'
os.makedirs(folder_path, exist_ok=True)  # no error if the folder already exists

# Write the combined frame as CSV, leaving the index out of the file
csv_file_path = os.path.join(folder_path, 'sp_2005_2024.csv')
combined_df.to_csv(path_or_buf=csv_file_path, index=False)

# Report where the file was written
print(f"File saved to: {csv_file_path}")

File saved to: /content/drive/My Drive/CFB_Model/PreProcessed Data/sp_2005_2024.csv
