In [1]:
# prompt: i need to import a csv file from my computer with college football recruiting data from 2001 to 2024

from google.colab import files, drive
import pandas as pd
import io
import os

In [2]:
# Mount Google Drive
drive.mount('/content/drive')

# Define the folder path within Google Drive
folder_path = '/content/drive/My Drive/CFB_Model/Raw Data/recruiting/'

# List all files in the folder
files = os.listdir(folder_path)

# Load all CSV files within the folder
dataframes = {}

for file_name in files:
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        dataframes[file_name] = pd.read_csv(file_path)

# Show the keys of the loaded DataFrames
print("Loaded dataframes:", dataframes.keys())

Mounted at /content/drive
Loaded dataframes: dict_keys(['2001_recruiting.csv', '2002_recruiting.csv', '2003_recruiting.csv', '2004_recruiting.csv', '2005_recruiting.csv', '2006_recruiting.csv', '2007_recruiting.csv', '2008_recruiting.csv', '2009_recruiting.csv', '2010_recruiting.csv', '2011_recruiting.csv', '2012_recruiting.csv', '2013_recruiting.csv', '2014_recruiting.csv', '2015_recruiting.csv', '2016_recruiting.csv', '2017_recruiting.csv', '2018_recruiting.csv', '2019_recruiting.csv', '2020_recruiting.csv', '2021_recruiting.csv', '2022_recruiting.csv', '2023_recruiting.csv', '2024_recruiting.csv'])


In [20]:
# prompt: concatenate all dataframes

# Concatenate all DataFrames into a single DataFrame
df = pd.concat(dataframes.values(), ignore_index=True)

# Display the combined DataFrame
print(df)


      Year  Rank                Team  Points
0     2001     1       Florida State  256.86
1     2001     2                 LSU  206.53
2     2001     3            Michigan  204.24
3     2001     4           Tennessee  184.16
4     2001     5            Oklahoma  155.82
...    ...   ...                 ...     ...
4029  2024   196             Harvard   13.44
4030  2024   197           Villanova   12.50
4031  2024   198  Grand Valley State   11.70
4032  2024   200   Northern Colorado   11.06
4033  2024   201          New Mexico   10.22

[4034 rows x 4 columns]


In [21]:
# Step 1: Identify unique teams and years in the dataset
unique_teams = df['Team'].unique()
unique_years = df['Year'].unique()

# Step 2: Create a complete grid of teams and years
teams_years_grid = pd.MultiIndex.from_product([unique_teams, range(min(unique_years), max(unique_years)+1)], names=['Team', 'Year'])

# Step 3: Reindex the original DataFrame using the complete grid
df_full = df.set_index(['Team', 'Year']).reindex(teams_years_grid).reset_index()

# Now df_full will have a row for each team for every year in the range, with NaNs where data was missing
print(df_full)

               Team  Year   Rank  Points
0     Florida State  2001    1.0  256.86
1     Florida State  2002    2.0  303.14
2     Florida State  2003   15.0  203.95
3     Florida State  2004    1.0  292.97
4     Florida State  2005    2.0  281.52
...             ...   ...    ...     ...
6115      Utah Tech  2020    NaN     NaN
6116      Utah Tech  2021    NaN     NaN
6117      Utah Tech  2022    NaN     NaN
6118      Utah Tech  2023  150.0   24.55
6119      Utah Tech  2024  138.0   56.67

[6120 rows x 4 columns]


In [22]:
# Forward fill followed by backward fill, resetting the index to align with the original DataFrame
df_full['Points'] = df_full.groupby('Team')['Points'].ffill().bfill().reset_index(drop=True)

# Interpolation for missing values within the range, ensuring index alignment
#df_full['Points'] = df_full.groupby('Team')['Points'].apply(lambda x: x.interpolate(method='linear')).reset_index(drop=True)

# Rolling window with centered mean for smoothing, ensuring index alignment
#df_full['Points'] = df_full.groupby('Team')['Points'].apply(lambda x: x.fillna(x.rolling(window=3, center=True, min_periods=1).mean())).reset_index(drop=True)

In [23]:
# Forward fill followed by backward fill, resetting the index to align with the original DataFrame
df_full['Rank'] = df_full.groupby('Team')['Rank'].ffill().bfill().reset_index(drop=True)

# Interpolation for missing values within the range, ensuring index alignment
#df_full['Rank'] = df_full.groupby('Team')['Rank'].apply(lambda x: x.interpolate(method='linear')).reset_index(drop=True)

# Rolling window with centered mean for smoothing, ensuring index alignment
#df_full['Rank'] = df_full.groupby('Team')['Rank'].apply(lambda x: x.fillna(x.rolling(window=3, center=True, min_periods=1).mean())).reset_index(drop=True)

In [24]:
# Assuming your DataFrame has columns like 'Team', 'Year', and 'Points'
df_full['Composite Rating'] = df_full.groupby('Team')['Points'].rolling(4).sum().reset_index(0, drop=True)
print(df)


      Year  Rank                Team  Points
0     2001     1       Florida State  256.86
1     2001     2                 LSU  206.53
2     2001     3            Michigan  204.24
3     2001     4           Tennessee  184.16
4     2001     5            Oklahoma  155.82
...    ...   ...                 ...     ...
4029  2024   196             Harvard   13.44
4030  2024   197           Villanova   12.50
4031  2024   198  Grand Valley State   11.70
4032  2024   200   Northern Colorado   11.06
4033  2024   201          New Mexico   10.22

[4034 rows x 4 columns]


In [25]:
df_full['Average Rank'] = df_full.groupby('Team')['Rank'].rolling(4).mean().reset_index(0, drop=True)
print(df_full)


               Team  Year   Rank  Points  Composite Rating  Average Rank
0     Florida State  2001    1.0  256.86               NaN           NaN
1     Florida State  2002    2.0  303.14               NaN           NaN
2     Florida State  2003   15.0  203.95               NaN           NaN
3     Florida State  2004    1.0  292.97           1056.92          4.75
4     Florida State  2005    2.0  281.52           1081.58          5.00
...             ...   ...    ...     ...               ...           ...
6115      Utah Tech  2020  150.0   24.55             98.20        150.00
6116      Utah Tech  2021  150.0   24.55             98.20        150.00
6117      Utah Tech  2022  150.0   24.55             98.20        150.00
6118      Utah Tech  2023  150.0   24.55             98.20        150.00
6119      Utah Tech  2024  138.0   56.67            130.32        147.00

[6120 rows x 6 columns]


In [26]:
# prompt: what are the values for each team in 2024 sorted by Composite Rating

df_2024 = df_full[df_full['Year'] == 2024].sort_values('Composite Rating', ascending=False)
print(df_2024[['Team', 'Composite Rating']])


                     Team  Composite Rating
671               Alabama           1288.91
239               Georgia           1245.62
143            Ohio State           1188.55
743             Texas A&M           1132.51
263                 Texas           1119.36
...                   ...               ...
5951  Central Connecticut             26.68
5759               Marist             25.20
5903       Morehead State             23.20
4031               Wagner             19.88
5975                Drake              3.88

[255 rows x 2 columns]


In [29]:
# Drop data before 2005
df = df_full[df_full['Year'] >= 2005]
print(df)


               Team  Year   Rank  Points  Composite Rating  Average Rank
4     Florida State  2005    2.0  281.52           1081.58          5.00
5     Florida State  2006    4.0  286.53           1064.97          5.50
6     Florida State  2007   20.0  216.82           1077.84          6.75
7     Florida State  2008   12.0  259.98           1044.85          9.50
8     Florida State  2009   11.0  259.63           1022.96         11.75
...             ...   ...    ...     ...               ...           ...
6115      Utah Tech  2020  150.0   24.55             98.20        150.00
6116      Utah Tech  2021  150.0   24.55             98.20        150.00
6117      Utah Tech  2022  150.0   24.55             98.20        150.00
6118      Utah Tech  2023  150.0   24.55             98.20        150.00
6119      Utah Tech  2024  138.0   56.67            130.32        147.00

[5100 rows x 6 columns]


In [30]:
# Define the folder path in Google Drive where the file will be saved
folder_path = '/content/drive/My Drive/CFB_Model/PreProcessed Data/'

# Create the folder if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Save the CSV file directly to Google Drive
csv_file_path = os.path.join(folder_path, 'recruiting_2005_2024.csv')
df.to_csv(csv_file_path, index=False)

print(f"File saved to: {csv_file_path}")

File saved to: /content/drive/My Drive/CFB_Model/PreProcessed Data/recruiting_2005_2024.csv
