In [1]:
# prompt: i need to import a csv file from my computer with college football recruiting data from 2001 to 2024

from google.colab import files, drive
import pandas as pd
import io
import os

In [2]:
# Mount Google Drive
drive.mount('/content/drive')

# Define the folder path within Google Drive
folder_path = '/content/drive/My Drive/CFB_Model/Raw Data/coaching/'

# List all files in the folder
files = os.listdir(folder_path)

# Load all CSV files within the folder
dataframes = {}

for file_name in files:
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        dataframes[file_name] = pd.read_csv(file_path)

# Show the keys of the loaded DataFrames
print("Loaded dataframes:", dataframes.keys())

Mounted at /content/drive
Loaded dataframes: dict_keys(['coaching_2001_2025.csv'])


In [3]:
# prompt: concatenate all dataframes

# Concatenate all DataFrames into a single DataFrame
df = pd.concat(dataframes.values(), ignore_index=True)

# Display the combined DataFrame
print(df)


              School  Year  Games  Wins  Losses  Ties  Preseason Rank  \
0             Temple  2011     13     9       4     0             NaN   
1             Temple  2012     11     4       7     0             NaN   
2     Boston College  2013     13     7       6     0             NaN   
3     Boston College  2014     13     7       6     0             NaN   
4     Boston College  2015     12     3       9     0             NaN   
...              ...   ...    ...   ...     ...   ...             ...   
2970        Illinois  2007     13     9       4     0             NaN   
2971        Illinois  2008     12     5       7     0            20.0   
2972        Illinois  2009     12     3       9     0             NaN   
2973        Illinois  2010     13     7       6     0             NaN   
2974        Illinois  2011     12     6       6     0             NaN   

      Postseason Rank  Srs  Sp Overall  Sp Offense  Sp Defense First Name  \
0                 NaN  5.4         4.6        

In [4]:
# Rename 'Team' to 'School' to match your column naming preference
df.rename(columns={'School': 'Team'}, inplace=True)

# Assuming df is your DataFrame
df_filtered = df[['Team', 'Year', 'Wins', 'Losses', 'First Name', 'Last Name']]

# Check the result
print(df_filtered.head())

             Team  Year  Wins  Losses First Name Last Name
0          Temple  2011     9       4      Steve   Addazio
1          Temple  2012     4       7      Steve   Addazio
2  Boston College  2013     7       6      Steve   Addazio
3  Boston College  2014     7       6      Steve   Addazio
4  Boston College  2015     3       9      Steve   Addazio


In [5]:
# 1. Calculate the Win Percentage for Each Team
df_filtered['Win Percentage'] = df_filtered['Wins'] / (df_filtered['Wins'] + df_filtered['Losses'])

# 2. Calculate the Average Win Percentage for Each Coach
coach_win_percentage = df_filtered.groupby(['First Name', 'Last Name'])['Win Percentage'].mean().reset_index()

# Rename columns for clarity
coach_win_percentage.columns = ['First Name', 'Last Name', 'Coach Win Percentage']

print(coach_win_percentage)

    First Name   Last Name  Coach Win Percentage
0           Al      Golden              0.500092
1           Al        Groh              0.522283
2         Alan       Gooch              0.000000
3         Alex      Golesh              0.538462
4         Alex      Miller              0.000000
..         ...         ...                   ...
494       Will    Muschamp              0.508307
495     Willie       Fritz              0.563919
496     Willie     Taggart              0.466963
497      Woody  Widenhofer              0.181818
498       Zach      Arnett              0.700000

[499 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Win Percentage'] = df_filtered['Wins'] / (df_filtered['Wins'] + df_filtered['Losses'])


In [6]:
# Merge the average win percentage back into the original dataframe
df_filtered = pd.merge(df_filtered, coach_win_percentage, on=['First Name', 'Last Name'])

In [7]:
df_filtered.head()

Unnamed: 0,Team,Year,Wins,Losses,First Name,Last Name,Win Percentage,Coach Win Percentage
0,Temple,2011,9,4,Steve,Addazio,0.692308,0.458466
1,Temple,2012,4,7,Steve,Addazio,0.363636,0.458466
2,Boston College,2013,7,6,Steve,Addazio,0.538462,0.458466
3,Boston College,2014,7,6,Steve,Addazio,0.538462,0.458466
4,Boston College,2015,3,9,Steve,Addazio,0.25,0.458466


In [8]:
# Extract unique teams and years
teams = df_filtered['Team'].unique()
years = df_filtered['Year'].unique()

In [9]:
# Create a DataFrame with all combinations of teams and years
complete_grid = pd.MultiIndex.from_product([teams, years], names=['Team', 'Year']).to_frame(index=False)

In [10]:
# Merge the complete grid with the original dataframe
df_complete = pd.merge(complete_grid, df_filtered, on=['Team', 'Year'], how='left')

# Fill missing values if needed
# You can fill missing values with NaN, zero, or another appropriate value
# df_complete.fillna(value={'Wins': 0, 'Losses': 0, 'First Name': 'Unknown', 'Last Name': 'Unknown'}, inplace=True)

print(df_complete)

               Team  Year  Wins  Losses First Name Last Name  Win Percentage  \
0            Temple  2011   9.0     4.0      Steve   Addazio        0.692308   
1            Temple  2012   4.0     7.0      Steve   Addazio        0.363636   
2            Temple  2013   2.0    10.0       Matt     Rhule        0.166667   
3            Temple  2014   6.0     6.0       Matt     Rhule        0.500000   
4            Temple  2015  10.0     4.0       Matt     Rhule        0.714286   
...             ...   ...   ...     ...        ...       ...             ...   
3227  SOUTH FLORIDA  2006   NaN     NaN        NaN       NaN             NaN   
3228  SOUTH FLORIDA  2007   NaN     NaN        NaN       NaN             NaN   
3229  SOUTH FLORIDA  2008   NaN     NaN        NaN       NaN             NaN   
3230  SOUTH FLORIDA  2009   NaN     NaN        NaN       NaN             NaN   
3231  SOUTH FLORIDA  2010   NaN     NaN        NaN       NaN             NaN   

      Coach Win Percentage  
0         

In [11]:
# Get unique teams and sort them alphabetically
unique_teams_sorted = sorted(df_filtered['Team'].unique())

# Print the sorted list of teams
print("List of all teams (sorted alphabetically):")
for team in unique_teams_sorted:
    print(team)

List of all teams (sorted alphabetically):
Air Force
Akron
Alabama
Appalachian State
Arizona
Arizona State
Arkansas
Arkansas State
Army
Auburn
BYU
Ball State
Baylor
Boise State
Boston College
Bowling Green
Buffalo
California
Central Michigan
Charlotte
Cincinnati
Clemson
Coastal Carolina
Colorado
Colorado State
Connecticut
Duke
East Carolina
Eastern Michigan
Florida
Florida Atlantic
Florida International
Florida State
Fresno State
Georgia
Georgia Southern
Georgia State
Georgia Tech
Hawai'i
Houston
Idaho
Illinois
Indiana
Iowa
Iowa State
Jacksonville State
James Madison
Kansas
Kansas State
Kent State
Kentucky
LSU
Liberty
Louisiana
Louisiana Monroe
Louisiana Tech
Louisville
Marshall
Maryland
Memphis
Miami
Miami (OH)
Michigan
Michigan State
Middle Tennessee
Minnesota
Mississippi State
Missouri
NC State
Navy
Nebraska
Nevada
New Mexico
New Mexico State
North Carolina
North Texas
Northern Illinois
Northwestern
Notre Dame
Ohio
Ohio State
Oklahoma
Oklahoma State
Old Dominion
Ole Miss
Oregon
Oreg

In [12]:
# List of all years
all_years = range(df_filtered['Year'].min(), df_filtered['Year'].max() + 1)

# Get unique teams already in the dataset
existing_teams = df_filtered['Team'].unique()

# Define the missing team
missing_team = "Kennesaw State"

# Create a DataFrame for the missing team with all years
missing_team_df = pd.DataFrame({
    'Team': missing_team,
    'Year': list(all_years),
    'Wins': [None] * len(all_years),
    'Losses': [None] * len(all_years),
    'First Name': [None] * len(all_years),
    'Last Name': [None] * len(all_years)
})

# Append the missing team data to the original DataFrame
df_filtered_updated = pd.concat([df_filtered, missing_team_df], ignore_index=True)

# Print to confirm
print("Updated DataFrame with missing team added:")
print(df_filtered_updated.head())

Updated DataFrame with missing team added:
             Team  Year Wins Losses First Name Last Name  Win Percentage  \
0          Temple  2011    9      4      Steve   Addazio        0.692308   
1          Temple  2012    4      7      Steve   Addazio        0.363636   
2  Boston College  2013    7      6      Steve   Addazio        0.538462   
3  Boston College  2014    7      6      Steve   Addazio        0.538462   
4  Boston College  2015    3      9      Steve   Addazio        0.250000   

   Coach Win Percentage  
0              0.458466  
1              0.458466  
2              0.458466  
3              0.458466  
4              0.458466  


In [13]:
# Define the range of years
start_year = df_filtered['Year'].min()
end_year = df_filtered['Year'].max()

# Get unique teams
teams = df_filtered['Team'].unique()

# Create a DataFrame with all team-year combinations
all_years = list(range(start_year, end_year + 1))
all_combinations = pd.MultiIndex.from_product([teams, all_years], names=['Team', 'Year']).to_frame(index=False)

In [14]:
# Ensure 'Year' column is in integer format for consistency
df_filtered['Year'] = df_filtered['Year'].astype(int)

# Merge with the existing DataFrame to ensure all teams have an entry for each year
df_complete = pd.merge(all_combinations, df_filtered, on=['Team', 'Year'], how='left')

In [15]:
# Drop the Wins, Losses, and Win Percentage columns
df_complete = df_complete.drop(columns=['Wins', 'Losses', 'Win Percentage'])

# Fill Coach Win Percentage with 0.50 where either First Name or Last Name is missing
df_complete['Coach Win Percentage'] = df_complete.apply(
    lambda row: 0.50 if pd.isnull(row['First Name']) or pd.isnull(row['Last Name']) else row['Coach Win Percentage'],
    axis=1
)

In [16]:
df_complete.head()

Unnamed: 0,Team,Year,First Name,Last Name,Coach Win Percentage
0,Temple,2001,Bobby,Wallace,0.192424
1,Temple,2002,Bobby,Wallace,0.192424
2,Temple,2003,Bobby,Wallace,0.192424
3,Temple,2004,Bobby,Wallace,0.192424
4,Temple,2005,Bobby,Wallace,0.192424


In [18]:
# Assuming df_coaching is your DataFrame with coaching data
# Filter data to get the last year for each team
last_year_data = df_complete[df_complete['Year'] == 2023]

# Create a new DataFrame for 2024 entries
new_entries_2024 = last_year_data.copy()
new_entries_2024['Year'] = 2024

# Append the new entries to the original DataFrame
df_complete = pd.concat([df_complete, new_entries_2024], ignore_index=True)

# Display the updated DataFrame to verify
print(df_complete.head())
print(df_complete.tail())

     Team  Year First Name Last Name  Coach Win Percentage
0  Temple  2001      Bobby   Wallace              0.192424
1  Temple  2002      Bobby   Wallace              0.192424
2  Temple  2003      Bobby   Wallace              0.192424
3  Temple  2004      Bobby   Wallace              0.192424
4  Temple  2005      Bobby   Wallace              0.192424
                    Team  Year First Name  Last Name  Coach Win Percentage
3373        Old Dominion  2024      Ricky      Rahne              0.391026
3374             Georgia  2024      Kirby      Smart              0.848352
3375            Oklahoma  2024      Brent   Venables              0.615385
3376  Jacksonville State  2024       Rich  Rodriguez              0.588980
3377       SOUTH FLORIDA  2024        NaN        NaN              0.500000


In [19]:
# Ensure DataFrame is sorted by Team and Year
df_complete = df_complete.sort_values(by=['Team', 'Year'])

# Initialize the Coach Tenure column
df_complete['Coach Tenure'] = 1

# Function to calculate coach tenure
def calculate_coach_tenure(df):
    # Fill NaN values in coach names to handle missing coaches
    df = df.fillna({'First Name': 'NoCoach', 'Last Name': 'NoCoach'})

    # Calculate tenure
    df['Coach Tenure'] = df.groupby(['Team', 'First Name', 'Last Name']).cumcount() + 1
    return df

# Apply the function to the entire DataFrame
df_complete = df_complete.groupby('Team').apply(calculate_coach_tenure).reset_index(drop=True)

# Ensure teams with no coach get a tenure of 1
df_complete.loc[df_complete['First Name'] == 'NoCoach', 'Coach Tenure'] = 1

print(df_complete.head())

        Team  Year First Name Last Name  Coach Win Percentage  Coach Tenure
0  Air Force  2001     Fisher   DeBerry              0.475039             1
1  Air Force  2002     Fisher   DeBerry              0.475039             2
2  Air Force  2003     Fisher   DeBerry              0.475039             3
3  Air Force  2004     Fisher   DeBerry              0.475039             4
4  Air Force  2005     Fisher   DeBerry              0.475039             5


In [20]:
# Drop the First Name and Last Name columns
df_complete = df_complete.drop(columns=['First Name', 'Last Name'])

# Display the updated DataFrame
print(df_complete.head())

        Team  Year  Coach Win Percentage  Coach Tenure
0  Air Force  2001              0.475039             1
1  Air Force  2002              0.475039             2
2  Air Force  2003              0.475039             3
3  Air Force  2004              0.475039             4
4  Air Force  2005              0.475039             5


In [21]:
# Filter the DataFrame to keep only rows where the Year is 2005 or later
df_filtered = df_complete[df_complete['Year'] >= 2005]

# Display the updated DataFrame
print(df_filtered.head())

        Team  Year  Coach Win Percentage  Coach Tenure
4  Air Force  2005              0.475039             5
5  Air Force  2006              0.475039             6
6  Air Force  2007              0.606012             1
7  Air Force  2008              0.606012             2
8  Air Force  2009              0.606012             3


In [22]:
# Define the folder path in Google Drive where the file will be saved
folder_path = '/content/drive/My Drive/CFB_Model/PreProcessed Data/'

# Create the folder if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Save the CSV file directly to Google Drive
csv_file_path = os.path.join(folder_path, 'coaching_2005_2024.csv')
df_filtered.to_csv(csv_file_path, index=False)

print(f"File saved to: {csv_file_path}")

File saved to: /content/drive/My Drive/CFB_Model/PreProcessed Data/coaching_2005_2024.csv
