In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import precision_score, recall_score, f1_score

#where the csv files are stored

In [2]:
folder_path = 'DATA'

#all csv files for RBs

In [3]:
datasets = ['RB_DATA_1993.csv', 'RB_DATA_1994.csv', 'RB_DATA_1995.csv', 'RB_DATA_1996.csv', 'RB_DATA_1997.csv', 'RB_DATA_1998.csv', 'RB_DATA_1999.csv', 'RB_DATA_2000.csv', 'RB_DATA_2002.csv', 'RB_DATA_2003.csv', 'RB_DATA_2004.csv', 'RB_DATA_2005.csv', 'RB_DATA_2006.csv', 'RB_DATA_2007.csv', 'RB_DATA_2008.csv', 'RB_DATA_2009.csv','RB_DATA_2010.csv', 'RB_DATA_2011.csv', 'RB_DATA_2012.csv', 'RB_DATA_2013.csv', 'RB_DATA_2014.csv', 'RB_DATA_2015.csv','RB_DATA_2016.csv', 'RB_DATA_2017.csv', 'RB_DATA_2018.csv', 'RB_DATA_2019.csv', 'RB_DATA_2020.csv', 'RB_DATA_2021.csv', 'RB_DATA_2022.csv', 'RB_DATA_2023.csv' ]  # List to store datasets

#list to store player names

In [4]:
player_names_all = []

#Lists to store training and testing results

In [5]:
train_results = []
test_results = []

#Create an empty set to store all unique teams and conferences across datasets

In [6]:
all_teams = set()
all_confs = set()

#collecting a full set of unique teams and conferences to use them for one-hot encoding

In [7]:
# First pass through datasets to collect all unique teams and conferences
for dataset in datasets:
    file_path = os.path.join(folder_path, dataset)
    df = pd.read_csv(file_path)
    
    # Collect all unique teams and conferences
    all_teams.update(df['Team'].unique())
    all_confs.update(df['Conf'].unique())

all_teams = sorted(list(all_teams))
all_confs = sorted(list(all_confs))

#Loop through datasets to process them, lots of code in the loop so it is commented

In [8]:
for dataset in datasets:
    # Extract the year from the dataset name
    year = dataset.split('_')[-1].split('.')[0]
    file_path = os.path.join(folder_path, dataset)
    df = pd.read_csv(file_path)
    
    # Ensure 'Awards' column exists and clean it
    if 'Awards' not in df.columns:
        print(f"Warning: 'Awards' column not found in {dataset}. Skipping this dataset.")
        continue
    
    df['Awards'] = df['Awards'].fillna(0).astype(int)  # Clean 'Awards' column

    # Check for Heisman winner (only one should be present)
    heisman_winners = df[df['Awards'] == 1]
    if len(heisman_winners) != 1:
        print(f"Warning: More than one or no Heisman winner found in {year} dataset!")
        continue  # Skip this dataset if there's no valid Heisman winner
    
    # Track player names
    player_names_all.append(df['Player'].values)
    
    # Prepare features and target
    X = df.drop(['Awards', 'Player'], axis=1).select_dtypes(include=['number'])
    y = df['Awards']

    # One-Hot Encoding for 'Team' and 'Conf', ensuring consistency across datasets
    X_encoded = pd.concat([
        X,
        pd.get_dummies(df['Team'], prefix='Team').reindex(columns=all_teams, fill_value=0),
        pd.get_dummies(df['Conf'], prefix='Conf').reindex(columns=all_confs, fill_value=0)
    ], axis=1)

    # Reset index for consistency
    df = df.reset_index(drop=True)
    
    # Check if this year should be used for testing or training
    if year[-1] in ['0', '5']:  # Testing set for years ending in '0' or '5'
        # Split data into test set
        X_test = X_encoded
        y_test = y

        # Scale the data for testing
        scaler = StandardScaler()
        X_test_scaled = scaler.fit_transform(X_test)
        
        # Build the model
        model = tf.keras.Sequential([ 
            tf.keras.layers.Dense(128, activation='relu', input_shape=(X_test_scaled.shape[1],)),  # Increased complexity
            tf.keras.layers.Dropout(0.2),  # Add dropout to prevent overfitting
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dropout(0.2),  # Add dropout here as well
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dense(1, activation='sigmoid') 
        ])

        # Compile the model with adjusted learning rate and class weight
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),  # Adjust learning rate
                    loss='binary_crossentropy',
                    metrics=['accuracy'])
        
        # Train the model on the training set from previous years
        if len(train_results) > 0:
            X_train_scaled = np.concatenate([result['X_train_scaled'] for result in train_results], axis=0)
            y_train = np.concatenate([result['y_train'] for result in train_results], axis=0)
            
            model.fit(X_train_scaled, y_train, epochs=30, batch_size=32, verbose=0)

            # Predict on the test set and get probabilities
            y_pred_prob = model.predict(X_test_scaled)

            # Debugging: Print out the predicted probabilities to check for improvement
            print(f"Predicted probabilities for year {year}:")
            print(y_pred_prob[:10])  # Display first 10 predictions
                    
        # Retrieve player names for the test set
        test_player_names = df.loc[X_test.index, 'Player'].values

        # Prepare the test data for comparison
        test_data = X_test.copy()
        test_data['Actual Heisman'] = y_test.values
        test_data['Player'] = test_player_names
        test_data['Predicted Probability'] = y_pred_prob

        print(f"Data for year {year}:")
        print(test_data[['Player', 'Actual Heisman', 'Predicted Probability']].head()) 

        # Sort by predicted probability to get the top 3 predicted winners
        top_3_predicted = test_data.nlargest(3, 'Predicted Probability')[['Player', 'Predicted Probability']]

        # Get player statistics for the top 3 by looking them up in the original dataset
        top_3_stats = df[df['Player'].isin(top_3_predicted['Player'].values)][[
            'Player', 'Team', 'Conf', 'G', 'Att', 'Yds', 'Y/A', 'TD', 'Y/G', 'Rec', 'Yds.1', 
            'Y/R', 'TD.1', 'Y/G.1', 'Plays', 'Yds.2', 'Avg', 'TD.2'
        ]]

        # Add a custom identifier with format 'player-name-id'
        top_3_stats['-9999'] = top_3_stats['Player'].str.lower().str.replace(' ', '-')
        
        # Save the top 3 players and their stats for this year in the same format as the original CSV
        output_file_path = os.path.join('DATA', f'top_3_players_{year}.csv')
        top_3_stats.to_csv(output_file_path, index=False)

        # Check if there is at least one actual Heisman winner
        actual_winner = test_data[test_data['Actual Heisman'] == 1]
        
        if actual_winner.empty:
            print(f"Warning: No actual Heisman winner found for {year}. Skipping this dataset.")
            continue  # Skip this year if there's no actual winner

        # If there's an actual winner, get the name
        actual_winner_name = actual_winner['Player'].values[0]
        print(f"Actual Heisman Winner for {year}: {actual_winner_name}")  

        # Check if one of the predicted winners matches the actual winner
        predicted_winners = top_3_predicted['Player'].values
        match = 'Yes' if actual_winner_name in predicted_winners else 'No'

        # Store the test results for this year
        test_results.append({
            'Year': year,
            'Predicted Winners': ', '.join(predicted_winners),
            'Actual Winner': actual_winner_name,
            'Match': match
        })

    else:  # Training set for all other years
        # Use the entire dataset as the training set
        X_train = X_encoded
        y_train = y
        
        # Scale the data for training
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)

        # Store training results for later use
        train_results.append({
            'X_train_scaled': X_train_scaled,
            'y_train': y_train
        })
        
test_results_df = pd.DataFrame(test_results)

#printing top 3 from each test year
print(test_results_df)

Predicted probabilities for year 1995:
[[0.25778514]
 [0.25778514]
 [0.25778514]
 [0.25778514]
 [0.25778514]
 [0.25778514]
 [0.25778514]
 [0.25778514]
 [0.25778514]
 [0.25778514]]
Data for year 1995:
                  Player  Actual Heisman  Predicted Probability
0             Troy Davis               0               0.257785
1            Wasean Tait               0               0.257785
2           George Jones               0               0.257785
3           Eddie George               1               0.257785
4  Tshimanga Biakabutuka               0               0.257785
Actual Heisman Winner for 1995: Eddie George
Predicted probabilities for year 2000:
[[0.05365385]
 [0.05365385]
 [0.05365385]
 [0.05365385]
 [0.05365385]
 [0.05365385]
 [0.05365385]
 [0.05365385]
 [0.05365385]
 [0.05365385]]
Data for year 2000:
                Player  Actual Heisman  Predicted Probability
0  LaDainian Tomlinson               1               0.053654
1      Damien Anderson               0         