In [13]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.metrics import classification_report

In [14]:
# Directory (relative to the notebook's working directory) that holds the per-season CSV files.
folder_path = 'DATA'

In [15]:
# File names of the per-season CSVs, 1993 through 2023. There is no entry for 2001.
datasets = [f'RB_DATA_{season}.csv' for season in range(1993, 2024) if season != 2001]

In [16]:
# Accumulator for one (numeric features + 'Awards') DataFrame per season; filled by the combining loop further down.
all_data = []

In [17]:
# Accumulator for per-season player names; several later cells append to this same list.
player_names_all = []

In [18]:
# One summary dict per evaluated season (year, top-3 predictions, actual winner, match flag).
results = []

In [7]:
# Per-season experiment: for every CSV, split that season's rows 70/30, train a
# small dense network on the 70%, rank the held-out 30% by predicted probability
# and record whether the held-out winner (Awards == 1) is among the top three.
# Seasons whose winner does not land in the held-out 30% are skipped, so the
# summary only covers a subset of the seasons.
# NOTE(review): every run also appends to the global `player_names_all`.

# Start from an empty summary so re-running the cell does not duplicate rows.
results = []

for dataset in datasets:
    year = dataset.split('_')[-1].split('.')[0]  # 'RB_DATA_1994.csv' -> '1994'
    file_path = os.path.join(folder_path, dataset)
    df = pd.read_csv(file_path)
    
    # Files without an 'Awards' column cannot provide a target; skip them.
    if 'Awards' not in df.columns:
        print(f"Warning: 'Awards' column not found in {dataset}. Skipping this dataset.")
        continue
    
    df['Awards'] = df['Awards'].fillna(0).astype(int)  # blank -> 0, then force integer labels
    
    # Track player names (raw numpy array of this season's 'Player' column)
    player_names_all.append(df['Player'].values)
    
    # Features: every numeric column except the target; 'Player' is dropped explicitly.
    X = df.drop(['Awards', 'Player'], axis=1).select_dtypes(include=['number'])
    y = df['Awards']
    
    # Random 70/30 split within the season (fixed seed for repeatability)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Standardise using statistics of the training rows only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Fresh model per season: two 16-unit ReLU layers and a sigmoid output
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(16, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Train silently for 10 epochs
    model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, verbose=0)

    # Predicted probability for each held-out row
    y_pred_prob = model.predict(X_test_scaled)

    # X_test keeps the original row labels, so they can be used to look names up in df
    test_player_names = df.loc[X_test.index, 'Player'].values

    # Assemble a comparison frame; y_test shares X_test's index, so it aligns row by row
    test_data = X_test.copy()
    test_data['Actual Heisman'] = y_test
    test_data['Player'] = test_player_names
    test_data['Predicted Probability'] = y_pred_prob

    # Three held-out players with the highest predicted probability
    top_3_predicted = test_data.nlargest(3, 'Predicted Probability')[['Player', 'Predicted Probability']]

    # The winner is only present here if the random split placed that row in the test part
    actual_winner = test_data[test_data['Actual Heisman'] == 1]

    if actual_winner.empty:
        print(f"Warning: No actual Heisman winner found for {year}. Skipping this dataset.")
        continue  # Skip this year if there's no actual winner

    # First (normally only) held-out row flagged as winner
    actual_winner_name = actual_winner['Player'].values[0]

    # 'Yes' when the actual winner is one of the three top-ranked players
    predicted_winners = top_3_predicted['Player'].values
    match = 'Yes' if actual_winner_name in predicted_winners else 'No'

    # Store the results for this year
    results.append({
        'Year': year,
        'Predicted Winners': ', '.join(predicted_winners),
        'Actual Winner': actual_winner_name,
        'Match': match
    })

# Convert results to DataFrame for better readability
results_df = pd.DataFrame(results)

# Output the results
print(results_df)


    Year                                  Predicted Winners  \
0   1994           Tony Nibbs, Napoleon Kaufman, Alex Smith   
1   1996         Troy Davis, Antowain Smith, Ricky Williams   
2   1997      Ricky Williams, De'Mond Parker, Michael Black   
3   1998          Ricky Williams, Doug Chapman, Joel Thomas   
4   1999     LaDainian Tomlinson, Ron Dayne, Chester Taylor   
5   2000      LaDainian Tomlinson, William Green, Lee Suggs   
6   2002      LaDainian Tomlinson, William Green, Lee Suggs   
7   2008       Collin Mooney, Dennis Kennedy, Shonn Greene*   
8   2010    Knile Davis*, LaMichael James*, Kendall Hunter*   
9   2011           Montee Ball*, Isi Sofele*, Rex Burkhead*   
10  2013  Andre Williams*, Keenan Reynolds*, Jeremy Lang...   
11  2014     Jarvion Franklin*, Melvin Gordon*, Leon Allen*   
12  2015   Dalvin Cook*, Elijah Hood*, Christian McCaffrey*   
13  2016        Donnel Pumphrey*, Joe Williams*, Ito Smith*   
14  2017          Rashaad Penny*, J.K. Dobbins*, Zach A

In [19]:
# Hold-out-by-season experiment.
# Seasons whose year ends in 0 or 5 are test seasons; every other season is a
# training season. Each test season is scored by a fresh model trained on all
# training seasons seen *earlier in the loop*, and the summary records whether
# the actual winner is among the three highest-probability players.
#
# Changes relative to the earlier version of this cell:
#   * Training seasons contribute ALL of their rows. Previously 20% of each
#     training season was split off and never used, which could throw away the
#     season's only positive (Awards == 1) row.
#   * A test season is skipped when no training season is available yet,
#     instead of being scored by an untrained, randomly initialised model.
#   * The "no actual winner" branch was removed: each season is already
#     required to contain exactly one winner and the test frame holds every row.
# NOTE(review): features are standardised per season, and np.concatenate
# requires every season to expose the same numeric columns - confirm the CSVs agree.

# Lists to store training and testing results
train_results = []
test_results = []

for dataset in datasets:
    # 'RB_DATA_1995.csv' -> '1995'
    year = dataset.split('_')[-1].split('.')[0]
    file_path = os.path.join(folder_path, dataset)
    df = pd.read_csv(file_path)

    # Files without an 'Awards' column cannot provide a target; skip them.
    if 'Awards' not in df.columns:
        print(f"Warning: 'Awards' column not found in {dataset}. Skipping this dataset.")
        continue

    df['Awards'] = df['Awards'].fillna(0).astype(int)  # blank -> 0, integer labels

    # Require exactly one winner per season
    heisman_winners = df[df['Awards'] == 1]
    if len(heisman_winners) != 1:
        print(f"Warning: More than one or no Heisman winner found in {year} dataset!")
        continue  # Skip this dataset if there's no valid Heisman winner

    # Track player names
    player_names_all.append(df['Player'].values)

    # Features: numeric columns only, target and player name removed
    X = df.drop(['Awards', 'Player'], axis=1).select_dtypes(include=['number'])
    y = df['Awards']

    df = df.reset_index(drop=True)

    if year[-1] in ['0', '5']:  # Test season
        if not train_results:
            # Nothing to learn from yet; an untrained model would give meaningless scores.
            print(f"Warning: No training seasons available before {year}. Skipping this dataset.")
            continue

        X_test = X
        y_test = y

        # Standardise within the season, mirroring how training seasons are scaled
        scaler = StandardScaler()
        X_test_scaled = scaler.fit_transform(X_test)

        # Fresh model for this test season
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(16, activation='relu', input_shape=(X_test_scaled.shape[1],)),
            tf.keras.layers.Dense(16, activation='relu'),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])

        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

        # Train on every training season accumulated so far
        X_train_scaled = np.concatenate([result['X_train_scaled'] for result in train_results], axis=0)
        y_train = np.concatenate([result['y_train'] for result in train_results], axis=0)
        model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, verbose=0)

        # Predicted probability for every player of the test season
        y_pred_prob = model.predict(X_test_scaled)

        # Retrieve player names for the test set
        test_player_names = df.loc[X_test.index, 'Player'].values

        # Comparison frame: features + label + name + score
        test_data = X_test.copy()
        test_data['Actual Heisman'] = y_test.values
        test_data['Player'] = test_player_names
        test_data['Predicted Probability'] = y_pred_prob.ravel()  # one score per row

        # Debugging output: first rows of the comparison frame
        print(f"Data for year {year}:")
        print(test_data[['Player', 'Actual Heisman', 'Predicted Probability']].head())

        # Three players with the highest predicted probability
        top_3_predicted = test_data.nlargest(3, 'Predicted Probability')[['Player', 'Predicted Probability']]

        # Exactly one row is flagged (checked above on the full season)
        actual_winner = test_data[test_data['Actual Heisman'] == 1]
        actual_winner_name = actual_winner['Player'].values[0]
        print(f"Actual Heisman Winner for {year}: {actual_winner_name}")

        # 'Yes' when the actual winner is one of the three top-ranked players
        predicted_winners = top_3_predicted['Player'].values
        match = 'Yes' if actual_winner_name in predicted_winners else 'No'

        # Store the test results for this year
        test_results.append({
            'Year': year,
            'Predicted Winners': ', '.join(predicted_winners),
            'Actual Winner': actual_winner_name,
            'Match': match
        })

    else:  # Training season: keep every row
        X_train = X
        y_train = y

        # Standardise within the season
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)

        # Store arrays for later concatenation
        train_results.append({
            'X_train_scaled': X_train_scaled,
            'y_train': y_train.values
        })

# Convert results to DataFrame for better readability
test_results_df = pd.DataFrame(test_results)

# Output the results for test set
print(test_results_df)


Data for year 1995:
                  Player  Actual Heisman  Predicted Probability
0             Troy Davis               0               0.494180
1            Wasean Tait               0               0.461517
2           George Jones               0               0.380221
3           Eddie George               1               0.456592
4  Tshimanga Biakabutuka               0               0.521028
Actual Heisman Winner for 1995: Eddie George
Data for year 2000:
                Player  Actual Heisman  Predicted Probability
0  LaDainian Tomlinson               1               0.477763
1      Damien Anderson               0               0.477763
2      Michael Bennett               0               0.477763
3      Deonce Whitaker               0               0.477763
4       Robert Sanford               0               0.477763
Actual Heisman Winner for 2000: LaDainian Tomlinson
Data for year 2005:
               Player  Actual Heisman  Predicted Probability
0  DeAngelo Williams*     

In [8]:
# Per-season experiment with an 80/20 split: train a small dense network on 80%
# of each season, rank the held-out 20% by predicted probability, and record
# whether the held-out winner is among the top three. Seasons whose single
# winner falls into the training part are skipped.

# Start from an empty summary. Without this reset the cell kept appending to
# whatever an earlier cell (or an earlier run of this cell) left in `results`,
# so `results_df` contained duplicated seasons.
results = []

for dataset in datasets:
    year = dataset.split('_')[-1].split('.')[0]  # 'RB_DATA_1994.csv' -> '1994'
    file_path = os.path.join(folder_path, dataset)
    df = pd.read_csv(file_path)

    # Files without an 'Awards' column cannot provide a target; skip them.
    if 'Awards' not in df.columns:
        print(f"Warning: 'Awards' column not found in {dataset}. Skipping this dataset.")
        continue

    df['Awards'] = df['Awards'].fillna(0).astype(int)  # blank -> 0, integer labels

    # Require exactly one winner per season
    heisman_winners = df[df['Awards'] == 1]
    if len(heisman_winners) != 1:
        print(f"Warning: More than one or no Heisman winner found in {year} dataset!")
        continue  # Skip this dataset if there's no valid Heisman winner

    # Track player names
    player_names_all.append(df['Player'].values)

    # Features: numeric columns only, target and player name removed
    X = df.drop(['Awards', 'Player'], axis=1).select_dtypes(include=['number'])
    y = df['Awards']

    df = df.reset_index(drop=True)

    # Random 80/20 split within the season (fixed seed for repeatability)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardise using statistics of the training rows only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fresh model per season
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(16, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, verbose=0)

    # Predicted probability for each held-out row
    y_pred_prob = model.predict(X_test_scaled)

    # X_test keeps the original row labels, so they can be used to look names up in df
    test_player_names = df.loc[X_test.index, 'Player'].values

    # Comparison frame: features + label + name + score
    test_data = X_test.copy()
    test_data['Actual Heisman'] = y_test.values
    test_data['Player'] = test_player_names
    test_data['Predicted Probability'] = y_pred_prob.ravel()  # one score per row

    # Debugging output: first rows of the comparison frame
    print(f"Data for year {year}:")
    print(test_data[['Player', 'Actual Heisman', 'Predicted Probability']].head())

    # Three held-out players with the highest predicted probability
    top_3_predicted = test_data.nlargest(3, 'Predicted Probability')[['Player', 'Predicted Probability']]

    # The winner is only present if the random split placed that row in the test part
    actual_winner = test_data[test_data['Actual Heisman'] == 1]

    if actual_winner.empty:
        print(f"Warning: No actual Heisman winner found for {year}. Skipping this dataset.")
        continue  # Skip this year if there's no actual winner

    actual_winner_name = actual_winner['Player'].values[0]
    print(f"Actual Heisman Winner for {year}: {actual_winner_name}")

    # 'Yes' when the actual winner is one of the three top-ranked players
    predicted_winners = top_3_predicted['Player'].values
    match = 'Yes' if actual_winner_name in predicted_winners else 'No'

    # Store the results for this year
    results.append({
        'Year': year,
        'Predicted Winners': ', '.join(predicted_winners),
        'Actual Winner': actual_winner_name,
        'Match': match
    })

# Convert results to DataFrame for better readability
results_df = pd.DataFrame(results)

# Output the results
print(results_df)

Data for year 1993:
            Player  Actual Heisman  Predicted Probability
0   LeShon Johnson               0               0.554541
17      John Leach               0               0.291810
15     Mario Bates               0               0.253926
1       Bam Morris               0               0.508798
Data for year 1994:
               Player  Actual Heisman  Predicted Probability
0      Rashaan Salaam               1               0.057956
17         Tony Nibbs               0               0.487921
15  Stephen Whitfield               0               0.419387
1        Brian Pruitt               0               0.166687
Actual Heisman Winner for 1994: Rashaan Salaam
Data for year 1995:
               Player  Actual Heisman  Predicted Probability
0          Troy Davis               0               0.508252
17  Toraino Singleton               0               0.365628
15         Madre Hill               0               0.376002
1         Wasean Tait               0               0.

'RB_DATA_1993.csv'

In [None]:
import pandas as pd

# Two season files to compare side by side; the variable names record which one
# the author considered to load correctly and which one not.
working_file = 'DATA/RB_DATA_1994.csv'
non_working_file = 'DATA/RB_DATA_2018.csv'

df_working = pd.read_csv(working_file)
df_non_working = pd.read_csv(non_working_file)

# Show the heading and the first rows of each file, one after the other.
print("Working CSV:", df_working.head(), sep="\n")
print("Non-Working CSV:", df_non_working.head(), sep="\n")


In [None]:
#print(df.columns)

In [19]:
# Build one combined table (numeric features + 'Awards') across all seasons.
#
# The accumulators are rebuilt from scratch here so the cell is idempotent:
# re-running it no longer duplicates every season, and `player_names_all` ends
# up holding only the ['Player', 'Awards'] frames appended below (other cells
# append raw name arrays to the same list).
all_data = []
player_names_all = []

for dataset in datasets:
    if not dataset.endswith('.csv'):  # Only process CSV files
        continue

    file_path = os.path.join(folder_path, dataset)
    df = pd.read_csv(file_path)

    # Same guard as the per-season cells: a file without a target is unusable.
    if 'Awards' not in df.columns:
        print(f"Warning: 'Awards' column not found in {dataset}. Skipping this dataset.")
        continue

    # Blank awards mean "no award": fill with 0 and force integer labels.
    df['Awards'] = df['Awards'].fillna(0).astype(int)

    # Keep names and labels in the same row order as the feature rows appended below.
    player_names_all.append(df[['Player', 'Awards']])

    # Features: numeric columns only, target and player name removed.
    X = df.drop(['Awards', 'Player'], axis=1).select_dtypes(include=['number'])
    y = df['Awards']

    combined_data = pd.concat([X, y], axis=1)
    all_data.append(combined_data)

# ignore_index=True gives every row a unique label; without it each season
# restarts at 0 and the combined index is full of duplicates.
# NOTE(review): seasons with differing numeric columns will produce NaN cells here.
combined_df = pd.concat(all_data, axis=0, ignore_index=True)

In [None]:
# Split the combined table into features and target.
# `combined_df` is built from numeric feature columns plus 'Awards', so it has
# no 'Player' column; errors='ignore' keeps the drop from raising KeyError
# while still removing 'Player' should it ever be present.
X_all = combined_df.drop(columns=['Awards', 'Player'], errors='ignore')
y_all = combined_df['Awards']  # 'Awards' is the target column

In [7]:
# Random 80/20 split of the combined table; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

In [9]:
# Learn the scaling statistics from the training rows only, then apply them to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Binary classifier (Heisman or not): two 16-unit ReLU layers followed by a sigmoid output.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(16, activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [11]:
# Adam optimiser with binary cross-entropy loss; accuracy is reported during training.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train on the *scaled* training features. Validation and prediction use
# X_test_scaled, so fitting on the raw X_train would train and evaluate the
# network on differently scaled inputs.
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_data=(X_test_scaled, y_test))

In [None]:
# Predicted probabilities thresholded at 0.5 into 0/1 labels.
y_pred = (model.predict(X_test_scaled) > 0.5).astype(int)

In [23]:
# Flatten the per-season 'Player' columns into one list, keeping season and row order.
all_player_names = [
    name
    for item in player_names_all
    for name in item['Player'].values
]

In [24]:
# Names for the test rows, in the same order as X_test.
# X_test is a shuffled sample of the combined table, so taking the first
# len(X_test) names attached the wrong player to every row. Row labels cannot
# be used to look the names up when per-season indices repeat, so the row
# *positions* are recovered by repeating the split on a plain position array:
# with the same size, test_size and random_state, train_test_split selects the
# same rows in the same order. These settings must mirror the split that
# produced X_test.
_, test_positions = train_test_split(
    np.arange(len(all_player_names)), test_size=0.2, random_state=42
)
assert len(test_positions) == len(X_test), (
    "player-name list and combined feature table are out of sync; "
    "rebuild both from the same loop before matching names"
)
test_player_names = [all_player_names[position] for position in test_positions]

In [25]:
# Copy of the test features with the true label, the predicted label and the player name attached.
test_data = X_test.assign(**{
    'Actual Heisman Winner': y_test,
    'Predicted Heisman Winner': y_pred,
    'Player': test_player_names,
})

In [None]:
# Per-class precision / recall / F1 of the thresholded predictions on the test split.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
# Quick look at the first rows of the comparison frame.
print(test_data.head())

In [None]:
import pandas as pd

# Sanity checks on a single season file.
df = pd.read_csv('DATA/RB_DATA_1993.csv')

# First rows
print(df.head())

# Missing values in the two columns the analysis relies on
print(df[['Player', 'Awards']].isna().sum())

# Distribution of the target values
print(df['Awards'].value_counts())

# Number of distinct player names next to the number of rows
print(df['Player'].nunique(), df.shape[0])

# Placeholder prediction column: 1 for one hand-picked player, 0 for everyone else
# (stands in for real model output).
df['Predicted Heisman'] = (df['Player'] == 'Marshall Faulk').astype('int64')

# Name, actual label and placeholder prediction for every row
print(df[['Player', 'Awards', 'Predicted Heisman']])

In [None]:
# Train one model per season on a 70/30 split of that season's rows.
# Every iteration overwrites df, X_test, y_test, scaler and model, so once the
# loop ends only the objects of the last successfully processed file remain.
for dataset in datasets:
    # Build the full file path
    file_path = os.path.join(folder_path, dataset)
    
    # Read the dataset from the file
    df = pd.read_csv(file_path)
    
    # Files without an 'Awards' column cannot provide a target
    if 'Awards' not in df.columns:
        print(f"Warning: 'Awards' column not found in {dataset}. Skipping this dataset.")
        continue  # Skip this dataset if it does not contain the 'Awards' column
    
    # Blank awards become 0, then the column is forced to integer labels
    df['Awards'] = df['Awards'].fillna(0).astype(int)
    
    # Define X (features) and y (target)
    X = df.drop('Awards', axis=1)  # everything except the target
    y = df['Awards']  # The target column
    
    # Keep only numeric columns; this is what removes text columns such as player names
    X = X.select_dtypes(include=['number'])
    
    # Random 70/30 split within the season (fixed seed for repeatability)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Standardise using statistics of the training rows only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fresh model per season
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(16, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')  # Binary classification (Heisman or not)
    ])
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Train silently for 10 epochs
    model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, verbose=0)

# Evaluate the model, split and DataFrame that are currently in the kernel.
# NOTE(review): this code is not inside a per-season loop, so it only ever
# looks at one season - confirm that is intended.
y_pred_prob = model.predict(X_test_scaled)

# Rebuild a readable test frame: features + player name + true label.
test_data = X_test.copy()
test_data['Player'] = df.loc[X_test.index, 'Player']
test_data['Actual Heisman'] = y_test

# Mark exactly one predicted winner: the row with the highest probability.
# (A 0.5-threshold version of this column used to be computed first and was
# immediately overwritten; that dead assignment has been removed.)
test_data['Predicted Heisman'] = 0
max_prob_index = y_pred_prob.argmax()  # position of the highest predicted probability
test_data.iloc[max_prob_index, test_data.columns.get_loc('Predicted Heisman')] = 1

# Step 1: the player predicted to win (Predicted Heisman == 1)
predicted_winner = test_data[test_data['Predicted Heisman'] == 1]

# Step 2: the player who actually won (Actual Heisman == 1); empty when the
# winner's row is not part of this test split
actual_winner = test_data[test_data['Actual Heisman'] == 1]

# Output results
if not predicted_winner.empty and not actual_winner.empty:
    print(f"Predicted Heisman winner: {predicted_winner['Player'].values[0]}")
    print(f"Actual Heisman winner: {actual_winner['Player'].values[0]}")
    if predicted_winner['Player'].values[0] == actual_winner['Player'].values[0]:
        print("The predicted Heisman winner matches the actual winner!")
    else:
        print("The predicted Heisman winner does NOT match the actual winner.")
else:
    print("There was an issue with the prediction or actual Heisman values.")


In [None]:
# Distribution of target values in whichever DataFrame `df` currently refers to.
print(df['Awards'].value_counts())


In [None]:
# Rows flagged as predicted winner and as actual winner in the comparison frame.
predicted_winner = test_data.loc[test_data['Predicted Heisman Winner'].eq(1)]
actual_winner = test_data.loc[test_data['Actual Heisman Winner'].eq(1)]

In [None]:
# Report the first predicted winner against the first actual winner, provided both exist.
if predicted_winner.empty or actual_winner.empty:
    print("No Heisman winner in the test data.")
else:
    predicted_player = predicted_winner['Player'].values[0]
    actual_player = actual_winner['Player'].values[0]
    print(f"Predicted Heisman Winner: {predicted_player}")
    print(f"Actual Heisman Winner: {actual_player}")

    if predicted_player != actual_player:
        print("The predicted Heisman winner does NOT match the actual Heisman winner.")
    else:
        print("The predicted Heisman winner matches the actual Heisman winner!")

In [11]:
# Load every existing season file as a DataFrame with hand-assigned column names
# and append it to `datasets`.
# NOTE(review): `years` is not defined in any cell shown here - confirm where it comes from.
# NOTE(review): this appends DataFrames to `datasets`, which other cells treat as a
# list of file-name strings - running both in one kernel will mix the two kinds.
for year in years:
    file_path = f'DATA/RB_DATA_{year}.csv'  # Construct the file path for each dataset based on the year
    if os.path.exists(file_path):  # Missing seasons are silently skipped
        df = pd.read_csv(file_path, header=None)  # header=None: the file's first line is read as a data row
        # The list repeats 'Yds', 'TD' and 'Y/G', so the frame ends up with duplicate
        # column labels; selecting one of those labels returns several columns.
        columns = ['Rk', 'Player', 'Team', 'Conf', 'G', 'Att', 'Yds', 'Y/A', 'TD', 'Y/G', 
           'Rec', 'Yds', 'Y/R', 'TD', 'Y/G', 'Plays', 'Yds', 'Avg', 'TD', 'Awards', '-9999']
        df.columns = columns
        df = df.drop(index=0)  # drop the row labelled 0 (the first line of the file)
        
        datasets.append(df) 

In [None]:
# Peek at the fifth entry; this only works while `datasets` holds DataFrames rather than file names.
datasets[4].head()

In [None]:
# Assumes `train_datasets` and `test_datasets` already exist as DataFrames.
# NOTE(review): neither name is defined in the cells shown here, and the target is
# spelled 'AWARDS' while the other cells use 'Awards' - confirm both.

# Extract features (X) and target (y) from the training and test frames
X_train = train_datasets.drop('AWARDS', axis=1)  # every column except the 'AWARDS' target
y_train = train_datasets['AWARDS']

X_test = test_datasets.drop('AWARDS', axis=1)
y_test = test_datasets['AWARDS']


In [None]:
def prepare_data(dataset, target_column='Awards'):
    """Load one dataset and return standardised numeric features plus the target.

    Parameters
    ----------
    dataset : str, path-like or pandas.DataFrame
        Path of a CSV file to read, or an already-loaded DataFrame (which is
        left unmodified).
    target_column : str, default 'Awards'
        Name of the label column. Previously this was read from an undefined
        global, so the function raised NameError unless the caller happened to
        define `target_column` beforehand.

    Returns
    -------
    X_scaled : numpy.ndarray
        Numeric feature columns, standardised with statistics computed on this
        dataset alone.
    y : pandas.Series
        The target column, unchanged.

    Raises
    ------
    KeyError
        If `target_column` is not a column of the dataset.
    """
    # Accept either a path or a DataFrame, as the original comment suggested.
    df = dataset if isinstance(dataset, pd.DataFrame) else pd.read_csv(dataset)

    # Separate target from features
    y = df[target_column]
    X = df.drop(target_column, axis=1)

    # Keep only numeric columns (drops text such as player names)
    X = X.select_dtypes(include=['number'])

    # Scale the features; the scaler is fitted on this dataset and not returned,
    # so the same scaling cannot be re-applied to another dataset afterwards.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    return X_scaled, y

In [None]:
# Fit the scaler on the training frame and apply the same scaling to the test frame.
# NOTE(review): the frames are passed whole, so any target column they still
# contain is scaled too; `train_datasets` / `test_datasets` are not defined in
# the cells shown here - confirm.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_datasets)
X_test_scaled = scaler.transform(test_datasets)

In [None]:
# Does the most recently built `file_path` exist?
os.path.exists(file_path)

In [None]:
# Is the file at `file_path` non-empty? (raises OSError if it does not exist)
os.path.getsize(file_path) > 0

In [None]:
# Load 'DATA//RB_DATA_<year>.csv' for each year 1993..2022 (the range end is
# exclusive, so 2023 is not loaded) and append the DataFrame to `datasets`.
# NOTE(review): the file-name list elsewhere in this notebook has no 2001 entry
# but does have 2023 - confirm this range; a missing file makes read_csv raise.
# NOTE(review): this appends DataFrames to `datasets`, which other cells treat as file names.
for i in range(1993, 2023):
    df = pd.read_csv(f'DATA//RB_DATA_{i}.csv')  # Load datasets
    datasets.append(df)

In [None]:
import pandas as pd

# Read the raw file without treating any line as a header, so the file's own
# first line arrives as the row labelled 0.
data = pd.read_csv('athletes_stats.csv', header=None, sep=',', quotechar='"')

# Inspect how the columns were split before renaming anything.
print("First few rows of the data:", data.head())

# Column names to impose, one per column, in file order.
columns = [
    'Rk', 'Player', 'Team', 'Conf', 'G',
    'Att', 'Yds', 'Y/A', 'TD', 'Y/G',
    'Rec', 'Yds', 'Y/R', 'TD', 'Y/G',
    'Plays', 'Yds', 'Avg', 'TD',
    'Awards', '-9999',
]
data.columns = columns

# Remove the row labelled 0 now that the names are set explicitly.
data = data.drop(0, axis=0)

# Confirm the result.
print("Data after fixing columns:", data.head(), sep="\n")



In [None]:
# First rows of `data`.
print(data.head())

In [None]:
# Column labels of `data`.
print(data.columns)


In [None]:
# Column labels of `data`, with a heading.
print("Column names:", data.columns)

In [None]:
# Membership checks against the column labels of `data`; each line prints True or False.
# NOTE(review): the first check tests for an empty-string column label, and neither
# 'year' nor 'votes_received' is among the names assigned to `data` in the loading
# cell - these look like leftovers from a different dataset; confirm.
print("" in data.columns)
print("year" in data.columns)
print("votes_received" in data.columns)


In [None]:
# Build features and target from `data`.
# The previous drop list named every column of the frame, which left X with no
# features at all. Only the text/identifier columns and the target are removed now.
# NOTE(review): '-9999' is treated as a non-feature column - confirm what it holds.
non_feature_columns = ['Player', 'Team', 'Conf', 'Awards', '-9999']

# `data` is read with header=None, so the remaining columns may still be stored
# as strings; convert them to numbers (unparseable cells become NaN).
X = data.drop(columns=non_feature_columns).apply(pd.to_numeric, errors='coerce')
y = data['Awards']  # Target variable