# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

data = pd.read_csv('/Users/nick/Desktop/imputed_data.csv')

# Three groups of raw stat columns (named pass / rush / rec); each group is
# collapsed into a single latent score below.
pass_cols = ['passyds20', 'passtd20', 'passint20', 'passyds19', 'passtd19', 'passint19']
rush_cols = ['rushyds20', 'rushyds19', 'rushtd20', 'rush1st20', 'rushtd19', 'rush1st19']
rec_cols = ['recyds20', 'rectd20', 'rec1st20', 'recyds19', 'rectd19', 'rec1st19']
subset1_data, subset2_data, subset3_data = [
    data[group] for group in (pass_cols, rush_cols, rec_cols)
]

# Standardize every group on its own. fit_transform refits the shared scaler
# on each call, so once this statement finishes the scaler holds the
# statistics of the last group only.
scaler = StandardScaler()
subset1_scaled, subset2_scaled, subset3_scaled = [
    scaler.fit_transform(frame)
    for frame in (subset1_data, subset2_data, subset3_data)
]

# Project each standardized group onto its leading principal components; the
# shared PCA object is likewise refit for every group.
num_components = 3
pca = PCA(n_components=num_components)
subset1_pca, subset2_pca, subset3_pca = [
    pca.fit_transform(matrix)
    for matrix in (subset1_scaled, subset2_scaled, subset3_scaled)
]

# Only the first principal component of each group is kept as its latent score.
for latent_name, projection in (
    ('pass_latent', subset1_pca),
    ('rush_latent', subset2_pca),
    ('rec_latent', subset3_pca),
):
    data[latent_name] = projection[:, 0]

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import warnings

df = pd.read_csv('/Users/nick/Desktop/imputed_data.csv')

# Attach the first principal component of each stat group (computed earlier in
# subset1_pca / subset2_pca / subset3_pca) as a latent predictor column.
for latent_col, scores in zip(
    ('pass_latent', 'rush_latent', 'rec_latent'),
    (subset1_pca, subset2_pca, subset3_pca),
):
    df[latent_col] = scores[:, 0]

# Columns that must never be used as predictors. The list contains the target
# itself ('points21'); the remaining names presumably describe the same or
# later seasons and would leak the outcome -- TODO confirm.
bad_cols = [
    'gp_21', 'points21', 'projected22', 'actual19', 'actual21', 'passyd_21',
    'passtds21', 'int21', 'runyds21', 'runtds21', 'run1st21', 'rec21',
    'recyds21', 'rectds21', 'pass1st21', 'returnyds21', 'returntds21',
    'twopoint21', 'fumble21',
]
# Raw stat columns; they are dropped from X, which keeps the latent columns
# added above instead.
pass_cols = ['passyds20', 'passtd20', 'passint20', 'passyds19', 'passtd19', 'passint19']
rush_cols = ['rushyds20', 'rushyds19', 'rushtd20', 'rush1st20', 'rushtd19', 'rush1st19']
rec_cols = ['recyds20', 'rectd20', 'rec1st20', 'recyds19', 'rectd19', 'rec1st19']

# Target is the 'points21' column; predictors are every remaining column.
y = df['points21']
X = df.drop(columns=[*bad_cols, *pass_cols, *rush_cols, *rec_cols])

# No imputation or NaN dropping happens here: X is used exactly as built above.
# (A KNNImputer(n_neighbors=10) step and a df.dropna() step are currently
# disabled.)

# Standardize every predictor column using statistics from all rows.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# 5-fold cross-validation, shuffled with a fixed seed so folds are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Hyper-parameter grid for the Lasso search.
alphas = [0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0]
max_iters = [100, 500, 1000, 5000, 10000]

# Per-combination bookkeeping, each keyed by the (alpha, max_iter) tuple.
results = dict()                    # average mean squared error
r_squared_mapping = dict()          # average R-squared
coefficients_mapping = dict()       # feature name -> coefficient
selected_features_mapping = dict()  # names of the selected features

# Feature-selection knobs (adjust as needed).
coeff_threshold = 0.01  # coefficient-magnitude cutoff
top_n_features = 4      # how many top features to select

# Fitting emits many warnings; hide everything in the UserWarning category.
# NOTE(review): scikit-learn's ConvergenceWarning derives from UserWarning, so
# this also hides "did not converge" messages -- confirm that is intended.
warnings.filterwarnings("ignore", category=UserWarning)

# Grid search over (alpha, max_iter) for Lasso, scored with k-fold CV.
#
# For every combination this fills four dictionaries keyed by
# (alpha, max_iter):
#   results                   -> mean squared error averaged over the folds
#   r_squared_mapping         -> R-squared averaged over the folds
#   coefficients_mapping      -> {feature name: coefficient averaged over folds}
#   selected_features_mapping -> up to top_n_features feature names, ranked by
#                                absolute averaged coefficient and restricted
#                                to those with magnitude >= coeff_threshold
#
# NOTE(review): max_iter only caps the number of solver iterations, so
# combinations that differ only in max_iter should give the same fit once the
# solver has converged -- confirm it is worth searching over.
feature_names = list(X.columns)

for alpha in alphas:
    for max_iter in max_iters:
        mse_scores = []
        r_squared_scores = []
        fold_coefficients = []

        # k-fold cross-validation
        for train_index, test_index in kf.split(X):
            # Fit the scaler on the training fold only, so the mean/variance
            # of the held-out rows never influence the model that is scored
            # on them (slicing a globally standardized matrix would leak).
            fold_scaler = StandardScaler().fit(X.iloc[train_index])
            X_train = fold_scaler.transform(X.iloc[train_index])
            X_test = fold_scaler.transform(X.iloc[test_index])
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            lasso_model = Lasso(alpha=alpha, max_iter=max_iter)
            lasso_model.fit(X_train, y_train)
            y_pred = lasso_model.predict(X_test)

            mse = mean_squared_error(y_test, y_pred)
            mse_scores.append(mse)

            r_squared = r2_score(y_test, y_pred)
            r_squared_scores.append(r_squared)

            # Keep every fold's coefficients; reporting only the model left
            # over from the final fold would describe 1/k of the evaluation.
            fold_coefficients.append(lasso_model.coef_)

        # average mse and R-squared across folds
        avg_mse = np.mean(mse_scores)
        avg_r_squared = np.mean(r_squared_scores)
        results[(alpha, max_iter)] = avg_mse
        r_squared_mapping[(alpha, max_iter)] = avg_r_squared

        # coefficients averaged across folds, one value per feature
        avg_coefficients = np.mean(fold_coefficients, axis=0)
        coefficients_mapping[(alpha, max_iter)] = dict(
            zip(feature_names, avg_coefficients)
        )

        # Rank features by absolute averaged coefficient, keep the top N, and
        # drop any whose magnitude is below the cutoff so that coefficients
        # Lasso shrank to (near) zero are never reported as selected.
        ranked_features = sorted(
            zip(feature_names, avg_coefficients),
            key=lambda pair: abs(pair[1]),
            reverse=True,
        )
        selected_features = [
            feature
            for feature, coefficient in ranked_features[:top_n_features]
            if abs(coefficient) >= coeff_threshold
        ]
        selected_features_mapping[(alpha, max_iter)] = selected_features

# Pick the (alpha, max_iter) pair with the smallest average mean squared error;
# on a tie the pair that was stored first wins.
best_params, best_mse = min(results.items(), key=lambda entry: entry[1])
best_alpha, best_max_iter = best_params
best_r_squared = r_squared_mapping[best_params]

# Summary of the winning combination.
for line in (
    f'Best Alpha: {best_alpha}',
    f'Best Max Iterations: {best_max_iter}',
    f'Corresponding Mean Squared Error: {best_mse}',
    f'Corresponding R-squared: {best_r_squared}',
):
    print(line)

# Coefficients recorded for the winning combination, one per line.
best_coefficients = coefficients_mapping[best_params]
print("\nCoefficients for the Best Alpha and Max Iterations:")
for name, weight in best_coefficients.items():
    print(f'{name}: {weight}')

# Features recorded as selected for the winning combination.
selected_features = selected_features_mapping[best_params]
print("\nSelected Features for the Best Alpha and Max Iterations:")
print(selected_features)

# Refit Lasso on all standardized rows with the winning alpha / max_iter and
# plot each coefficient against its position (the column order of X_scaled).
lasso_model = Lasso(alpha=best_alpha, max_iter=best_max_iter)
lasso_model.fit(X_scaled, y)

final_coefficients = lasso_model.coef_
positions = range(len(final_coefficients))

# Markers only (no connecting line): neighbouring positions are unrelated.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(positions, final_coefficients, marker='o', linestyle='None')
ax.set_title('Lasso Coefficients with Best Alpha and Max Iterations')
ax.set_xlabel('Coefficient Index')
ax.set_ylabel('Coefficient Value')
plt.show()