In [2]:
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


def lassoFeatures(data, alphas=(0.1, 0.3, 0.5, 0.8, 1.0, 1.5),
                  target_col="rating_label", random_state=42):
    """Fit a Lasso regression and return one coefficient per feature column.

    The rows are split 80/10/10 into training, validation and held-out parts.
    One Lasso model per candidate alpha is fitted on the training part and
    scored by mean squared error (MSE) on the validation part. The alpha with
    the lowest MSE (the earliest one on ties) is then refitted on the
    training + validation rows, and that model's coefficients are returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus the target column. NOTE: the target column is
        popped from ``data`` in place, so after the call ``data.columns``
        lines up one-to-one with the returned coefficients.
        Presumably every remaining column is numeric -- confirm with the
        code that produces the CSV.
    alphas : iterable of float, optional
        Candidate regularisation strengths, tried in the given order.
    target_col : str, optional
        Name of the column to predict.
    random_state : int, optional
        Seed passed to both ``train_test_split`` calls.

    Returns
    -------
    array-like
        ``coef_`` of the final fitted Lasso model.

    Raises
    ------
    ValueError
        If ``alphas`` is empty.
    KeyError
        If ``target_col`` is not a column of ``data``.
    """
    # Validate before touching `data` so a bad call leaves the frame intact.
    alphas = list(alphas)
    if not alphas:
        raise ValueError("alphas must contain at least one value")

    target = data.pop(target_col)

    # 80% training, 20% temporarily held back ...
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        data, target, test_size=0.2, random_state=random_state
    )
    # ... of which half becomes the validation set. The other half (the test
    # portion) is not used by this function, so it is discarded explicitly.
    X_val, _, y_val, _ = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=random_state
    )

    # Score every candidate alpha on the validation set.
    mse_scores = []
    for alpha in alphas:
        candidate = Lasso(alpha=alpha)
        candidate.fit(X_train, y_train)
        mse_scores.append(mean_squared_error(y_val, candidate.predict(X_val)))

    # min() returns the first minimal pair, so ties go to the earliest alpha.
    _, best_alpha = min(zip(mse_scores, alphas), key=lambda pair: pair[0])

    # Refit with the winning alpha on the training and validation rows combined.
    X_train_val = pd.concat([X_train, X_val])
    y_train_val = pd.concat([y_train, y_val])

    lasso = Lasso(alpha=best_alpha)
    lasso.fit(X_train_val, y_train_val)

    return lasso.coef_


def trimFeatures(filename, k = 10):
    """Return the names of the ``k`` features with the largest Lasso coefficients.

    The CSV at ``filename`` is loaded, ``lassoFeatures`` is run on it, and the
    feature columns are ranked by the absolute value of their coefficient,
    largest first. Ties keep their original column order.

    Parameters
    ----------
    filename : str or path-like or file-like
        Anything accepted by ``pandas.read_csv``. The data must contain a
        ``"rating_label"`` column, which is treated as the target.
    k : int, optional
        How many features to select. Applied as a slice, so a value larger
        than the number of features returns all of them.

    Returns
    -------
    pandas.Index
        Names of the selected feature columns, most influential first.

    Raises
    ------
    KeyError
        If the data has no ``"rating_label"`` column.
    """
    data = pd.read_csv(filename)

    # Record the feature names *before* fitting, so the result does not depend
    # on whether lassoFeatures removes the target column from `data` in place.
    feature_names = data.columns.drop("rating_label")

    lasso_coef = lassoFeatures(data)

    # Rank feature positions by coefficient magnitude, largest first.
    coef_magnitudes = abs(lasso_coef)
    feature_ranking = sorted(
        range(len(coef_magnitudes)),
        key=lambda idx: coef_magnitudes[idx],
        reverse=True,
    )

    # Map the top-k positions back to column names.
    selected_features = feature_names[feature_ranking[:k]]

    return selected_features


In [None]:
'''The above code was adapted from code generated by the AI tool ChatGPT.'''