In [1]:
# 03_model_prep.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

def load_preprocessed_data():
    """Load the preprocessed data and return the dataframe"""
    data_path = Path('../data/processed/preprocessed_data.csv')
    if not data_path.exists():
        raise FileNotFoundError("Preprocessed data not found. Please run 01_data_preprocessing first.")
    
    df = pd.read_csv(data_path)
    print(f"Loaded data with shape: {df.shape}")
    return df

class DepressionPredictor:
    def __init__(self, df, test_size=0.2, random_state=42):
        """
        Initialize the depression predictor
        
        Parameters:
        -----------
        df : pandas.DataFrame
            Preprocessed dataframe containing both features and target
        test_size : float
            Proportion of dataset to include in the test split
        random_state : int
            Random state for reproducibility
        """
        self.test_size = test_size
        self.random_state = random_state
        self.scaler = StandardScaler()
        
        # Separate features and target
        self.X = df.drop(['src_subject_id', 'depression_score'], axis=1)
        self.y = df['depression_score']
        
        # Split data
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=random_state
        )
        
        # Scale features
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)
        
        # Initialize models
        self.lasso_model = None
        self.rf_model = None
        
    def train_lasso(self, cv=5):
        """
        Train logistic regression with LASSO regularization
        
        Parameters:
        -----------
        cv : int
            Number of cross-validation folds
        """
        # Define parameter grid for LASSO
        param_grid = {
            'C': np.logspace(-4, 4, 20),
            'penalty': ['l1'],
            'solver': ['liblinear'],
            'max_iter': [1000]
        }
        
        # Initialize model
        base_model = LogisticRegression(random_state=self.random_state)
        
        # Perform grid search with cross-validation
        self.lasso_model = GridSearchCV(
            base_model, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1
        )
        
        # Fit model
        self.lasso_model.fit(self.X_train_scaled, self.y_train.values.ravel())
        
        # Print results
        print("\nLASSO Logistic Regression Results:")
        print(f"Best parameters: {self.lasso_model.best_params_}")
        print(f"Best cross-validation score: {self.lasso_model.best_score_:.3f}")
        
        # Get feature importance
        feature_importance = pd.DataFrame({
            'feature': self.X_train.columns,
            'coefficient': np.abs(self.lasso_model.best_estimator_.coef_[0])
        })
        feature_importance = feature_importance.sort_values('coefficient', ascending=False)
        
        return feature_importance
    
    def train_random_forest(self, cv=5):
        """
        Train random forest classifier
        
        Parameters:
        -----------
        cv : int
            Number of cross-validation folds
        """
        # Define parameter grid for Random Forest
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
        
        # Initialize model
        base_model = RandomForestClassifier(random_state=self.random_state)
        
        # Perform grid search with cross-validation
        self.rf_model = GridSearchCV(
            base_model, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1
        )
        
        # Fit model
        self.rf_model.fit(self.X_train_scaled, self.y_train.values.ravel())
        
        # Print results
        print("\nRandom Forest Results:")
        print(f"Best parameters: {self.rf_model.best_params_}")
        print(f"Best cross-validation score: {self.rf_model.best_score_:.3f}")
        
        # Get feature importance
        feature_importance = pd.DataFrame({
            'feature': self.X_train.columns,
            'importance': self.rf_model.best_estimator_.feature_importances_
        })
        feature_importance = feature_importance.sort_values('importance', ascending=False)
        
        return feature_importance
    
    def evaluate_models(self):
        """Evaluate both models on test set"""
        results = {}
        
        # Evaluate LASSO
        if self.lasso_model is not None:
            lasso_pred = self.lasso_model.predict(self.X_test_scaled)
            lasso_prob = self.lasso_model.predict_proba(self.X_test_scaled)[:, 1]
            
            results['lasso'] = {
                'predictions': lasso_pred,
                'probabilities': lasso_prob,
                'classification_report': classification_report(self.y_test, lasso_pred),
                'confusion_matrix': confusion_matrix(self.y_test, lasso_pred),
                'roc_auc': roc_auc_score(self.y_test, lasso_prob)
            }
        
        # Evaluate Random Forest
        if self.rf_model is not None:
            rf_pred = self.rf_model.predict(self.X_test_scaled)
            rf_prob = self.rf_model.predict_proba(self.X_test_scaled)[:, 1]
            
            results['random_forest'] = {
                'predictions': rf_pred,
                'probabilities': rf_prob,
                'classification_report': classification_report(self.y_test, rf_pred),
                'confusion_matrix': confusion_matrix(self.y_test, rf_pred),
                'roc_auc': roc_auc_score(self.y_test, rf_prob)
            }
        
        return results
    
    def plot_feature_importance(self, top_n=20):
        """Plot feature importance for both models"""
        if self.lasso_model is not None and self.rf_model is not None:
            # Get feature importance
            lasso_importance = self.train_lasso()
            rf_importance = self.train_random_forest()
            
            # Create subplots
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
            
            # Plot LASSO coefficients
            sns.barplot(data=lasso_importance.head(top_n), x='coefficient', y='feature', ax=ax1)
            ax1.set_title('Top LASSO Coefficients')
            
            # Plot Random Forest importance
            sns.barplot(data=rf_importance.head(top_n), x='importance', y='feature', ax=ax2)
            ax2.set_title('Top Random Forest Feature Importance')
            
            plt.tight_layout()
            plt.show()
    
    def save_results(self, output_dir):
        """Save model results and feature importance"""
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        
        # Save feature importance
        if self.lasso_model is not None:
            lasso_importance = self.train_lasso()
            lasso_importance.to_csv(output_dir / 'lasso_feature_importance.csv', index=False)
        
        if self.rf_model is not None:
            rf_importance = self.train_random_forest()
            rf_importance.to_csv(output_dir / 'random_forest_feature_importance.csv', index=False)
        
        # Save evaluation results
        results = self.evaluate_models()
        for model_name, model_results in results.items():
            with open(output_dir / f'{model_name}_results.txt', 'w') as f:
                f.write(f"Classification Report:\n{model_results['classification_report']}\n")
                f.write(f"ROC AUC Score: {model_results['roc_auc']:.3f}\n")
                f.write("\nConfusion Matrix:\n")
                f.write(str(model_results['confusion_matrix']))

def main():
    # Load preprocessed data
    print("Loading preprocessed data...")
    df = load_preprocessed_data()
    
    # Set output directory
    output_dir = '../results/model_results'
    
    # Initialize predictor
    predictor = DepressionPredictor(df)
    
    # Train models
    print("Training LASSO model...")
    lasso_importance = predictor.train_lasso()
    
    print("\nTraining Random Forest model...")
    rf_importance = predictor.train_random_forest()
    
    # Evaluate models
    print("\nEvaluating models...")
    results = predictor.evaluate_models()
    
    # Plot feature importance
    print("\nPlotting feature importance...")
    predictor.plot_feature_importance()
    
    # Save results
    print("\nSaving results...")
    predictor.save_results(output_dir)
    
    print("\nDone!")

if __name__ == "__main__":
    main()

Loading preprocessed data...
Loaded data with shape: (10094, 2359)
Training LASSO model...


ValueError: 
All the 100 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1231, in fit
    check_classification_targets(y)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/utils/multiclass.py", line 222, in check_classification_targets
    raise ValueError(
ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.
