# LightGBM

For: NEIL HEINRICH BRAUN

The idea for this file is to train a LightGBM model on the given dataset. The data files you will need to import are unfortunately not ready yet. For now, write and test the code using `model_building_data.csv`, which is provided in the data folder. Keep in mind that the final training/testing files will have more fields.

A great thing about LightGBM is that it can handle missing data as is. LightGBM also handles data with unusual ranges better than neural-network-based models or even models like an SVM regressor. So in practice, you don't need to do much processing, because the data file should already have appropriate data for you to use.

Of course, this does not mean you should not do any processing at all. In fact, you should explore dimension reduction techniques and do feature selection where appropriate. Also, some columns might have too many NaNs and should be removed entirely.

Furthermore, LightGBM is *not* a timeseries model. Therefore, you should engineer lagged variables for prediction as well.

The last thing to keep in mind is that some rows might have missing revenue but non-missing CAR, etc. If you drop NaNs, drop them separately for each y variable to prevent unnecessary data loss.

Tune all parameters using 3-fold CV with the timesplit function like in assignment 1. I'll write a different time split function and we'll rerun with 5-10 fold CV again later before submission.

This file should save the output of the prediction in the format:

| ticker | quarter_year  | log_revenue_prediction | CAR_prediction |
|--------|---------------|------------------------|----------------|
| BAC    | Q1 2001       | 123                    | 0.5            |
| JPM    | Q1 2001       | 456                    | 0.8            |
| WFC    | Q1 2001       | 789                    | 0.25           |

Enjoy!

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from typing import List, Dict, Tuple, Union, Optional
import joblib
import os
import re

# Silence every warning for the whole session to keep notebook output short.
# NOTE: this also hides deprecation and performance warnings from the
# libraries used below.
warnings.filterwarnings("ignore")

# Create directory for models if it doesn't exist
# (the trained models are written to 'models/...' further down)
os.makedirs('models', exist_ok=True)

In [None]:
def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean column names to remove special characters that might cause issues with LightGBM

    Every character that is neither a word character nor whitespace is
    replaced by an underscore, then each run of whitespace is collapsed to a
    single underscore. When two columns end up with the same cleaned name,
    later ones get a numeric suffix (``_1``, ``_2``, ...), so the returned
    frame always has unique column names.

    Args:
        df: DataFrame with original column names (not modified)

    Returns:
        Copy of the DataFrame with cleaned column names. The original-to-clean
        mapping is stored in ``attrs['column_mapping']``; if the input has
        repeated labels, the mapping records the first occurrence.
    """
    column_mapping = {}
    # Set of names already handed out: O(1) uniqueness checks instead of
    # scanning the mapping's values for every column.
    used_names = set()
    clean_columns = []

    for col in df.columns:
        # str() so that non-string labels (e.g. integer headers) do not make
        # re.sub raise a TypeError
        clean_col = re.sub(r'[^\w\s]', '_', str(col))
        # Replace spaces with underscore
        clean_col = re.sub(r'\s+', '_', clean_col)

        # Ensure unique column names
        if clean_col in used_names:
            i = 1
            while f"{clean_col}_{i}" in used_names:
                i += 1
            clean_col = f"{clean_col}_{i}"

        used_names.add(clean_col)
        # Names are assigned by position, so repeated source labels still get
        # distinct cleaned names.
        clean_columns.append(clean_col)
        column_mapping.setdefault(col, clean_col)

    # Create a copy of the DataFrame with cleaned column names
    df_clean = df.copy()
    df_clean.columns = clean_columns

    # Store the column mapping for later reference
    df_clean.attrs['column_mapping'] = column_mapping

    return df_clean

In [None]:
def load_data(filepath: str) -> pd.DataFrame:
    """
    Load the data from a CSV file and do initial preprocessing

    Adds the helper columns ``year``, ``quarter``, ``time_id`` (a sortable
    integer, ``year * 4 + quarter``) and ``quarter_year`` (``"Q1 2000"`` style
    label used in the output file), all derived from ``datacqtr``, and then
    cleans the column names.

    Rows whose ``datacqtr`` is missing or does not contain both a four-digit
    year and a ``Q<digit>`` quarter cannot be placed in time; they are dropped
    and the number of dropped rows is printed.

    Args:
        filepath: Path to the CSV file

    Returns:
        Processed DataFrame
    """
    # Load the data
    df = pd.read_csv(filepath)

    # Parse the quarter from datacqtr
    # Example: Convert '2000Q1' to quarter=1, year=2000
    # astype(str) turns missing values into text that matches neither
    # pattern, so they come out of extract() as NaN instead of raising.
    period = df['datacqtr'].astype(str)
    year = period.str.extract(r'(\d{4})', expand=False)
    quarter = period.str.extract(r'Q(\d)', expand=False)

    # Casting NaN to int would fail with an opaque error, so filter first.
    parsed = year.notna() & quarter.notna()
    n_unparsed = int((~parsed).sum())
    if n_unparsed:
        print(f"Dropping {n_unparsed} rows with missing or unparseable datacqtr")
        df = df.loc[parsed].copy()
        year = year[parsed]
        quarter = quarter[parsed]

    df['year'] = year.astype(int)
    df['quarter'] = quarter.astype(int)

    # Create a numerical representation of time for sorting
    df['time_id'] = df['year'] * 4 + df['quarter']

    # Keep the original quarter_year format for output. Built from the parsed
    # pieces rather than by character position, so stray whitespace or a
    # different layout in datacqtr cannot produce a garbled label.
    df['quarter_year'] = "Q" + quarter + " " + year

    # Clean column names
    df = clean_column_names(df)

    # Print some basic information about the data
    print(f"DataFrame shape: {df.shape}")
    print(f"Number of unique companies: {df['tic'].nunique()}")
    print(f"Time period: {df['datacqtr'].min()} to {df['datacqtr'].max()}")

    return df

In [None]:
def check_missing_values(df: pd.DataFrame, threshold: float = 0.7) -> Tuple[List[str], pd.DataFrame]:
    """
    Check for missing values in each column and remove columns with too many missing values

    A column is kept when its fraction of missing values is strictly below
    ``threshold``. ``time_id``, ``tic`` and ``quarter_year`` are always kept
    when present, regardless of how many values they are missing. The kept
    columns stay in their original order, so the resulting feature order is
    the same on every run.

    Args:
        df: DataFrame to check
        threshold: Maximum fraction of missing values allowed (0-1)

    Returns:
        List of columns to keep and cleaned DataFrame
    """
    # Fraction of missing values in each column
    missing_percent = df.isnull().mean()

    # Print columns with missing values
    print("Columns with missing values:")
    for col, fraction in missing_percent[missing_percent > 0].items():
        print(f"{col}: {fraction*100:.2f}%")

    # Always keep time_id, tic, and quarter_year columns
    must_keep = ['time_id', 'tic', 'quarter_year']

    # A boolean mask over the columns (instead of a set union) preserves the
    # original column order; set ordering of strings changes between Python
    # processes, which made the feature order non-reproducible.
    keep_mask = ((missing_percent < threshold)
                 | missing_percent.index.isin(must_keep)).to_numpy()
    cols_to_keep = missing_percent.index[keep_mask].tolist()

    # Remove columns with too many missing values
    df_cleaned = df.loc[:, keep_mask].copy()

    return cols_to_keep, df_cleaned

In [None]:
def create_lagged_features(df: pd.DataFrame, lag_columns: List[str], lag_periods: List[int]) -> pd.DataFrame:
    """
    Create lagged features for specified columns

    For every requested column that exists in ``df`` the following are added,
    each computed within a ticker (``tic``) after sorting by ``time_id``:

    * ``<col>_lag_<k>``: the value ``k`` rows earlier, for each ``k`` in
      ``lag_periods``
    * ``<col>_roc_<k>``: relative change versus that lag, for positive ``k``
      (numeric columns only; a zero lag value yields NaN rather than inf)
    * ``<col>_rolling_mean_<w>``: mean over the last ``w`` rows *including
      the current one*, for ``w`` in (2, 4) (numeric columns only)

    NOTE: lags are taken by row position within a ticker, not by calendar
    distance, so a ticker with missing quarters gets lags that span the gap.
    NOTE: the roc and rolling-mean features contain the current value of
    ``col``; do not use them as predictors of ``col`` itself.

    Args:
        df: DataFrame to create lagged features for (not modified)
        lag_columns: List of columns to create lags for; names that are not
            in ``df`` are ignored
        lag_periods: List of lag periods

    Returns:
        DataFrame sorted by ticker and time, with the new feature columns
        appended
    """
    # Sort the data by ticker and time (sort_values returns a new frame)
    df = df.sort_values(['tic', 'time_id'])

    present = [col for col in lag_columns if col in df.columns]
    # Rate of change and rolling means are arithmetic, so restrict them to
    # numeric columns instead of crashing on text columns.
    numeric = [
        col for col in present
        if pd.api.types.is_numeric_dtype(df[col])
        and not pd.api.types.is_bool_dtype(df[col])
    ]

    by_ticker = df.groupby('tic')
    # Collect the new columns and attach them in one go; inserting hundreds
    # of columns one at a time fragments the frame.
    new_cols = {}

    # Create lagged features
    for col in present:
        for lag in lag_periods:
            new_cols[f"{col}_lag_{lag}"] = by_ticker[col].shift(lag)

    # Calculate rate of change features. The lag column is already aligned
    # per ticker, so plain vectorised arithmetic is enough; the previous
    # groupby().apply() returned a wide frame when only one ticker was
    # present, which broke the assignment.
    for col in numeric:
        for lag in lag_periods:
            if lag > 0:
                lagged = new_cols[f"{col}_lag_{lag}"]
                # Replace 0 by NaN to avoid divide by zero issues
                new_cols[f"{col}_roc_{lag}"] = (df[col] - lagged) / lagged.replace(0, np.nan)

    # Calculate rolling means
    for col in numeric:
        for window in (2, 4):
            new_cols[f"{col}_rolling_mean_{window}"] = by_ticker[col].transform(
                lambda x, w=window: x.rolling(window=w, min_periods=1).mean())

    if not new_cols:
        return df

    features = pd.concat(new_cols, axis=1)
    # If a generated name already exists in df, the new column replaces it.
    clashes = [col for col in features.columns if col in df.columns]
    return pd.concat([df.drop(columns=clashes), features], axis=1)

In [None]:
def prepare_data_for_training(df: pd.DataFrame, target_col: str, drop_cols: List[str]) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Split a frame into a feature matrix and a target vector

    Besides ``drop_cols`` and the target itself, the features also lose every
    column belonging to the *other* outcome family: columns whose name
    contains 'Total_Current_Operating_Revenue' or 'car5' are removed unless
    the target is one of them. Remaining object/category columns are
    label-encoded as integers.

    Args:
        df: DataFrame to prepare
        target_col: Name of the target column
        drop_cols: List of columns to drop from features (left unmodified)

    Returns:
        X and y for training
    """
    target = df[target_col].copy()

    # Work on a copy so the caller's list is never mutated
    excluded = [*drop_cols]
    if target_col not in excluded:
        excluded.append(target_col)

    # Revenue family first, then CAR family: a family is excluded as a whole
    # only when the target does not belong to it.
    for marker in ('Total_Current_Operating_Revenue', 'car5'):
        family = [name for name in df.columns if marker in name]
        if family and target_col not in family:
            excluded.extend(family)

    # Names that are not present in df are silently skipped
    features = df.drop(columns=excluded, errors='ignore')

    # Integer-encode text/categorical columns (missing values become the
    # string 'nan' and receive their own code)
    categorical = features.select_dtypes(include=['object', 'category']).columns.tolist()
    for name in categorical:
        encoder = LabelEncoder()
        features[name] = encoder.fit_transform(features[name].astype(str))

    print(f"X shape: {features.shape}, y shape: {target.shape}")
    return features, target

In [None]:
def train_lightgbm_model(X: pd.DataFrame, y: pd.Series, n_splits: int = 3, 
                        params: Optional[Dict] = None) -> Tuple[lgb.Booster, float, Dict]:
    """
    Train a LightGBM model with time series cross-validation

    Rows are ordered by ``time_id`` (when that column is present) and split
    with ``TimeSeriesSplit`` so that every validation fold lies after its
    training fold. Each fold trains up to 500 rounds with early stopping
    (patience 50) on the validation fold.

    ``X`` and ``y`` are assumed to be row-aligned (same rows, same order).

    NOTE: the split is by row position after sorting, so rows that share a
    ``time_id`` at a fold boundary can end up on both sides of it.

    Args:
        X: Feature matrix
        y: Target variable
        n_splits: Number of splits for cross-validation
        params: LightGBM parameters; a default regression setup is used when
            None

    Returns:
        The fold model with the lowest validation RMSE, the average
        validation RMSE over the successfully trained folds, and the feature
        importances averaged over those folds

    Raises:
        ValueError: if no fold could be trained
    """
    if params is None:
        params = {
            'objective': 'regression',
            'metric': 'rmse',
            'boosting_type': 'gbdt',
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.9,
            'verbose': -1
        }

    # Create time series split
    tscv = TimeSeriesSplit(n_splits=n_splits)

    # Keep track of validation scores
    val_scores = []
    models = []
    importance_totals = dict.fromkeys(X.columns, 0.0)

    # Row positions in chronological order. Positions (used with .iloc) work
    # for any index, including the non-contiguous one left behind by dropna;
    # the stable sort keeps the result deterministic for tied time_ids.
    if 'time_id' in X.columns:
        time_order = np.argsort(X['time_id'].to_numpy(), kind='stable')
    else:
        time_order = np.arange(len(X))

    # Train model with each fold
    for i, (train_idx, val_idx) in enumerate(tscv.split(X)):
        if len(train_idx) == 0 or len(val_idx) == 0:
            continue
        try:
            # Map fold positions back to chronological row positions
            train_pos = time_order[train_idx]
            val_pos = time_order[val_idx]

            X_train, y_train = X.iloc[train_pos], y.iloc[train_pos]
            X_val, y_val = X.iloc[val_pos], y.iloc[val_pos]

            # Create LightGBM datasets
            train_data = lgb.Dataset(X_train, label=y_train)
            val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

            # Set early stopping parameters
            callbacks = [
                lgb.early_stopping(stopping_rounds=50, verbose=False),
                lgb.log_evaluation(period=0)  # Disable logging
            ]

            # Train model
            model = lgb.train(
                params, 
                train_data,
                valid_sets=[train_data, val_data],
                valid_names=['train', 'valid'],
                num_boost_round=500,
                callbacks=callbacks
            )

            # Make predictions
            val_preds = model.predict(X_val)
            # RMSE via sqrt(MSE): the `squared=False` argument is deprecated
            # and no longer accepted by recent scikit-learn releases, where
            # it made every fold fail inside this try block.
            val_score = float(np.sqrt(mean_squared_error(y_val, val_preds)))
            importances = model.feature_importance()
        except Exception as e:
            # Best effort: report the failed fold and carry on with the rest
            print(f"Error in fold {i+1}: {e}")
            continue

        # Record the fold only once everything succeeded, so that scores,
        # models and importances always stay in step.
        val_scores.append(val_score)
        models.append(model)
        for feature, importance in zip(X.columns, importances):
            importance_totals[feature] += importance

        print(f"Fold {i+1}/{n_splits}: RMSE = {val_score:.4f}")

    if not models:
        raise ValueError("No models were successfully trained during cross-validation")

    # Average over the folds that actually trained (not n_splits), so a
    # skipped fold does not shrink the importances.
    feature_importances = {
        feature: total / len(models) for feature, total in importance_totals.items()
    }

    # Calculate average validation score
    avg_val_score = float(np.mean(val_scores))
    print(f"Average RMSE: {avg_val_score:.4f}")

    # Return the best model (lowest validation error)
    best_model = models[int(np.argmin(val_scores))]

    return best_model, avg_val_score, feature_importances

In [None]:
def train_final_model(X: pd.DataFrame, y: pd.Series, params: Dict,
                      num_boost_round: int = 500) -> lgb.Booster:
    """
    Train a final model on all data

    No validation set is used here, so there is no early stopping: the model
    always runs for ``num_boost_round`` rounds. Pass the best iteration found
    during cross-validation to avoid training longer than the CV folds did.

    Args:
        X: Feature matrix
        y: Target variable
        params: Model parameters
        num_boost_round: Number of boosting rounds (default 500)

    Returns:
        Trained model
    """
    # Create LightGBM dataset
    train_data = lgb.Dataset(X, label=y)

    # Train model
    model = lgb.train(
        params,
        train_data,
        num_boost_round=num_boost_round
    )

    return model

In [None]:
def run_revenue_prediction(df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the full revenue prediction pipeline

    Steps: pick the revenue target column, drop rows without a target, build
    lag features (including lags of the target), drop sparse columns,
    cross-validate, train a final model on all rows, save it to
    'models/lightgbm_revenue_model.pkl' and plot the feature importances.

    NOTE: the returned predictions are in-sample (the final model predicts
    the same rows it was trained on).

    Args:
        df: Input DataFrame

    Returns:
        DataFrame with columns tic, quarter_year and log_revenue_prediction

    Raises:
        ValueError: if no revenue column is found
    """
    print("\n=== Revenue Prediction ===")

    # Identify target column (using partial match since column names were cleaned)
    target_cols = [col for col in df.columns if 'Total_Current_Operating_Revenue' in col]
    if not target_cols:
        raise ValueError("Could not find revenue target column in the dataset")
    target_col = target_cols[0]

    # Drop rows with missing target values
    df_revenue = df.dropna(subset=[target_col]).copy()

    # Define columns for lag features (excluding target, identifiers, and other Y variables).
    # Identifiers are matched exactly: a substring test on short names such
    # as 'tic' or 'year' also excluded unrelated features (e.g. any column
    # whose name contains "Domestic"). Outcome variables keep the partial
    # match because their cleaned names carry suffixes.
    id_columns = {'tic', 'datacqtr', 'quarter', 'year', 'time_id', 'quarter_year'}
    outcome_patterns = ('Total_Current_Operating_Revenue', 'car5')

    lag_columns = [
        col for col in df_revenue.columns
        if col not in id_columns
        and not any(pattern in col for pattern in outcome_patterns)
    ]

    # Create lagged features (for revenue prediction, we need to lag the target variable)
    lag_periods = [1, 2, 4]  # 1, 2, 4 quarters lag
    lag_columns.append(target_col)
    df_revenue = create_lagged_features(df_revenue, lag_columns, lag_periods)

    # The rate-of-change and rolling-mean features of the target are computed
    # from the current-quarter target, so together with its lag they reveal
    # the value being predicted. Keep only the pure lags of the target.
    leaky_prefixes = (f"{target_col}_roc_", f"{target_col}_rolling_mean_")
    leaky_cols = [col for col in df_revenue.columns if col.startswith(leaky_prefixes)]
    df_revenue = df_revenue.drop(columns=leaky_cols)

    # Drop rows without a previous-quarter target (the first quarter of each ticker)
    lag_col = f"{target_col}_lag_1"
    if lag_col in df_revenue.columns:
        df_revenue = df_revenue.dropna(subset=[lag_col]).copy()

    # Check missing values and clean data
    _, df_revenue_cleaned = check_missing_values(df_revenue)

    # Prepare data for training
    X, y = prepare_data_for_training(
        df_revenue_cleaned, 
        target_col, 
        drop_cols=['tic', 'datacqtr', 'quarter_year', 'quarter', 'year']
    )

    # Set parameters for LightGBM (simplified for faster execution)
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'verbose': -1
    }

    # Train the model with cross-validation (only the importances are used here)
    _, _, feature_importances = train_lightgbm_model(X, y, n_splits=3, params=params)

    # Train the final model on all data
    final_model = train_final_model(X, y, params)

    # Save the model
    joblib.dump(final_model, 'models/lightgbm_revenue_model.pkl')

    # Make predictions on the entire dataset (X has the same rows, in the
    # same order, as df_revenue_cleaned)
    df_revenue_cleaned['log_revenue_prediction'] = final_model.predict(X)

    # Create output DataFrame
    output_df_revenue = df_revenue_cleaned[['tic', 'quarter_year', 'log_revenue_prediction']].copy()

    # Plot feature importances (best effort: a plotting problem must not
    # discard the predictions)
    try:
        plt.figure(figsize=(12, 8))
        feature_imp = pd.DataFrame(sorted(feature_importances.items(), key=lambda x: x[1], reverse=True), 
                                columns=['Feature', 'Importance'])
        if len(feature_imp) > 20:
            feature_imp = feature_imp.iloc[:20]
        plt.barh(feature_imp['Feature'], feature_imp['Importance'])
        plt.title('Feature Importance for Revenue Prediction')
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.tight_layout()
        plt.savefig('revenue_feature_importance.png')
    except Exception as e:
        print(f"Warning: Could not plot feature importance: {e}")

    return output_df_revenue

In [None]:
def run_car_prediction(df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the full CAR prediction pipeline

    Steps: pick the CAR target column, drop rows without a target, build lag
    features of the predictors (the target itself is not lagged), drop sparse
    columns, cross-validate, train a final model on all rows, save it to
    'models/lightgbm_car_model.pkl' and plot the feature importances.

    NOTE: the returned predictions are in-sample (the final model predicts
    the same rows it was trained on).

    Args:
        df: Input DataFrame

    Returns:
        DataFrame with columns tic, quarter_year and CAR_prediction

    Raises:
        ValueError: if no CAR column is found
    """
    print("\n=== CAR Prediction ===")

    # Identify target column (using partial match since column names were cleaned)
    target_cols = [col for col in df.columns if 'car5' in col]
    if not target_cols:
        raise ValueError("Could not find CAR target column in the dataset")
    target_col = target_cols[0]

    # Drop rows with missing target values
    df_car = df.dropna(subset=[target_col]).copy()

    # Define columns for lag features (excluding target, identifiers, and other Y variables).
    # Identifiers are matched exactly: a substring test on short names such
    # as 'tic' or 'year' also excluded unrelated features (e.g. any column
    # whose name contains "Domestic"). Outcome variables keep the partial
    # match because their cleaned names carry suffixes.
    id_columns = {'tic', 'datacqtr', 'quarter', 'year', 'time_id', 'quarter_year'}
    outcome_patterns = ('Total_Current_Operating_Revenue', 'car5')

    lag_columns = [
        col for col in df_car.columns
        if col not in id_columns
        and not any(pattern in col for pattern in outcome_patterns)
    ]

    # Create lagged features (for CAR, we don't need to lag the target variable)
    lag_periods = [1, 2, 4]  # 1, 2, 4 quarters lag
    df_car = create_lagged_features(df_car, lag_columns, lag_periods)

    # For CAR prediction, we don't need to have lagged features of the target
    # So we don't filter based on lag features, just use the rows with non-missing CAR

    # Check missing values and clean data
    _, df_car_cleaned = check_missing_values(df_car)

    # Prepare data for training
    X, y = prepare_data_for_training(
        df_car_cleaned, 
        target_col, 
        drop_cols=['tic', 'datacqtr', 'quarter_year', 'quarter', 'year']
    )

    # Set parameters for LightGBM (simplified for faster execution)
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'verbose': -1
    }

    # Train the model with cross-validation (only the importances are used here)
    _, _, feature_importances = train_lightgbm_model(X, y, n_splits=3, params=params)

    # Train the final model on all data
    final_model = train_final_model(X, y, params)

    # Save the model
    joblib.dump(final_model, 'models/lightgbm_car_model.pkl')

    # Make predictions on the entire dataset (X has the same rows, in the
    # same order, as df_car_cleaned)
    df_car_cleaned['CAR_prediction'] = final_model.predict(X)

    # Create output DataFrame
    output_df_car = df_car_cleaned[['tic', 'quarter_year', 'CAR_prediction']].copy()

    # Plot feature importances (best effort: a plotting problem must not
    # discard the predictions)
    try:
        plt.figure(figsize=(12, 8))
        feature_imp = pd.DataFrame(sorted(feature_importances.items(), key=lambda x: x[1], reverse=True), 
                                columns=['Feature', 'Importance'])
        if len(feature_imp) > 20:
            feature_imp = feature_imp.iloc[:20]
        plt.barh(feature_imp['Feature'], feature_imp['Importance'])
        plt.title('Feature Importance for CAR Prediction')
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.tight_layout()
        plt.savefig('car_feature_importance.png')
    except Exception as e:
        print(f"Warning: Could not plot feature importance: {e}")

    return output_df_car

In [None]:
def combine_predictions(revenue_predictions: pd.DataFrame, car_predictions: pd.DataFrame) -> pd.DataFrame:
    """
    Join the revenue and CAR predictions on ticker and quarter

    The join is an outer join, so a (tic, quarter_year) pair that only has
    one of the two predictions is kept, with NaN for the other one.

    Args:
        revenue_predictions: DataFrame with revenue predictions
        car_predictions: DataFrame with CAR predictions

    Returns:
        Combined DataFrame
    """
    join_keys = ['tic', 'quarter_year']
    return revenue_predictions.merge(car_predictions, how='outer', on=join_keys)

In [None]:
def main():
    """
    Main function to run the entire pipeline

    Loads 'data/model_building_data.csv', runs the revenue and CAR pipelines,
    merges their predictions and writes them to 'lightgbm_predictions.csv'
    with the columns ticker, quarter_year, log_revenue_prediction and
    CAR_prediction.

    Returns:
        The combined predictions DataFrame (ticker column named 'tic'), or
        None if any pipeline step failed; the error and traceback are printed
        in that case. Errors while loading the data are not caught.
    """
    # Load data
    df = load_data('data/model_building_data.csv')

    try:
        # Run revenue prediction
        revenue_predictions = run_revenue_prediction(df)

        # Run CAR prediction
        car_predictions = run_car_prediction(df)

        # Combine predictions
        combined_predictions = combine_predictions(revenue_predictions, car_predictions)

        # Save the final output. The agreed output format names the ticker
        # column 'ticker', while the pipeline works with 'tic' internally;
        # rename only in the file so the returned frame is unchanged.
        combined_predictions.rename(columns={'tic': 'ticker'}).to_csv(
            'lightgbm_predictions.csv', index=False)

        print("\nPipeline completed. Predictions saved to 'lightgbm_predictions.csv'")

        return combined_predictions

    except Exception as e:
        # Top-level boundary: report the failure instead of raising
        print(f"An error occurred: {e}")
        import traceback
        traceback.print_exc()
        return None

# Run the pipeline when this file is executed directly (or as a notebook cell)
if __name__ == "__main__":
    final_predictions = main()
    
    # Display a sample of the predictions
    # (main() returns None when the pipeline failed, so there is nothing to show)
    if final_predictions is not None:
        print("\nSample predictions:")
        print(final_predictions.head(10))