In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder

# Define a function for preprocessing and feature engineering
def preprocess_data(df_train_raw, df_test_raw):
    """
    Preprocesses training and testing data, performs feature engineering.
    """
    # Store test row_ids for submission
    test_row_ids_func = df_test_raw['row_id'].copy()

    # Add a temporary target column to test_df to match columns for concatenation
    # and to identify test rows later.
    df_test_with_target_placeholder = df_test_raw.copy()
    df_test_with_target_placeholder['new_policy_count'] = -1 # Placeholder value

    # Combine train and test data for consistent transformations
    combined_df = pd.concat([df_train_raw, df_test_with_target_placeholder], ignore_index=True)

    # --- Feature Engineering ---

    # 1. agent_code: Ensure it's treated as a categorical string
    combined_df['agent_code'] = combined_df['agent_code'].astype(str)

    # 2. Date columns processing
    date_cols_to_convert = ['agent_join_month', 'first_policy_sold_month', 'year_month']
    date_cols_dt = [col + '_dt' for col in date_cols_to_convert] # e.g., 'agent_join_month_dt'

    for col_orig, col_dt in zip(date_cols_to_convert, date_cols_dt):
        combined_df[col_dt] = pd.to_datetime(combined_df[col_orig], format='%m/%d/%Y')

    # Create new features from date differences
    combined_df['agent_tenure_days'] = (combined_df['year_month_dt'] - combined_df['agent_join_month_dt']).dt.days
    combined_df['days_join_to_first_sale'] = (combined_df['first_policy_sold_month_dt'] - combined_df['agent_join_month_dt']).dt.days
    combined_df['days_report_to_first_sale'] = (combined_df['first_policy_sold_month_dt'] - combined_df['year_month_dt']).dt.days

    # Extract components from 'year_month_dt' (the reporting period)
    combined_df['report_year'] = combined_df['year_month_dt'].dt.year
    combined_df['report_month'] = combined_df['year_month_dt'].dt.month
    combined_df['report_dayofyear'] = combined_df['year_month_dt'].dt.dayofyear
    combined_df['report_dayofweek'] = combined_df['year_month_dt'].dt.dayofweek

    # Drop original string date columns and intermediate datetime columns
    cols_to_drop_after_feature_eng = date_cols_to_convert + date_cols_dt
    combined_df = combined_df.drop(columns=cols_to_drop_after_feature_eng)

    # 3. Label encode all object type columns (this will include 'agent_code')
    categorical_cols = combined_df.select_dtypes(include='object').columns.tolist()
    for col in categorical_cols:
        le = LabelEncoder()
        combined_df[col] = le.fit_transform(combined_df[col])

    # Separate train and test data based on the placeholder
    train_processed = combined_df[combined_df['new_policy_count'] != -1].copy()
    test_processed = combined_df[combined_df['new_policy_count'] == -1].copy()

    # Prepare final datasets for modeling
    y_train_processed = train_processed['new_policy_count'].astype(float) # Target variable
    train_processed = train_processed.drop(columns=['new_policy_count'])
    test_processed = test_processed.drop(columns=['new_policy_count'])

    return train_processed, y_train_processed, test_processed, test_row_ids_func


# --- Main script execution ---
if __name__ == '__main__':
    # Load data
    try:
        train_df_raw = pd.read_csv("train_storming_round.csv")
        test_df_raw = pd.read_csv("test_storming_round.csv")
        # sample_submission_df = pd.read_csv("sample_submission_storming_round.csv") # Not strictly needed for script logic
    except FileNotFoundError as e:
        print(f"Error loading data: {e}. Make sure the CSV files are in the same directory as the script.")
        exit()

    # Preprocess data and engineer features
    print("Preprocessing data...")
    X_train, y_train, X_test, test_row_ids = preprocess_data(train_df_raw, test_df_raw)
    print("Preprocessing complete.")

    # 'row_id' is an identifier, not a feature for training.
    # It was already handled for test_row_ids, but ensure it's not in X_train/X_test.
    if 'row_id' in X_train.columns:
        X_train = X_train.drop(columns=['row_id'])
    if 'row_id' in X_test.columns:
        X_test = X_test.drop(columns=['row_id'])

    features = X_train.columns.tolist()

    # --- Model Training (LightGBM) ---
    # LightGBM parameters (can be tuned for better performance)
    lgb_params = {
        'objective': 'regression_l1',  # Mean Absolute Error (robust to outliers)
        'metric': 'mae',               # Metric to monitor
        'n_estimators': 2000,          # Number of boosting rounds
        'learning_rate': 0.01,         # Step size shrinkage
        'feature_fraction': 0.8,       # Fraction of features to use for each tree
        'bagging_fraction': 0.8,       # Fraction of data to use for each tree (requires bagging_freq > 0)
        'bagging_freq': 1,             # Frequency for bagging
        'lambda_l1': 0.1,              # L1 regularization
        'lambda_l2': 0.1,              # L2 regularization
        'num_leaves': 31,              # Max number of leaves in one tree (default: 31)
        'verbose': -1,                 # Suppress LightGBM's own verbosity
        'n_jobs': -1,                  # Use all available cores
        'seed': 42,                    # Random seed for reproducibility
        'boosting_type': 'gbdt',       # Gradient Boosting Decision Tree
    }

    model = lgb.LGBMRegressor(**lgb_params)

    print("Starting model training...")
    # Using early stopping. eval_set ideally would be a separate validation set.
    # Here, for simplicity, it monitors performance on the training data itself.
    # This helps to stop if the model stops improving on train data, given large n_estimators.
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train)],
              eval_metric='mae',
              callbacks=[lgb.early_stopping(100, verbose=100)]) # Stop if MAE doesn't improve for 100 rounds, print every 100 rounds.
    print("Model training finished.")

    # --- Prediction ---
    print("Making predictions on the test set...")
    predictions_test = model.predict(X_test)

    # Post-processing predictions:
    # 1. Ensure non-negativity (policy count cannot be negative)
    predictions_test = np.maximum(0, predictions_test)
    # 2. Round to nearest integer and convert to int type (policy count is a whole number)
    predictions_test = np.round(predictions_test).astype(int)
    print("Predictions made and post-processed.")

    # --- Create Submission File ---
    submission_df = pd.DataFrame({
        'row_id': test_row_ids,
        'new_policy_count': predictions_test
    })

    submission_filename = "submission.csv"
    submission_df.to_csv(submission_filename, index=False)
    print(f"Submission file created successfully: {submission_filename}")

    # --- Optional: Display Feature Importances ---
    if hasattr(model, 'feature_importances_'):
        feature_importance_df = pd.DataFrame({
            'feature': features,
            'importance': model.feature_importances_
        }).sort_values(by='importance', ascending=False)

        print("\nTop 20 Feature Importances:")
        print(feature_importance_df.head(20))

Preprocessing data...
Preprocessing complete.
Starting model training...




Model training finished.
Making predictions on the test set...
Predictions made and post-processed.
Submission file created successfully: submission.csv

Top 20 Feature Importances:
                            feature  importance
17  number_of_cash_payment_policies       10765
5                   unique_proposal        6381
14                       ANBP_value        5100
15                       net_income        4134
0                        agent_code        3266
18                agent_tenure_days        3101
20        days_report_to_first_sale        2780
4     unique_proposals_last_21_days        2731
19          days_join_to_first_sale        2419
1                         agent_age        2338
16         number_of_policy_holders        2171
9                 unique_quotations        1849
3     unique_proposals_last_15_days        1807
13                 unique_customers        1511
12    unique_customers_last_21_days        1253
11    unique_customers_last_15_days        1113
8 