In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_class_weight

In [13]:
df = pd.read_csv('data/train_preprocessed_fill_missing.csv')

In [14]:
print(df.shape)
df.head()

(22151, 3)


Unnamed: 0,id,cleaned_text,target
0,11098,post remove request member hi welcome immediat...,suicidal-thoughts-and-self-harm
1,116,hi nmtb thank post think lot people terrify st...,anxiety
2,7189,hello cas fair anxiety depression work lot com...,anxiety
3,4350,hey everyone discover another mum 's sister de...,anxiety
4,9749,hi everyone guess title say really .. 28 year ...,depression


In [15]:
X = np.load('data/train_embeddings.npy')

In [16]:
print(X.shape)
X[0:2]

(22151, 768)


array([[-0.40555438,  0.19110961,  0.14738172, ..., -0.621777  ,
        -0.08748772, -0.46268186],
       [-0.8397894 ,  0.08435682,  0.24780166, ..., -0.6236058 ,
         0.18323515, -0.0878297 ]], dtype=float32)

In [18]:
df['target'].value_counts()

target
relationship-and-family-issues     6688
anxiety                            6652
depression                         5836
ptsd-and-trauma                    1819
suicidal-thoughts-and-self-harm    1156
Name: count, dtype: int64

In [20]:
y = df['target']

# Target encoding

In [None]:
# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)


array([4, 0, 0, ..., 0, 2, 1])

# Cross validation

In [23]:
# Initialize cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
f1_macro_scores = []

# Perform 5-fold cross-validation
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_encoded)):
    print(f"\nProcessing Fold {fold + 1}/5")
    
    # Split data
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]
    
    # Calculate class weights
    classes = np.unique(y_train)
    class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
    sample_weights = np.array([class_weights[i] for i in y_train])
    
    # Initialize models with class weights
    lgbm = LGBMClassifier(
        objective='multiclass',
        num_class=num_classes,
        class_weight={i: w for i, w in enumerate(class_weights)},
        random_state=42,
        force_col_wise=True,
        device='gpu'  # Enable GPU
    )
    
    xgb = XGBClassifier(
        objective='multi:softprob',
        num_class=num_classes,
        eval_metric='mlogloss',
        use_label_encoder=False,
        random_state=42,
        tree_method='gpu_hist'  # Enable GPU
    )
    
    cat = CatBoostClassifier(
        loss_function='MultiClass',
        class_weights=list(class_weights),
        verbose=0,
        random_state=42,
        task_type='GPU'  # Enable GPU
    )
    
    # Train models
    print("Training models...")
    lgbm.fit(X_train, y_train)
    xgb.fit(X_train, y_train, sample_weight=sample_weights)
    cat.fit(X_train, y_train)
    
    # Generate predictions
    lgbm_proba = lgbm.predict_proba(X_val)
    xgb_proba = xgb.predict_proba(X_val)
    cat_proba = cat.predict_proba(X_val)
    
    # Ensemble predictions (average probabilities)
    ensemble_proba = (lgbm_proba + xgb_proba + cat_proba) / 3
    ensemble_preds = np.argmax(ensemble_proba, axis=1)
    
    # Calculate metrics
    fold_acc = accuracy_score(y_val, ensemble_preds)
    fold_f1 = f1_score(y_val, ensemble_preds, average='macro')
    
    accuracy_scores.append(fold_acc)
    f1_macro_scores.append(fold_f1)
    
    print(f"Fold {fold + 1} - Accuracy: {fold_acc:.4f}, F1 Macro: {fold_f1:.4f}")

# Print cross-validation results
print("\nCross-Validation Results:")
print(f"Average Accuracy: {np.mean(accuracy_scores):.4f} (+/- {np.std(accuracy_scores):.4f})")
print(f"Average F1 Macro: {np.mean(f1_macro_scores):.4f} (+/- {np.std(f1_macro_scores):.4f})")

# Train final ensemble on full dataset for test set predictions
print("\nTraining final ensemble on full dataset...")
final_class_weights = compute_class_weight('balanced', classes=np.unique(y_encoded), y=y_encoded)
final_sample_weights = np.array([final_class_weights[i] for i in y_encoded])

final_lgbm = LGBMClassifier(
    objective='multiclass',
    num_class=num_classes,
    class_weight={i: w for i, w in enumerate(final_class_weights)},
    random_state=42
).fit(X, y_encoded)

final_xgb = XGBClassifier(
    objective='multi:softprob',
    num_class=num_classes,
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42
).fit(X, y_encoded, sample_weight=final_sample_weights)

final_cat = CatBoostClassifier(
    loss_function='MultiClass',
    class_weights=list(final_class_weights),
    verbose=0,
    random_state=42
).fit(X, y_encoded)


Processing Fold 1/5
Training models...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.183052 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 17720, number of used features: 768
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438


Parameters: { "use_label_encoder" } are not used.



Fold 1 - Accuracy: 0.6822, F1 Macro: 0.6050

Processing Fold 2/5
Training models...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.110003 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 17721, number of used features: 768
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438


Parameters: { "use_label_encoder" } are not used.



Fold 2 - Accuracy: 0.6937, F1 Macro: 0.6018

Processing Fold 3/5
Training models...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.172675 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 17721, number of used features: 768
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438


Parameters: { "use_label_encoder" } are not used.



Fold 3 - Accuracy: 0.6986, F1 Macro: 0.6121

Processing Fold 4/5
Training models...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.166806 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 17721, number of used features: 768
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438


Parameters: { "use_label_encoder" } are not used.



Fold 4 - Accuracy: 0.6966, F1 Macro: 0.6017

Processing Fold 5/5
Training models...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.138599 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 17721, number of used features: 768
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438


Parameters: { "use_label_encoder" } are not used.



Fold 5 - Accuracy: 0.6916, F1 Macro: 0.6057

Cross-Validation Results:
Average Accuracy: 0.6926 (+/- 0.0057)
Average F1 Macro: 0.6053 (+/- 0.0038)

Training final ensemble on full dataset...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.231126 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 22151, number of used features: 768
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438


Parameters: { "use_label_encoder" } are not used.



# Test set predictions

In [25]:
test = pd.read_csv('data/test_preprocessed_fill_missing.csv')
test_X = np.load('data/test_embeddings.npy')

In [26]:
# Example usage for test set predictions:
test_preds_lgbm = final_lgbm.predict_proba(test_X)
test_preds_xgb = final_xgb.predict_proba(test_X)
test_preds_cat = final_cat.predict_proba(test_X)
ensemble_preds = (test_preds_lgbm + test_preds_xgb + test_preds_cat) / 3
final_predictions = label_encoder.inverse_transform(np.argmax(ensemble_preds, axis=1))

In [28]:
# Create submission DataFrame
submission_df = pd.DataFrame({
    'id': test['id'],
    'target': final_predictions
})

# Save submission file
submission_df.to_csv('data/submission_ensemble_raw.csv', index=False)