In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier, VotingClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

In [2]:
df = pd.read_csv('data/train_preprocessed_fill_missing.csv')

In [3]:
print(df.shape)
df.head()

(22151, 3)


Unnamed: 0,id,cleaned_text,target
0,11098,post remove request member hi welcome immediat...,suicidal-thoughts-and-self-harm
1,116,hi nmtb thank post think lot people terrify st...,anxiety
2,7189,hello cas fair anxiety depression work lot com...,anxiety
3,4350,hey everyone discover another mum 's sister de...,anxiety
4,9749,hi everyone guess title say really .. 28 year ...,depression


In [4]:
X = np.load('data/train_embeddings.npy')

In [5]:
print(X.shape)
X[0:2]

(22151, 768)


array([[-0.40555438,  0.19110961,  0.14738172, ..., -0.621777  ,
        -0.08748772, -0.46268186],
       [-0.8397894 ,  0.08435682,  0.24780166, ..., -0.6236058 ,
         0.18323515, -0.0878297 ]], dtype=float32)

In [6]:
df['target'].value_counts()

target
relationship-and-family-issues     6688
anxiety                            6652
depression                         5836
ptsd-and-trauma                    1819
suicidal-thoughts-and-self-harm    1156
Name: count, dtype: int64

In [7]:
y = df['target']

# Target encoding

In [8]:
# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)


# Improved Cross validation

In [9]:
test = pd.read_csv('data/test_preprocessed_fill_missing.csv')
test_X = np.load('data/test_embeddings.npy')

In [10]:
final_preds = np.zeros(len(test))
final_preds_ensemble = np.zeros_like(final_preds)
final_preds.shape, final_preds_ensemble.shape

((2462,), (2462,))

In [12]:
# Initialize cross-validation
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
accuracy_scores = []
f1_macro_scores = []

# Perform 5-fold cross-validation
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y_encoded)):
    print(f"\nProcessing Fold {fold + 1}/5")
    
    # Split data
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]
    
    # Calculate class weights
    classes = np.unique(y_train)
    class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
    sample_weights = np.array([class_weights[i] for i in y_train])

    
    # CatBoost
    # cat = CatBoostClassifier(
    #     iterations=1000,
    #     learning_rate=0.05,
    #     depth=8,
    #     l2_leaf_reg=5,
    #     border_count=128,
    #     loss_function='MultiClass',
    #     eval_metric='Accuracy',
    #     verbose=500,
    #     random_state=42,
    #     task_type='GPU'  # Enable GPU
    # )
    # cat.fit(
    #     X_train, y_train,
    #     sample_weight=sample_weights,
    #     eval_set=(X_val, y_val),
    #     early_stopping_rounds=10
    # )
    
    # Initialize fast classifiers
    models = [
        ('rf', RandomForestClassifier(
            n_estimators=200,
            max_depth=15,
            class_weight='balanced',
            n_jobs=-1,
            random_state=42
        )),
        ('logreg', LogisticRegression(
            penalty='l2',
            C=1.0,
            class_weight='balanced',
            solver='lbfgs',
            max_iter=1000,
            random_state=42
        )),
        # ('svm', make_pipeline(
        #     LinearSVC(
        #         class_weight='balanced',
        #         dual=False,
        #         tol=1e-3,
        #         random_state=42
        #     )
        # )),
        ('sgd', SGDClassifier(
            loss='log_loss',  # Use log loss for probabilistic predictions
            class_weight='balanced',
            penalty='l2',
            alpha=1e-4,
            max_iter=1000,
            tol=1e-3,
            n_jobs=-1,
            random_state=42
        )),
        ('hgb', HistGradientBoostingClassifier(
            max_iter=200,
            learning_rate=0.1,
            max_depth=10,
            early_stopping=True,
            validation_fraction=0.2,
            random_state=42
        ))
    ]

    # Create voting ensemble
    ensemble = VotingClassifier(
        estimators=models,
        voting='soft',  # Use soft voting for probabilistic predictions
        n_jobs=-1
    )

    # Train ensemble
    print("Training ensemble...")
    ensemble.fit(X_train, y_train)

    # # Rebuild stacking ensemble WITHIN FOLD
    # base_models = [('ensemble', ensemble), ('cat', cat)]
    # stacker = StackingClassifier(
    #     estimators=base_models,
    #     final_estimator=MLPClassifier(
    #         hidden_layer_sizes=(32, 16),
    #         early_stopping=True,
    #         learning_rate='adaptive',
    #         # validation_fraction=0.1,  # Use 10% of training data for validation
    #         n_iter_no_change=10,      # Stop if no improvement for 10 epochs
    #         verbose=500              # Print progress
    #     ),
    #     stack_method='predict_proba',
    #     passthrough=True
    # )

    # # Train with sample weights
    # print("Training stacking ensemble...")
    # stacker.fit(X_train, y_train)

    # # Generate predictions
    # y_pred = stacker.predict(X_val)
    # Generate predictions from ensemble
    y_pred_ensemble = ensemble.predict(X_val)
    
    # # Calculate metrics
    # fold_acc = accuracy_score(y_val, y_pred)
    # fold_f1 = f1_score(y_val, y_pred, average='macro')
    
    fold_acc_ensemble = accuracy_score(y_val, y_pred_ensemble)
    fold_acc_ensemble = f1_score(y_val, y_pred_ensemble, average='macro')
    
    # accuracy_scores.append(fold_acc)
    # f1_macro_scores.append(fold_f1)
    
    # print(f"Fold {fold + 1} - Accuracy: {fold_acc:.4f}, F1 Macro: {fold_f1:.4f}")
    print(f"Fold Ensemble {fold + 1} - Accuracy: {fold_acc_ensemble:.4f}, F1 Macro: {fold_acc_ensemble:.4f}")
    
    # final_preds += stacker.predict(test_X)
    final_preds_ensemble += ensemble.predict(test_X)
    


Processing Fold 1/5
Training ensemble...


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.


In [None]:
# Average the final predictions
final_preds /= cv.get_n_splits()

# Determine the final predicted classes
final_classes = np.argmax(final_preds, axis=1)

In [None]:
# Create submission DataFrame
submission_df = pd.DataFrame({
    'id': test['id'],
    'target': label_encoder.inverse_transform(final_classes)
})

# Save submission file
submission_df.to_csv('data/submission_ensemble_bert_stack.csv', index=False)