In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import ShuffleSplit

from scipy.sparse import save_npz, load_npz

In [2]:
df = pd.read_csv('data/train_preprocessed_fill_missing.csv')

In [3]:
print(df.shape)
df.head()

(22151, 3)


Unnamed: 0,id,cleaned_text,target
0,11098,post remove request member hi welcome immediat...,suicidal-thoughts-and-self-harm
1,116,hi nmtb thank post think lot people terrify st...,anxiety
2,7189,hello cas fair anxiety depression work lot com...,anxiety
3,4350,hey everyone discover another mum 's sister de...,anxiety
4,9749,hi everyone guess title say really .. 28 year ...,depression


In [4]:
X = load_npz('data/train_tfidf_embeddings.npz')

In [5]:
print(X.shape)
X[0:2]

(22151, 4096)


<2x4096 sparse matrix of type '<class 'numpy.float64'>'
	with 59 stored elements in Compressed Sparse Row format>

In [6]:
df['target'].value_counts()

target
relationship-and-family-issues     6688
anxiety                            6652
depression                         5836
ptsd-and-trauma                    1819
suicidal-thoughts-and-self-harm    1156
Name: count, dtype: int64

In [7]:
y = df['target']

# Target encoding

In [8]:
# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)


# Improved Cross validation

In [9]:
test = pd.read_csv('data/test_preprocessed_fill_missing.csv')
test_X = load_npz('data/test_tfidf_embeddings.npz')

In [10]:
final_preds = np.zeros(len(test))
final_preds.shape

(2462,)

In [12]:
# Initialize cross-validation
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
accuracy_scores = []
f1_macro_scores = []

# Perform 5-fold cross-validation
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y_encoded)):
    print(f"\nProcessing Fold {fold + 1}/5")
    
    # Split data
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]
    
    # Calculate class weights
    classes = np.unique(y_train)
    class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
    sample_weights = np.array([class_weights[i] for i in y_train])

    # Initialize models with common parameters
    common_tree_params = {
        'n_estimators': 1000,
        'learning_rate': 0.05,
        'max_depth': 3,
        'random_state': 42
    }
    
    # LightGBM
    lgbm = LGBMClassifier(
        **common_tree_params,
        objective='multiclass',
        num_class=num_classes,
        force_col_wise=True,
        verbose=-1,
        device='gpu'  # Enable GPU
    )
    lgbm.fit(
        X_train, y_train,
        sample_weight=sample_weights,
        eval_set=[(X_val, y_val)],
        eval_metric='multi_error',
        # early_stopping_rounds=10,
        
    )
    
    # XGBoost
    xgb = XGBClassifier(
        **common_tree_params,
        objective='multi:softprob',
        num_class=num_classes,
        eval_metric=['merror', 'mlogloss'],
        tree_method='hist',
        device='cuda'# Enable GPU
    )
    xgb.fit(
        X_train, y_train,
        sample_weight=sample_weights,
        eval_set=[(X_val, y_val)],
        # early_stopping_rounds=10,
        verbose=500
    )
    
    # CatBoost
    cat = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.05,
        depth=8,
        l2_leaf_reg=5,
        border_count=128,
        loss_function='MultiClass',
        eval_metric='Accuracy',
        verbose=500,
        random_state=42,
        task_type='GPU'  # Enable GPU
    )
    cat.fit(
        X_train, y_train,
        sample_weight=sample_weights,
        eval_set=(X_val, y_val),
        early_stopping_rounds=10
    )

    # Rebuild stacking ensemble WITHIN FOLD
    base_models = [('lgbm', lgbm), ('xgb', xgb), ('cat', cat)]
    stacker = StackingClassifier(
        estimators=base_models,
        final_estimator=MLPClassifier(
            hidden_layer_sizes=(64, 32),
            early_stopping=True,
            learning_rate='adaptive',
            # validation_fraction=0.1,  # Use 10% of training data for validation
            n_iter_no_change=10,      # Stop if no improvement for 10 epochs
            verbose=500              # Print progress
        ),
        stack_method='predict_proba',
        passthrough=True
    )

    # Train with sample weights
    print("Training stacking ensemble...")
    stacker.fit(X_train, y_train, 
                sample_weight=sample_weights)

    # Generate predictions
    y_pred = stacker.predict(X_val)
    
    # Calculate metrics
    fold_acc = accuracy_score(y_val, y_pred)
    fold_f1 = f1_score(y_val, y_pred, average='macro')
    
    accuracy_scores.append(fold_acc)
    f1_macro_scores.append(fold_f1)
    
    print(f"Fold {fold + 1} - Accuracy: {fold_acc:.4f}, F1 Macro: {fold_f1:.4f}")
    
    final_preds += stacker.predict(test_X)
    


Processing Fold 1/5




[0]	validation_0-merror:0.61183	validation_0-mlogloss:1.59784
[500]	validation_0-merror:0.39562	validation_0-mlogloss:1.10577
[999]	validation_0-merror:0.38321	validation_0-mlogloss:1.04149
0:	learn: 0.3485786	test: 0.4012638	best: 0.4012638 (0)	total: 487ms	remaining: 8m 6s
bestTest = 0.5605958023
bestIteration = 110
Shrink model to first 111 iterations.
Training stacking ensemble...




KeyboardInterrupt: 

In [None]:
# Average the final predictions
final_preds /= cv.get_n_splits()

# Determine the final predicted classes
final_classes = np.argmax(final_preds, axis=1)

label_encoder.inverse_transform(final_classes)

In [None]:
# Create submission DataFrame
submission_df = pd.DataFrame({
    'id': test['id'],
    'target': label_encoder.inverse_transform(final_classes)
})

# Save submission file
submission_df.to_csv('data/submission_ensemble_tf_idf.csv', index=False)