In [1]:
!pip install mlflow boto3 awscli

Collecting mlflow
  Downloading mlflow-3.5.1-py3-none-any.whl.metadata (30 kB)
Collecting boto3
  Downloading boto3-1.40.60-py3-none-any.whl.metadata (6.6 kB)
Collecting awscli
  Downloading awscli-1.42.60-py3-none-any.whl.metadata (11 kB)
Collecting mlflow-skinny==3.5.1 (from mlflow)
  Downloading mlflow_skinny-3.5.1-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.5.1 (from mlflow)
  Downloading mlflow_tracing-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting Flask-CORS<7 (from mlflow)
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.5.1->mlflow)
  Downloading databricks_sdk-0.70.0-py3-non

In [2]:
!aws configure

AWS Access Key ID [None]: AKIA6NO2FLEIPSQV3CRG
AWS Secret Access Key [None]: 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
Default region name [None]: eu-west-1
Default output format [None]: 


In [3]:
import mlflow
# Set up the Mlflow tracing server
mlflow.set_tracking_uri("http://ec2-3-253-113-244.eu-west-1.compute.amazonaws.com:5000")

In [4]:
# Set or create an experiment
mlflow.set_experiment("Exp 6 - StackingClassifier")

2025/10/28 14:46:49 INFO mlflow.tracking.fluent: Experiment with name 'Exp 6 - StackingClassifier' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlflow-102325/287934960029108411', creation_time=1761662809998, experiment_id='287934960029108411', last_update_time=1761662809998, lifecycle_stage='active', name='Exp 6 - StackingClassifier', tags={}>

In [7]:
# importing important libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE


In [6]:
# load the preprocessed data
df = pd.read_csv('Reddit_preprocessing.csv').dropna(subset=['clean_comment'])
df.shape

(36241, 2)

Model training using Stacking

In [9]:
# Step 1: Remap the class labels from [-1, 0, 1] to [2, 0, 1]
df['category'] = df['category'].map({-1: 2, 0: 0, 1: 1})

# Step 2: Remove rows where the target labels (category) are NaN
df = df.dropna(subset=['category'])

# Trigram setting and max_features setting for the TF-IDF Vectorizer
ngram_range = (1, 3)
max_features = 1000

#  Step 3: Train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['clean_comment'], df['category'], test_size=0.2, stratify = df['category'], random_state=42)

#  Step 4: Vectorization using TF-IDF, fit on training data only
vectorizer = TfidfVectorizer(ngram_range= ngram_range, max_features= max_features)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Step 5: Handle class imbalance using SMOTE (Oversampling)
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Step 6: Function to log results in Mlflow
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test):
    with mlflow.start_run():
        # Log model type and trial number
        mlflow.set_tag("mlflow.runName", f"{model_name}_SMOTE_TFIDF_Trigrams")
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        # Log algorithm name as a parameter
        mlflow.log_param("algo_name", model_name)

        # Train model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log classification report
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Log the model
        mlflow.sklearn.log_model(model, f"{model_name}_model")

        return accuracy





In [10]:
# Step 7: Stacking Model
def stack_model():
  # Base learners
  lightgbm_model = LGBMClassifier(
      objective='multiclass',
      num_class=3,
      metric="multi_logloss",
      is_unbalance=True,
      class_weight="balanced",
      reg_alpha=0.1,  # L1 regularization
      reg_lambda=0.1,  # L2 regularization,
      learning_rate=0.08081298097796712,
      n_estimators=367,
      max_depth=20
  )

  logreg_model = LogisticRegression(max_iter=1000, class_weight='balanced', solver='lbfgs', multi_class='multinomial')

  # Meta-learner
  knn_meta_learner = KNeighborsClassifier(n_neighbors=5)

  # Create the StackingClassifier with LightGBM and LogisticRegression as base models, and KNN as meta-learner
  stacking_model = StackingClassifier(
      estimators=[
          ('lightgbm', lightgbm_model),
          ('logistic_regression', logreg_model)
      ],
      final_estimator=knn_meta_learner,
      cv=5
  )

  # Log MLflow
  accuracy = log_mlflow("StackingClassifier", stacking_model, X_train, X_test, y_train, y_test)

  return accuracy

In [11]:
# run stack experiment
stack_model()


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.193851 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64039
[LightGBM] [Info] Number of data points in the train set: 25064, number of used features: 970
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -34.538776




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.180549 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 52976
[LightGBM] [Info] Number of data points in the train set: 20051, number of used features: 968
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -34.538776




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.143606 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 52997
[LightGBM] [Info] Number of data points in the train set: 20051, number of used features: 966
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -34.538776




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.155661 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 53155
[LightGBM] [Info] Number of data points in the train set: 20051, number of used features: 966
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -34.538776




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.153095 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 53233
[LightGBM] [Info] Number of data points in the train set: 20051, number of used features: 966
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -34.538776




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.138787 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 52884
[LightGBM] [Info] Number of data points in the train set: 20052, number of used features: 968
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -34.538776




🏃 View run StackingClassifier_SMOTE_TFIDF_Trigrams at: http://ec2-3-253-113-244.eu-west-1.compute.amazonaws.com:5000/#/experiments/287934960029108411/runs/fe18578072db4878925922d30b943296
🧪 View experiment at: http://ec2-3-253-113-244.eu-west-1.compute.amazonaws.com:5000/#/experiments/287934960029108411


0.8989124621144589