In [0]:
spark.conf.set(
    "fs.azure.account.key.toxicitylake7032.dfs.core.windows.net",
    "aint no way im pushing a key"
)

In [0]:

from pyspark.sql import functions as F

gold_path = "abfss://lakehouse@toxicitylake7032.dfs.core.windows.net/gold/reddit_features/"

df_gold = spark.read.parquet(gold_path)
df_gold.printSchema()
df_gold.show(5, truncate=False)

root
 |-- post_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- parent_user_id: string (nullable = true)
 |-- platform: string (nullable = true)
 |-- community: string (nullable = true)
 |-- interaction_type: string (nullable = true)
 |-- publish_ts: timestamp (nullable = true)
 |-- publish_date_only: date (nullable = true)
 |-- content: string (nullable = true)
 |-- url: string (nullable = true)
 |-- sentiment_vader: double (nullable = true)
 |-- sentiment_textblob: double (nullable = true)
 |-- subjectivity_textblob: double (nullable = true)
 |-- toxicity_toxigen: double (nullable = true)
 |-- strict_filter: integer (nullable = true)
 |-- content_length_chars: integer (nullable = true)
 |-- content_length_words: integer (nullable = true)
 |-- toxicity_label: integer (nullable = true)
 |-- sentiment_bucket: string (nullable = true)
 |-- risk_level: string (nullable = true)

+------------------------------------+---------

In [0]:
# ----------------------------------------
# 2) Select training columns + sample
# ----------------------------------------
df_train = df_gold.select(
    "content",
    "content_length_words",
    "content_length_chars",
    "sentiment_vader",
    "sentiment_textblob",
    "subjectivity_textblob",
    "toxicity_label"
).dropna()

# SAMPLE to speed up training (30% of data)
df_train = df_train.sample(withReplacement=False, fraction=0.3, seed=42)

print("Sampled row count:", df_train.count())
df_train.show(5, truncate=False)

Sampled row count: 2821948
+-----------------------------------------------------------------------+--------------------+--------------------+---------------+--------------------+---------------------+--------------+
|content                                                                |content_length_words|content_length_chars|sentiment_vader|sentiment_textblob  |subjectivity_textblob|toxicity_label|
+-----------------------------------------------------------------------+--------------------+--------------------+---------------+--------------------+---------------------+--------------+
|First post ( Õ°¬∞ Õú ñ Õ°¬∞)                                                 |6                   |22                  |0.0            |0.25                |0.3333333333333333   |0             |
|Star Trek / Doctor Who Mashup...                                       |6                   |32                  |0.0            |0.0                 |0.0                  |0             |
|"It's much easie

In [0]:
# ----------------------------------------
# 3) Convert to pandas
# ----------------------------------------
import pandas as pd

pdf = df_train.toPandas()
print("Pandas shape:", pdf.shape)

Pandas shape: (2821948, 7)


In [0]:
# ----------------------------------------
# 4) Build sklearn pipeline
# ----------------------------------------
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

text_features = "content"
numeric_features = [
    "content_length_words",
    "content_length_chars",
    "sentiment_vader",
    "sentiment_textblob",
    "subjectivity_textblob"
]

preprocess = ColumnTransformer(
    transformers=[
        (
            "text",
            TfidfVectorizer(
                max_features=20000,   # unigrams only for speed
                ngram_range=(1, 1),
                stop_words="english"
            ),
            text_features
        ),
        ("num", StandardScaler(), numeric_features)
    ]
)

model = Pipeline([
    ("preprocess", preprocess),
    ("clf", LogisticRegression(max_iter=2000))
])

In [0]:
# ----------------------------------------
# 5) Train/test split
# ----------------------------------------
from sklearn.model_selection import train_test_split

X = pdf  # ColumnTransformer will pick the right columns by name
y = pdf["toxicity_label"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y  # keeps class balance
)

print("Train size:", X_train.shape, "Test size:", X_test.shape)

Train size: (2257558, 7) Test size: (564390, 7)


In [0]:
# ----------------------------------------
# 6) Train model + evaluate metrics
# ----------------------------------------
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    classification_report
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]  # assumes binary 0/1 label

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="binary")
precision = precision_score(y_test, y_pred, average="binary")
recall = recall_score(y_test, y_pred, average="binary")

try:
    roc_auc = roc_auc_score(y_test, y_proba)
except ValueError:
    roc_auc = None  # if only one class present in y_test

print("=== Evaluation on Test Set ===")
print("Accuracy :", accuracy)
print("F1       :", f1)
print("Precision:", precision)
print("Recall   :", recall)
print("ROC AUC  :", roc_auc)
print("\nClassification report:")
print(classification_report(y_test, y_pred))


üèÉ View run intelligent-asp-514 at: https://adb-3470248681765266.6.azuredatabricks.net/ml/experiments/439462251615551/runs/3a52dc982727498c93d4078e130dec09
üß™ View experiment at: https://adb-3470248681765266.6.azuredatabricks.net/ml/experiments/439462251615551
=== Evaluation on Test Set ===
Accuracy : 0.7922801608816599
F1       : 0.42406229274643215
Precision: 0.6607066315596105
Recall   : 0.3122309756856277
ROC AUC  : 0.7730654979253081

Classification report:
              precision    recall  f1-score   support

           0       0.81      0.95      0.87    426159
           1       0.66      0.31      0.42    138231

    accuracy                           0.79    564390
   macro avg       0.74      0.63      0.65    564390
weighted avg       0.77      0.79      0.76    564390



In [0]:
# ----------------------------------------
# 7) Log everything to MLflow
# ----------------------------------------
import mlflow
import mlflow.sklearn

# Optional: auto-log (parameters, metrics, model, etc.)
# mlflow.sklearn.autolog()

# Optional: set a specific experiment
# mlflow.set_experiment("/Users/<your_user>/reddit_toxicity_experiment")

with mlflow.start_run(run_name="reddit_toxicity_model"):
    # If you didn't call autolog(), you should manually log metrics/model:
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1", f1)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    if roc_auc is not None:
        mlflow.log_metric("roc_auc", roc_auc)

    # You can also log some basic params:
    mlflow.log_param("tfidf_max_features", 20000)
    mlflow.log_param("clf", "LogisticRegression")
    mlflow.log_param("test_size", 0.2)

    # Log the trained sklearn pipeline as a model artifact
    mlflow.sklearn.log_model(model, artifact_path="model")

    run_id = mlflow.active_run().info.run_id

print("\nLogged run to MLflow with run_id:", run_id)



üèÉ View run reddit_toxicity_model at: https://adb-3470248681765266.6.azuredatabricks.net/ml/experiments/439462251615551/runs/a0d07d615b264f39ad0a8a2a91a06e7f
üß™ View experiment at: https://adb-3470248681765266.6.azuredatabricks.net/ml/experiments/439462251615551

Logged run to MLflow with run_id: a0d07d615b264f39ad0a8a2a91a06e7f
