In [None]:
import os
import subprocess
import sys

# Install through the interpreter that is running this kernel so the packages
# land in the environment the notebook actually imports from; a bare `pip` on
# PATH can belong to a different Python installation.
# The trailing expression keeps the cell's displayed value: the installer's
# exit code (0 on success).
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q", "dagshub", "mlflow"]
).returncode

0

In [6]:
import mlflow
from mlflow.tracking import MlflowClient

# Clean-up cell: try to delete every run of the "Fake Review Detection"
# experiment on the DagsHub-hosted MLflow server.
mlflow.set_tracking_uri("https://dagshub.com/malhar.c.prajapati/my-first-repo.mlflow")
client = MlflowClient()
default_experiment = client.get_experiment_by_name("Fake Review Detection")

if default_experiment is None:
    # get_experiment_by_name returns None when no experiment has that name;
    # without this guard the attribute access below would raise AttributeError.
    print("Experiment 'Fake Review Detection' not found; nothing to delete")
    runs = []
else:
    runs = client.search_runs(experiment_ids=[default_experiment.experiment_id])

# NOTE(review): the recorded output of this cell shows HTTP 403 for every
# delete request. No tracking credentials are configured in this cell, which
# may be the cause — confirm that authentication is set up before running it.
for run in runs:
    try:
        client.delete_run(run.info.run_id)
        print(f"Deleted run {run.info.run_id} from experiment {default_experiment.name}")
    except Exception as e:
        # Best effort: report the failure and keep going with the remaining runs.
        print(f"Could not delete run {run.info.run_id}: {e}")


Could not delete run 2d0257df3f4f44ec8e7115bf59e61c85: API request to endpoint /api/2.0/mlflow/runs/delete failed with error code 403 != 200. Response body: ''
Could not delete run 51a8715a37c444a19ef3a9fc0018c6b0: API request to endpoint /api/2.0/mlflow/runs/delete failed with error code 403 != 200. Response body: ''
Could not delete run 4a870c3ae9d646909c750b4fd09c184a: API request to endpoint /api/2.0/mlflow/runs/delete failed with error code 403 != 200. Response body: ''
Could not delete run 3f12b79a5b6c46d1acb31d5b271370ab: API request to endpoint /api/2.0/mlflow/runs/delete failed with error code 403 != 200. Response body: ''
Could not delete run 9f603c8dd2874928b176cb3afcede509: API request to endpoint /api/2.0/mlflow/runs/delete failed with error code 403 != 200. Response body: ''
Could not delete run 94877f52e89442e0b16c0ace287ee091: API request to endpoint /api/2.0/mlflow/runs/delete failed with error code 403 != 200. Response body: ''
Could not delete run db45689c2a8c4640aea

In [None]:
import os
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
import mlflow
import mlflow.sklearn
import mlflow.keras
import mlflow.data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

import getpass

# Tracking credentials come from the environment. The access token must never
# be written into the notebook: anything committed here is readable by anyone
# with access to the file. (The token that used to be hard-coded on this line
# should be treated as exposed and revoked.)
os.environ.setdefault("MLFLOW_TRACKING_USERNAME", "malhar.c.prajapati")
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    # Not provided via the environment: ask interactively without echoing it.
    os.environ["MLFLOW_TRACKING_PASSWORD"] = getpass.getpass("DagsHub access token: ")

mlflow.set_tracking_uri("https://dagshub.com/malhar.c.prajapati/my-first-repo.mlflow")
mlflow.set_experiment("Fake Review Detection")

# Feature-engineered tables, one CSV per text-preprocessing variant.
feature_files = [
    "../Data/Feature-Engineered/preprocessed_lemmatization_features.csv",
    "../Data/Feature-Engineered/preprocessed_no_stopwords_features.csv",
    "../Data/Feature-Engineered/preprocessed_no_stopwords_no_lemmatization_features.csv",
    "../Data/Feature-Engineered/preprocessed_stemming_features.csv",
    "../Data/Feature-Engineered/preprocessed_stemming_no_stopwords_features.csv",
]

# Embedding tables: each preprocessing variant paired with bert / glove / tfidf.
embedding_files = [
    # lemmatization
    "../../embeddings/preprocessed_lemmatization_bert.csv",
    "../../embeddings/preprocessed_lemmatization_glove.csv",
    "../../embeddings/preprocessed_lemmatization_tfidf.csv",
    # no_stopwords (with and without lemmatization)
    "../../embeddings/preprocessed_no_stopwords_bert.csv",
    "../../embeddings/preprocessed_no_stopwords_glove.csv",
    "../../embeddings/preprocessed_no_stopwords_no_lemmatization_bert.csv",
    "../../embeddings/preprocessed_no_stopwords_no_lemmatization_glove.csv",
    "../../embeddings/preprocessed_no_stopwords_no_lemmatization_tfidf.csv",
    "../../embeddings/preprocessed_no_stopwords_tfidf.csv",
    # stemming (with and without stop-word removal)
    "../../embeddings/preprocessed_stemming_bert.csv",
    "../../embeddings/preprocessed_stemming_glove.csv",
    "../../embeddings/preprocessed_stemming_no_stopwords_bert.csv",
    "../../embeddings/preprocessed_stemming_no_stopwords_glove.csv",
    "../../embeddings/preprocessed_stemming_no_stopwords_tfidf.csv",
    "../../embeddings/preprocessed_stemming_tfidf.csv",
]

# Everything the experiment loop iterates over: feature tables first, then embeddings.
files = [*feature_files, *embedding_files]

# Registry of classical models: display name -> (estimator class, grid-search
# parameter grid). Entries are added in the order they should be trained.
models = {}
models["LogisticRegression"] = (
    LogisticRegression,
    dict(C=[0.1, 1], solver=["liblinear"], max_iter=[100]),
)
models["RandomForest"] = (
    RandomForestClassifier,
    dict(n_estimators=[50, 100], max_depth=[None, 10]),
)
models["SVC"] = (
    SVC,
    dict(C=[0.1, 1], kernel=["linear"]),
)

# Resume support: run keys already written to the progress log are collected
# so that the training loop can skip them.
progress_file = "progress_log.csv"
processed_keys = set()
if os.path.exists(progress_file):
    dfp = pd.read_csv(progress_file)
    processed_keys.update(dfp["run_key"].tolist())

def log_cm(y_true, y_pred, run_key, prefix):
    """Plot a confusion matrix, save it as a PNG and log it to the active MLflow run.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        run_key: Run identifier; becomes part of the PNG file name.
        prefix: Tag placed before ``run_key`` in the file name.

    The image is written to ``../Reports/confusion_matrix_{prefix}_{run_key}.png``
    and then uploaded with ``mlflow.log_artifact``.
    """
    cm = confusion_matrix(y_true, y_pred)
    cm_path = f"../Reports/confusion_matrix_{prefix}_{run_key}.png"
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs(os.path.dirname(cm_path), exist_ok=True)

    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        fig.savefig(cm_path)
        mlflow.log_artifact(cm_path)
    finally:
        # Always release the figure, even if saving or uploading fails; this
        # function is called once per run inside a long loop.
        plt.close(fig)

# Column layout of the progress log (one row per finished run).
PROGRESS_COLUMNS = ["run_key", "File", "Model", "Accuracy", "Precision", "Recall", "F1"]


def _log_dataset(frame, file_path):
    """Attach the dataset to the active MLflow run; upload the CSV if that fails."""
    try:
        dataset = mlflow.data.from_pandas(frame, source=file_path, name="embedding_data")
        # mlflow.log_input is the documented call for recording a dataset on a
        # run. The previous call, mlflow.data.log_dataset, does not appear in
        # MLflow's documented dataset API, so it presumably always raised and
        # ended up in the fallback below.
        mlflow.log_input(dataset, context="training")
    except Exception as exc:
        # Best effort: keep the run alive and store the raw file instead.
        print(f"Dataset logging failed ({exc}); uploading {file_path} as an artifact instead")
        mlflow.log_artifact(file_path, artifact_path="dataset_csv")


def _record_progress(run_key, file_path, model_label, accuracy, precision, recall, f1):
    """Append one finished run to the progress CSV and mark its key as processed."""
    row = pd.DataFrame(
        [[run_key, file_path, model_label, accuracy, precision, recall, f1]],
        columns=PROGRESS_COLUMNS,
    )
    # The header is written only when the file is being created.
    row.to_csv(progress_file, mode="a", index=False, header=not os.path.exists(progress_file))
    processed_keys.add(run_key)


# For every input table: grid-search each classical model, then train a
# 2-layer LSTM, logging each as its own MLflow run. Run keys already present
# in the progress log are skipped so an interrupted session can be resumed.
for f in files:
    if not os.path.exists(f):
        print(f"Skipping {f}: file not found")
        continue

    name_prefix = os.path.basename(f)
    dl_run_key = "DL_2LSTM_" + name_prefix
    all_run_keys = [name_prefix + "_" + m_name for m_name in models] + [dl_run_key]
    if all(key in processed_keys for key in all_run_keys):
        # Nothing left to do for this file; avoid re-reading a large CSV.
        continue

    df = pd.read_csv(f)
    if "label" not in df.columns:
        print(f"Skipping {f}: no 'label' column")
        continue

    y = df["label"].values
    if y.dtype == object:
        # String labels are mapped to integer class ids.
        le = LabelEncoder()
        y = le.fit_transform(y)
    X = df.drop(columns=["label"]).values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # --- Classical models (grid search, 3-fold CV on the training split) ---
    for m_name, (Cls, param_grid) in models.items():
        run_key = name_prefix + "_" + m_name
        if run_key in processed_keys:
            continue
        # The context manager ends the run on exit, so no explicit end_run() is needed.
        with mlflow.start_run(run_name=run_key):
            mlflow.log_param("file_name", f)
            mlflow.log_param("model_type", m_name)
            _log_dataset(df, f)
            gs = GridSearchCV(Cls(), param_grid, cv=3, scoring="accuracy", n_jobs=1)
            gs.fit(X_train, y_train)
            best_model = gs.best_estimator_
            p = best_model.predict(X_test)
            a = accuracy_score(y_test, p)
            pr = precision_score(y_test, p, average="weighted")
            r = recall_score(y_test, p, average="weighted")
            f1 = f1_score(y_test, p, average="weighted")
            mlflow.log_params(gs.best_params_)
            mlflow.log_metric("accuracy", a)
            mlflow.log_metric("precision", pr)
            mlflow.log_metric("recall", r)
            mlflow.log_metric("f1_score", f1)
            mlflow.sklearn.log_model(best_model, m_name + "_Model")
            log_cm(y_test, p, run_key, "ML")
        _record_progress(run_key, f, m_name, a, pr, r, f1)

    # --- Deep model: 2-layer LSTM ---
    if dl_run_key in processed_keys:
        continue
    num_classes = len(np.unique(y))
    with mlflow.start_run(run_name=dl_run_key):
        mlflow.log_param("file_name", f)
        mlflow.log_param("model_type", "2-Layer LSTM")
        _log_dataset(df, f)
        model = Sequential()
        # NOTE(review): an Embedding layer looks up rows by non-negative integer
        # index below input_dim. X is taken straight from the CSV columns here;
        # confirm these are token ids rather than continuous feature values.
        model.add(Embedding(input_dim=X.shape[1], output_dim=128, input_length=X.shape[1]))
        model.add(LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
        model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
        model.add(Dense(64, activation="relu"))
        model.add(Dropout(0.2))
        model.add(Dense(num_classes, activation="softmax"))
        model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
        model.fit(X_train, y_train, epochs=3, batch_size=32, validation_split=0.1, verbose=0)
        loss, acc = model.evaluate(X_test, y_test, verbose=0)
        preds_prob = model.predict(X_test)
        preds = preds_prob.argmax(axis=1)
        prec = precision_score(y_test, preds, average="weighted")
        rec = recall_score(y_test, preds, average="weighted")
        f1 = f1_score(y_test, preds, average="weighted")
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("precision", prec)
        mlflow.log_metric("recall", rec)
        mlflow.log_metric("f1_score", f1)
        mlflow.keras.log_model(model, "2LSTM_Model")
        log_cm(y_test, preds, dl_run_key, "DL")
    _record_progress(dl_run_key, f, "2-Layer LSTM", acc, prec, rec, f1)




🏃 View run preprocessed_lemmatization_features.csv_Tfidf_LogisticRegression at: https://dagshub.com/malhar.c.prajapati/my-first-repo.mlflow/#/experiments/1/runs/4a870c3ae9d646909c750b4fd09c184a
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/my-first-repo.mlflow/#/experiments/1




🏃 View run preprocessed_lemmatization_features.csv_Tfidf_RandomForest at: https://dagshub.com/malhar.c.prajapati/my-first-repo.mlflow/#/experiments/1/runs/51a8715a37c444a19ef3a9fc0018c6b0
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/my-first-repo.mlflow/#/experiments/1
