In [None]:
import os
import subprocess
import sys

# Install into the interpreter that backs this kernel: a bare `pip` found on PATH
# may belong to a different environment. check_call returns 0 on success and
# raises CalledProcessError on failure instead of silently returning a non-zero
# exit status that nobody inspects.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "dagshub", "mlflow"])

0

In [6]:
import mlflow
from mlflow.tracking import MlflowClient

# Delete every run of the "Fake Review Detection" experiment on the remote
# MLflow tracking server. Failures are reported per run and do not stop the loop.
# NOTE(review): the recorded output shows HTTP 403 for each delete - presumably
# MLFLOW_TRACKING_USERNAME / MLFLOW_TRACKING_PASSWORD were not set when this
# cell ran; confirm before relying on it.
mlflow.set_tracking_uri("https://dagshub.com/malhar.c.prajapati/my-first-repo.mlflow")
client = MlflowClient()
default_experiment = client.get_experiment_by_name("Fake Review Detection")
if default_experiment is None:
    # get_experiment_by_name returns None for an unknown name; without this
    # guard the attribute access below would raise AttributeError.
    print("Experiment 'Fake Review Detection' not found; nothing to delete.")
    runs = []
else:
    runs = client.search_runs(experiment_ids=[default_experiment.experiment_id])
for run in runs:
    try:
        client.delete_run(run.info.run_id)
        print(f"Deleted run {run.info.run_id} from experiment {default_experiment.name}")
    except Exception as e:
        # Best effort: report the failure and continue with the remaining runs.
        print(f"Could not delete run {run.info.run_id}: {e}")


Could not delete run 2d0257df3f4f44ec8e7115bf59e61c85: API request to endpoint /api/2.0/mlflow/runs/delete failed with error code 403 != 200. Response body: ''
Could not delete run 51a8715a37c444a19ef3a9fc0018c6b0: API request to endpoint /api/2.0/mlflow/runs/delete failed with error code 403 != 200. Response body: ''
Could not delete run 4a870c3ae9d646909c750b4fd09c184a: API request to endpoint /api/2.0/mlflow/runs/delete failed with error code 403 != 200. Response body: ''
Could not delete run 3f12b79a5b6c46d1acb31d5b271370ab: API request to endpoint /api/2.0/mlflow/runs/delete failed with error code 403 != 200. Response body: ''
Could not delete run 9f603c8dd2874928b176cb3afcede509: API request to endpoint /api/2.0/mlflow/runs/delete failed with error code 403 != 200. Response body: ''
Could not delete run 94877f52e89442e0b16c0ace287ee091: API request to endpoint /api/2.0/mlflow/runs/delete failed with error code 403 != 200. Response body: ''
Could not delete run db45689c2a8c4640aea

In [None]:
import os
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
import mlflow
import mlflow.sklearn
import mlflow.keras
import mlflow.data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

# Set new experiment name and MLflow tracking parameters
import getpass  # only needed for the interactive token prompt below

NEW_EXPERIMENT_NAME = "Fake Review Detection 2.0"

# SECURITY: credentials are read from the environment and never stored in the
# notebook. An access token used to be hardcoded on this line; treat that token
# as compromised and revoke it in the DagsHub account settings.
os.environ.setdefault("MLFLOW_TRACKING_USERNAME", "malhar.c.prajapati")
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    # Prompt instead of falling back to a secret embedded in source control.
    os.environ["MLFLOW_TRACKING_PASSWORD"] = getpass.getpass("MLflow tracking password / access token: ")
mlflow.set_tracking_uri("https://dagshub.com/malhar.c.prajapati/my-first-repo.mlflow")
mlflow.set_experiment(NEW_EXPERIMENT_NAME)

# Define file lists
# Every dataset name is "preprocessed_<variant>_<kind>.csv"; the lists are built
# from the preprocessing variants instead of being spelled out one path at a time.
_variants = [
    "lemmatization",
    "no_stopwords",
    "no_stopwords_no_lemmatization",
    "stemming",
    "stemming_no_stopwords",
]
_embedding_kinds = ["bert", "glove", "tfidf"]

# One hand-crafted-feature CSV per preprocessing variant.
feature_files = [
    f"../Data/Feature-Engineered/preprocessed_{variant}_features.csv"
    for variant in _variants
]

# One CSV per (variant, embedding) pair, kept in alphabetical path order.
embedding_files = sorted(
    f"../../embeddings/preprocessed_{variant}_{kind}.csv"
    for variant in _variants
    for kind in _embedding_kinds
)

# Feature files first, then embedding files.
files = feature_files + embedding_files

# Define ML models and their parameter grids
# Maps a display name to (estimator class, grid handed to GridSearchCV).
# Insertion order is the order in which the estimators are tuned per file.
models = dict(
    LogisticRegression=(
        LogisticRegression,
        dict(C=[0.1, 1], solver=["liblinear"], max_iter=[100]),
    ),
    RandomForest=(
        RandomForestClassifier,
        dict(n_estimators=[50, 100], max_depth=[None, 10]),
    ),
    SVC=(
        SVC,
        dict(C=[0.1, 1], kernel=["linear"]),
    ),
)

# Resume support: run keys already present in the progress log are collected
# here so the experiment loops can skip them.
progress_file = "progress_log.csv"
processed_keys = set()
if os.path.exists(progress_file):
    dfp = pd.read_csv(progress_file)
    processed_keys.update(dfp["run_key"].tolist())

def log_confusion_matrix(y_true, y_pred, run_key, prefix):
    """Plot a confusion-matrix heatmap and attach it to the active MLflow run.

    The figure is written to ``../Reports/confusion_matrix_{prefix}_{run_key}.png``
    and then uploaded with ``mlflow.log_artifact``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        run_key: Run identifier embedded in the image file name.
        prefix: Short tag embedded in the image file name (callers pass
            ``"ML"`` or ``"DL"``).
    """
    cm = confusion_matrix(y_true, y_pred)
    cm_path = f"../Reports/confusion_matrix_{prefix}_{run_key}.png"
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs(os.path.dirname(cm_path), exist_ok=True)
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        fig.savefig(cm_path)
        mlflow.log_artifact(cm_path)
    finally:
        # Always release the figure, even when saving or uploading fails;
        # this function is called once per run inside long loops.
        plt.close(fig)

def log_dataset(df, source_file):
    """Record the dataset used by the active MLflow run.

    The frame is wrapped with ``mlflow.data.from_pandas`` and logged as a run
    input via ``mlflow.log_input``. If that fails for any reason, a warning is
    emitted and the source CSV itself is uploaded under the ``dataset_csv``
    artifact path instead (best effort, never aborts the run).

    Note: the original implementation called ``mlflow.data.log_dataset``, which
    is not part of the MLflow API, so the failure path ran on every call.

    Args:
        df: pandas DataFrame holding the dataset.
        source_file: Path of the CSV the frame was read from.
    """
    try:
        ds = mlflow.data.from_pandas(df, source=source_file, name="embedding_data")
        mlflow.log_input(ds)
    except Exception as exc:
        # Deliberately broad: dataset logging must not abort an experiment run,
        # but the reason for falling back should be visible.
        warnings.warn(
            f"Dataset logging failed ({exc}); uploading {source_file} as an artifact instead."
        )
        mlflow.log_artifact(source_file, artifact_path="dataset_csv")

# ------------------------------
# Traditional ML Experiments (applied to all files)
# ------------------------------
# For every existing input file: build X / y, split 80/20, grid-search each
# estimator in `models`, log the best one to MLflow and append a row to the
# progress log so an interrupted session can resume.

# Hand-crafted numeric columns used for the feature-engineered files.
numeric_cols = ["lexical_diversity", "avg_word_length", "sentiment_polarity",
                "subjectivity", "flesch_reading_ease", "sentence_length",
                "named_entity_count", "noun_count", "verb_count", "adj_count", "adv_count"]

for f in files:
    if not os.path.exists(f):
        continue
    df = pd.read_csv(f)
    if "label" not in df.columns:
        continue
    df = df.dropna()
    y = df["label"].values
    if y.dtype == object:
        le = LabelEncoder()
        y = le.fit_transform(y)
    # For feature files with computed metrics, use numeric columns; otherwise use all columns except label.
    uses_engineered_features = (
        f in feature_files
        and "processed_text" in df.columns
        and "lexical_diversity" in df.columns
    )
    if uses_engineered_features:
        available_cols = [col for col in numeric_cols if col in df.columns]
        X = df[available_cols].values
    else:
        X = df.drop(columns=["label"]).values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    if uses_engineered_features:
        # Fit the scaler on the training rows only; fitting before the split
        # would leak test-set statistics into training.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
    name_prefix = os.path.basename(f)
    for m_name, (Cls, param_grid) in models.items():
        run_key = f"{name_prefix}_{m_name}_ML"
        if run_key in processed_keys:
            continue
        # The context manager ends the run on exit; no explicit end_run() needed.
        with mlflow.start_run(run_name=run_key):
            mlflow.log_param("file_name", f)
            mlflow.log_param("model_type", m_name)
            log_dataset(df, f)
            gs = GridSearchCV(Cls(), param_grid, cv=3, scoring="accuracy", n_jobs=1)
            gs.fit(X_train, y_train)
            best_model = gs.best_estimator_
            preds = best_model.predict(X_test)
            acc = accuracy_score(y_test, preds)
            prec = precision_score(y_test, preds, average="weighted")
            rec = recall_score(y_test, preds, average="weighted")
            f1 = f1_score(y_test, preds, average="weighted")
            mlflow.log_params(gs.best_params_)
            mlflow.log_metric("accuracy", acc)
            mlflow.log_metric("precision", prec)
            mlflow.log_metric("recall", rec)
            mlflow.log_metric("f1_score", f1)
            mlflow.sklearn.log_model(best_model, f"{m_name}_Model")
            log_confusion_matrix(y_test, preds, run_key, "ML")
        new_row = [run_key, f, m_name, acc, prec, rec, f1]
        dfp_new = pd.DataFrame([new_row], columns=["run_key", "File", "Model", "Accuracy", "Precision", "Recall", "F1"])
        # Append to the log; the header is written only when the file is created.
        dfp_new.to_csv(progress_file, mode="a", index=False, header=not os.path.exists(progress_file))
        processed_keys.add(run_key)

# ------------------------------
# DL Experiments for Feature Files (using tokenization)
# ------------------------------
# For every feature file: tokenize `processed_text`, pad the sequences, split
# 80/20 and train two recurrent classifiers (2-layer LSTM, bidirectional LSTM),
# logging each one as its own MLflow run.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 10000  # tokenizer vocabulary cap, also the Embedding input_dim
max_length = 200    # sequences are padded / truncated to this many tokens


def _build_two_layer_lstm(num_classes):
    """Return the compiled 2-layer LSTM classifier with `num_classes` softmax outputs."""
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length))
    model.add(LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
    model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(64, activation="relu"))
    model.add(Dropout(0.2))
    model.add(Dense(num_classes, activation="softmax"))
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model


def _build_bidirectional_lstm(num_classes):
    """Return the compiled bidirectional-LSTM classifier with `num_classes` softmax outputs."""
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length))
    model.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
    model.add(Dense(64, activation="relu"))
    model.add(Dropout(0.2))
    model.add(Dense(num_classes, activation="softmax"))
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model


for f in feature_files:
    if not os.path.exists(f):
        continue
    df = pd.read_csv(f)
    if "label" not in df.columns or "processed_text" not in df.columns:
        continue
    df = df.dropna()
    y = df["label"].values
    if y.dtype == object:
        le = LabelEncoder()
        y = le.fit_transform(y)
    texts = df["processed_text"].fillna("").astype(str).tolist()
    tokenizer_obj = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
    tokenizer_obj.fit_on_texts(texts)
    sequences = tokenizer_obj.texts_to_sequences(texts)
    X_text = pad_sequences(sequences, maxlen=max_length, padding="post", truncating="post")
    X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(X_text, y, test_size=0.2, random_state=42)
    num_classes = len(np.unique(y))
    # Derive the run-key suffix from THIS file. Previously `name_prefix` was
    # whatever an earlier loop left behind, so every feature file produced the
    # same run key and all but the first were skipped as "already processed".
    name_prefix = os.path.basename(f)

    # (run key, model_type param / progress label, MLflow artifact name, model builder)
    dl_experiments = [
        (f"DL_2LSTM_{name_prefix}", "2-Layer LSTM", "2LSTM_Model", _build_two_layer_lstm),
        (f"DL_BiLSTM_{name_prefix}", "Bidirectional LSTM", "BiLSTM_Model", _build_bidirectional_lstm),
    ]
    for dl_run_key, model_label, artifact_name, build_model in dl_experiments:
        if dl_run_key in processed_keys:
            continue
        # The context manager ends the run on exit; no explicit end_run() needed.
        with mlflow.start_run(run_name=dl_run_key):
            mlflow.log_param("file_name", f)
            mlflow.log_param("model_type", model_label)
            log_dataset(df, f)
            model_dl = build_model(num_classes)
            model_dl.fit(X_train_dl, y_train_dl, epochs=3, batch_size=32, validation_split=0.1, verbose=0)
            loss_dl, acc_dl = model_dl.evaluate(X_test_dl, y_test_dl, verbose=0)
            preds_dl = model_dl.predict(X_test_dl).argmax(axis=1)
            prec_dl = precision_score(y_test_dl, preds_dl, average="weighted")
            rec_dl = recall_score(y_test_dl, preds_dl, average="weighted")
            f1_dl = f1_score(y_test_dl, preds_dl, average="weighted")
            mlflow.log_metric("accuracy", acc_dl)
            mlflow.log_metric("precision", prec_dl)
            mlflow.log_metric("recall", rec_dl)
            mlflow.log_metric("f1_score", f1_dl)
            mlflow.keras.log_model(model_dl, artifact_name)
            log_confusion_matrix(y_test_dl, preds_dl, dl_run_key, "DL")
        new_row = [dl_run_key, f, model_label, acc_dl, prec_dl, rec_dl, f1_dl]
        dfp_new = pd.DataFrame([new_row], columns=["run_key", "File", "Model", "Accuracy", "Precision", "Recall", "F1"])
        # Append to the log; the header is written only when the file is created.
        dfp_new.to_csv(progress_file, mode="a", index=False, header=not os.path.exists(progress_file))
        processed_keys.add(dl_run_key)

# ------------------------------
# DL Experiments for Embedding Files (using precomputed embeddings)
# ------------------------------
# For every embedding file: split 80/20, standardize, train a small dense
# network and log it as its own MLflow run.
for f in embedding_files:
    if not os.path.exists(f):
        continue
    df = pd.read_csv(f)
    if "label" not in df.columns:
        continue
    df = df.dropna()
    y = df["label"].values
    if y.dtype == object:
        le = LabelEncoder()
        y = le.fit_transform(y)
    # Assume precomputed embeddings are all numeric features except the label.
    X = df.drop(columns=["label"]).values
    X_train_e, X_test_e, y_train_e, y_test_e = train_test_split(X, y, test_size=0.2, random_state=42)
    # Fit the scaler on the training rows only; fitting before the split would
    # leak test-set statistics into training.
    scaler = StandardScaler()
    X_train_e = scaler.fit_transform(X_train_e)
    X_test_e = scaler.transform(X_test_e)
    name_prefix = os.path.basename(f)
    dl_run_key_dense = f"DL_Dense_{name_prefix}"
    if dl_run_key_dense in processed_keys:
        continue
    # The context manager ends the run on exit; no explicit end_run() needed.
    with mlflow.start_run(run_name=dl_run_key_dense):
        mlflow.log_param("file_name", f)
        mlflow.log_param("model_type", "Dense NN on Embeddings")
        log_dataset(df, f)
        input_dim = X.shape[1]
        model_dense = Sequential()
        model_dense.add(Dense(128, activation="relu", input_dim=input_dim))
        model_dense.add(Dropout(0.2))
        model_dense.add(Dense(64, activation="relu"))
        model_dense.add(Dropout(0.2))
        # One softmax unit per distinct label value.
        model_dense.add(Dense(len(np.unique(y)), activation="softmax"))
        model_dense.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
        model_dense.fit(X_train_e, y_train_e, epochs=3, batch_size=32, validation_split=0.1, verbose=0)
        loss_dense, acc_dense = model_dense.evaluate(X_test_e, y_test_e, verbose=0)
        preds_dense = model_dense.predict(X_test_e).argmax(axis=1)
        prec_dense = precision_score(y_test_e, preds_dense, average="weighted")
        rec_dense = recall_score(y_test_e, preds_dense, average="weighted")
        f1_dense = f1_score(y_test_e, preds_dense, average="weighted")
        mlflow.log_metric("accuracy", acc_dense)
        mlflow.log_metric("precision", prec_dense)
        mlflow.log_metric("recall", rec_dense)
        mlflow.log_metric("f1_score", f1_dense)
        mlflow.keras.log_model(model_dense, "DenseNN_Model")
        log_confusion_matrix(y_test_e, preds_dense, dl_run_key_dense, "DL")
    new_row = [dl_run_key_dense, f, "Dense NN on Embeddings", acc_dense, prec_dense, rec_dense, f1_dense]
    dfp_new = pd.DataFrame([new_row], columns=["run_key", "File", "Model", "Accuracy", "Precision", "Recall", "F1"])
    # Append to the log; the header is written only when the file is created.
    dfp_new.to_csv(progress_file, mode="a", index=False, header=not os.path.exists(progress_file))
    processed_keys.add(dl_run_key_dense)

print("All experiments completed.")




🏃 View run preprocessed_lemmatization_features.csv_Tfidf_LogisticRegression at: https://dagshub.com/malhar.c.prajapati/my-first-repo.mlflow/#/experiments/1/runs/4a870c3ae9d646909c750b4fd09c184a
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/my-first-repo.mlflow/#/experiments/1




🏃 View run preprocessed_lemmatization_features.csv_Tfidf_RandomForest at: https://dagshub.com/malhar.c.prajapati/my-first-repo.mlflow/#/experiments/1/runs/51a8715a37c444a19ef3a9fc0018c6b0
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/my-first-repo.mlflow/#/experiments/1
