In [None]:
import os
import subprocess
import sys

# Install the tracking dependencies into the interpreter that backs this kernel.
# Running `python -m pip` through sys.executable avoids picking up a `pip` from a
# different environment on PATH, and check_call raises CalledProcessError when the
# install fails instead of returning a non-zero status that is easy to overlook.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "dagshub", "mlflow"])

0

In [18]:
import mlflow
from mlflow.tracking import MlflowClient

# Housekeeping: delete the runs currently stored under the "Fake Review Detection"
# experiment on the tracking server, printing one line per deleted run.
mlflow.set_tracking_uri("https://dagshub.com/malhar.c.prajapati/my-first-repo.mlflow")
client = MlflowClient()
default_experiment = client.get_experiment_by_name("Fake Review Detection")

if default_experiment is None:
    # The lookup yields None when no experiment has this name (e.g. on a fresh
    # tracking server); there is nothing to delete in that case.
    runs = []
    print("Experiment 'Fake Review Detection' not found; nothing to delete.")
else:
    # NOTE(review): search_runs returns a single page of results; if the experiment
    # can hold more runs than one page, re-run this cell until nothing is printed.
    runs = client.search_runs(experiment_ids=[default_experiment.experiment_id])

for run in runs:
    client.delete_run(run.info.run_id)
    print(f"Deleted run {run.info.run_id} from experiment {default_experiment.name}")


Deleted run fa710c3a46af46968df305121281663e from experiment Fake Review Detection


In [None]:
import os
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
import mlflow
import mlflow.sklearn
import mlflow.keras
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# MLflow destination for every run logged below: the DagsHub-hosted tracking
# endpoint and the experiment the runs are grouped under.
TRACKING_URI = "https://dagshub.com/malhar.c.prajapati/my-first-repo.mlflow"
EXPERIMENT_NAME = "Fake Review Detection"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# Feature-engineered input CSVs, one per text-preprocessing variant. Both training
# loops below iterate over this list in order.
_FEATURE_DIR = "../Data/Feature-Engineered/"
files = [
    _FEATURE_DIR + csv_name
    for csv_name in (
        "preprocessed_lemmatization_features.csv",
        "preprocessed_no_stopwords_features.csv",
        "preprocessed_no_stopwords_no_lemmatization_features.csv",
        "preprocessed_stemming_features.csv",
        "preprocessed_stemming_no_stopwords_features.csv",
    )
]

# Text vectorizers to compare, keyed by the name logged as the "embedding" param.
# Both are configured with max_features=5000.
embeddings = dict(
    Tfidf=TfidfVectorizer(max_features=5000),
    Count=CountVectorizer(max_features=5000),
)

# Hyper-parameter grids searched by GridSearchCV, one per classifier.
_logreg_grid = {
    "C": [0.01, 0.1, 1, 10],
    "solver": ["liblinear", "lbfgs"],
    "max_iter": [100, 200, 500],
}
_forest_grid = {
    "n_estimators": [50, 100],
    "max_depth": [None, 10, 20],
}
_svc_grid = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf"],
}

# Maps the logged model name to (estimator class, parameter grid). The class is
# instantiated with default arguments inside the training loop.
models = {
    "LogisticRegression": (LogisticRegression, _logreg_grid),
    "RandomForest": (RandomForestClassifier, _forest_grid),
    "SVC": (SVC, _svc_grid),
}

# Classical-ML sweep: for every dataset x vectorizer x classifier combination, run a
# 5-fold grid search on the training split, evaluate the best estimator on the
# held-out split, and log params, metrics, the model and a confusion-matrix image
# to MLflow. One summary row per combination is collected in `results_list`.
results_list = []

# Numeric columns that are standardised and appended to the text vectors.
numeric_columns = [
    "lexical_diversity",
    "avg_word_length",
    "sentiment_polarity",
    "subjectivity",
    "flesch_reading_ease",
    "sentence_length",
    "named_entity_count",
    "noun_count",
    "verb_count",
    "adj_count",
    "adv_count",
]

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("../Reports", exist_ok=True)

for f in files:
    df = pd.read_csv(f)
    df.dropna(inplace=True)
    le = LabelEncoder()
    df["label_encoded"] = le.fit_transform(df["label"])
    text_features = df["processed_text"]
    numeric_features = df[numeric_columns]
    y = df["label_encoded"]

    # Split row positions first so that every fitted transformer (scaler,
    # vectorizer) only ever sees training rows; fitting them on the full frame
    # would leak test-set statistics into the features.
    train_idx, test_idx = train_test_split(
        np.arange(len(df)), test_size=0.2, random_state=42
    )
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    scaler = StandardScaler()
    X_numeric_train = scaler.fit_transform(numeric_features.iloc[train_idx])
    X_numeric_test = scaler.transform(numeric_features.iloc[test_idx])

    for emb_name, vectorizer in embeddings.items():
        # fit_transform re-fits the shared vectorizer instance for this dataset.
        X_text_train = vectorizer.fit_transform(text_features.iloc[train_idx]).toarray()
        X_text_test = vectorizer.transform(text_features.iloc[test_idx]).toarray()
        X_train = np.hstack((X_text_train, X_numeric_train)).astype(np.float32)
        X_test = np.hstack((X_text_test, X_numeric_test)).astype(np.float32)

        for model_name, (model_class, param_grid) in models.items():
            run_name = os.path.basename(f) + "_" + emb_name + "_" + model_name
            with mlflow.start_run(run_name=run_name):
                mlflow.log_param("file_name", f)
                mlflow.log_param("embedding", emb_name)
                mlflow.log_param("model", model_name)

                grid_search = GridSearchCV(model_class(), param_grid, cv=5, scoring="accuracy", n_jobs=1)
                grid_search.fit(X_train, y_train)
                best_model = grid_search.best_estimator_

                preds = best_model.predict(X_test)
                acc = accuracy_score(y_test, preds)
                prec = precision_score(y_test, preds, average="weighted")
                rec = recall_score(y_test, preds, average="weighted")
                f1 = f1_score(y_test, preds, average="weighted")

                mlflow.log_params(grid_search.best_params_)
                mlflow.log_metric("accuracy", acc)
                mlflow.log_metric("precision", prec)
                mlflow.log_metric("recall", rec)
                mlflow.log_metric("f1_score", f1)
                mlflow.sklearn.log_model(best_model, model_name + "_Model")

                # Render the confusion matrix on its own figure, save it, attach
                # it to the run, and close that figure so figures do not pile up
                # across the many iterations.
                cm = confusion_matrix(y_test, preds)
                fig, ax = plt.subplots(figsize=(7, 5))
                sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
                ax.set_xlabel("Predicted")
                ax.set_ylabel("Actual")
                ax.set_title(run_name)
                cm_file = "../Reports/confusion_matrix_" + run_name + ".png"
                fig.savefig(cm_file)
                mlflow.log_artifact(cm_file)
                plt.close(fig)

                results_list.append([os.path.basename(f), emb_name, model_name, acc, prec, rec, f1])

# Deep-learning sweep: for every dataset, train an Embedding + LSTM classifier on
# the tokenised text, evaluate it on a held-out split, and log params, metrics, the
# model and a confusion-matrix image to MLflow. One summary row per dataset is
# collected in `dl_results`.
dl_results = []

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("../Reports", exist_ok=True)

for f in files:
    df = pd.read_csv(f)
    df.dropna(inplace=True)
    le = LabelEncoder()
    df["label_encoded"] = le.fit_transform(df["label"])
    text_features = df["processed_text"]
    y = df["label_encoded"].values
    num_classes = len(np.unique(y))
    vocab_size = 10000
    max_length = 200

    # Split the raw texts before tokenising so the vocabulary is learned from
    # training rows only; test-only words then map to the OOV token.
    texts_train, texts_test, y_train, y_test = train_test_split(
        text_features, y, test_size=0.2, random_state=42
    )
    tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
    tokenizer.fit_on_texts(texts_train)
    X_train = pad_sequences(
        tokenizer.texts_to_sequences(texts_train),
        maxlen=max_length, padding="post", truncating="post",
    )
    X_test = pad_sequences(
        tokenizer.texts_to_sequences(texts_test),
        maxlen=max_length, padding="post", truncating="post",
    )

    # A new model is built per file; drop the Keras state left by the previous
    # iteration so memory does not grow across the loop.
    tf.keras.backend.clear_session()
    model = Sequential([
        Embedding(vocab_size, 128, input_length=max_length),
        LSTM(64, dropout=0.2, recurrent_dropout=0.2),
        Dense(num_classes, activation="softmax")
    ])
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1, verbose=0)

    loss, acc = model.evaluate(X_test, y_test, verbose=0)
    preds_prob = model.predict(X_test)
    preds = preds_prob.argmax(axis=1)  # most probable class per row
    prec = precision_score(y_test, preds, average="weighted")
    rec = recall_score(y_test, preds, average="weighted")
    f1 = f1_score(y_test, preds, average="weighted")

    run_name = "DL_LSTM_" + os.path.basename(f)
    with mlflow.start_run(run_name=run_name):
        mlflow.log_param("file_name", f)
        mlflow.log_param("model", "LSTM")
        mlflow.log_param("embedding", "Tokenizer+Embedding")
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("precision", prec)
        mlflow.log_metric("recall", rec)
        mlflow.log_metric("f1_score", f1)
        mlflow.keras.log_model(model, "LSTM_Model")

        # Render the confusion matrix on its own figure, save it, attach it to
        # the run, and close that figure afterwards.
        cm = confusion_matrix(y_test, preds)
        fig, ax = plt.subplots(figsize=(7, 5))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        ax.set_title(run_name)
        cm_file = "../Reports/confusion_matrix_DL_" + os.path.basename(f) + ".png"
        fig.savefig(cm_file)
        mlflow.log_artifact(cm_file)
        plt.close(fig)

        dl_results.append([os.path.basename(f), "LSTM", "DL", acc, prec, rec, f1])

# Merge the classical-ML rows and the LSTM rows into one summary table.
result_columns = [
    "File",
    "Embedding/ModelType",
    "Model",
    "Accuracy",
    "Precision",
    "Recall",
    "F1",
]
all_results = [*results_list, *dl_results]
results_df = pd.DataFrame(data=all_results, columns=result_columns)




🏃 View run preprocessed_lemmatization_features.csv at: https://dagshub.com/malhar.c.prajapati/my-first-repo.mlflow/#/experiments/1/runs/3e917cd4a4634697878c7d9ea88e5601
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/my-first-repo.mlflow/#/experiments/1




🏃 View run preprocessed_no_stopwords_features.csv at: https://dagshub.com/malhar.c.prajapati/my-first-repo.mlflow/#/experiments/1/runs/98b0681cda65417eb2d4165f768f3bee
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/my-first-repo.mlflow/#/experiments/1
