In [5]:
import os
os.system("pip install -q dagshub mlflow")


0

In [1]:
import os
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
import dagshub
%pip install -q dagshub mlflow

# Initialize the DagsHub repository
dagshub.init(repo_owner='malhar.c.prajapati', repo_name='my-first-repo', mlflow=True)

# Load and preprocess the dataset
data_path = "../Data/Feature-Engineered/preprocessed_lemmatization_features.csv"
df = pd.read_csv(data_path)
df.dropna(inplace=True)

label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

text_features = df['processed_text']
numeric_features = df[['lexical_diversity', 'avg_word_length', 'sentiment_polarity',
                       'subjectivity', 'flesch_reading_ease', 'sentence_length', 'named_entity_count',
                       'noun_count', 'verb_count', 'adj_count', 'adv_count']]
y = df['label_encoded']

# Vectorization and scaling
vectorizer = TfidfVectorizer(max_features=5000)
X_text = vectorizer.fit_transform(text_features).toarray()

scaler = StandardScaler()
X_numeric = scaler.fit_transform(numeric_features)

X = np.hstack((X_text, X_numeric))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def train_lr_model():
    with mlflow.start_run(run_name="LogisticRegression"):
        lr_model = LogisticRegression(max_iter=200)
        lr_model.fit(X_train, y_train)
        preds = lr_model.predict(X_test)
        accuracy = accuracy_score(y_test, preds)
        
        # Log metrics and model
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(lr_model, "LogisticRegression_Model")
        
        print(f"Logistic Regression Accuracy: {accuracy:.4f}")
        
        lr_cm = confusion_matrix(y_test, preds)
        
        # Save and log the confusion matrix as an artifact
        cm_file = "confusion_matrix.png"
        plt.figure(figsize=(7, 5))
        sns.heatmap(lr_cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Logistic Regression Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.savefig(cm_file)
        mlflow.log_artifact(cm_file)  
        plt.close()
        
        return lr_model, accuracy, lr_cm

lr_model, lr_acc, lr_cm = train_lr_model()

# Save the results in a DataFrame and store it
results = {
    "LogisticRegression": lr_acc
}
results_df = pd.DataFrame(results.items(), columns=["Model", "Accuracy"])
results_df.to_csv("../Data/experiment_results.csv", index=False)
print("Experiment results saved!")



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.




Logistic Regression Accuracy: 0.9082
🏃 View run LogisticRegression at: https://dagshub.com/malhar.c.prajapati/my-first-repo.mlflow/#/experiments/0/runs/b1ecb5b1563042cd8b266a84dc81330e
🧪 View experiment at: https://dagshub.com/malhar.c.prajapati/my-first-repo.mlflow/#/experiments/0
Experiment results saved!
