In [21]:
# Use the %pip magic (not !pip) so the package is installed into the environment of the running kernel.
# TODO(review): consider pinning a version (mlflow==X.Y.Z) so reruns use the same release.
%pip install mlflow



In [22]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score

In [23]:
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

# Suppress specific warnings (e.g., DeprecationWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [24]:
from pathlib import Path

import pandas as pd

# Directory holding the pre-split CSV files. Kept in one constant so the notebook
# can be pointed at another location by editing a single line.
DATA_DIR = Path('/content')

# Read the training and test splits into DataFrames.
train_data = pd.read_csv(DATA_DIR / 'train_data.csv')
test_data = pd.read_csv(DATA_DIR / 'test_data.csv')


In [25]:
# Discard every row that has a missing value in any column. The names are rebound
# to the filtered frames rather than mutating the loaded frames in place.
train_data = train_data.dropna()
test_data = test_data.dropna()

## Loading data and Vectorizing it

In [26]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Column names and label encoding shared by the train and test splits.
TEXT_COLUMN = 'processed msg'
LABEL_COLUMN = 'Label'
LABEL_TO_INT = {'ham': 0, 'spam': 1}

# Convert text to Bag of Words (BoW) representation. Each message is tokenised by
# plain whitespace splitting; str.split is passed directly (instead of a lambda)
# so the fitted vectorizer remains picklable and can be persisted.
vectorizer = CountVectorizer(analyzer=str.split)
train_bow = vectorizer.fit_transform(train_data[TEXT_COLUMN])

# Transform BoW counts into TF-IDF representation (fitted on the training split only).
tfidf_converter = TfidfTransformer()
train_tfidf = tfidf_converter.fit_transform(train_bow)

# Apply the already-fitted vocabulary and TF-IDF weights to the test split.
test_bow = vectorizer.transform(test_data[TEXT_COLUMN])
test_tfidf = tfidf_converter.transform(test_bow)

# Encode the string labels as integers: ham -> 0, spam -> 1.
y_train = train_data[LABEL_COLUMN].map(LABEL_TO_INT).values
y_test = test_data[LABEL_COLUMN].map(LABEL_TO_INT).values


## Starting MLflow tracking

In [27]:
# Select the MLflow experiment named "spam_detection" for the runs started in this notebook.
mlflow.set_experiment("spam_detection")

def train_and_log_model(model, model_name, X_train, y_train, X_test, y_test):
    """Fit ``model``, score it on the test split and record everything in MLflow.

    One MLflow run is opened per call and named after ``model_name`` so the
    runs can be told apart in the tracking UI. The estimator's parameters,
    the ``AUCPR`` metric and the fitted model are logged to that run.

    Args:
        model: Unfitted estimator exposing ``fit``, ``get_params`` and either
            ``predict_proba`` or ``decision_function``.
        model_name: Label used as the run name, as the logged model's name
            and in the printed summary line.
        X_train: Training features passed to ``model.fit``.
        y_train: Training labels passed to ``model.fit``.
        X_test: Features the fitted model is scored on.
        y_test: True labels for ``X_test``.

    Returns:
        Tuple ``(model, aucpr)``: the fitted estimator and its average
        precision score on the test split.
    """
    with mlflow.start_run(run_name=model_name):
        # Log model parameters first so they are recorded even if fitting fails.
        mlflow.log_params(model.get_params())

        model.fit(X_train, y_train)

        # Use the second probability column when probabilities are available;
        # otherwise fall back to the raw decision scores.
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_test)[:, 1]
        else:
            y_score = model.decision_function(X_test)
        aucpr = average_precision_score(y_test, y_score)

        # Log AUCPR metric
        mlflow.log_metric("AUCPR", aucpr)

        # Log the model
        mlflow.sklearn.log_model(model, model_name)

        print(f"{model_name} AUCPR: {aucpr:.4f}")

    return model, aucpr

In [28]:
# Reuse the same model configurations as last time; the hyperparameters are unchanged.
# A shared seed is added because each estimator is stochastic as configured
# (forest bootstrapping, SVC probability estimates, the saga solver's shuffling),
# so without it the scores differ from one run of the notebook to the next.
RANDOM_STATE = 42

rf = RandomForestClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200, random_state=RANDOM_STATE)
svm = SVC(C=1, gamma='scale', kernel='linear', probability=True, random_state=RANDOM_STATE)
lr = LogisticRegression(C=20, penalty='l2', solver='saga', random_state=RANDOM_STATE)

In [29]:
# Train and log models: every candidate is fitted and scored on the same TF-IDF
# train/test inputs, so those four arguments are bundled once and unpacked per call.
shared_splits = (train_tfidf, y_train, test_tfidf, y_test)
rf_model, rf_aucpr = train_and_log_model(rf, "RandomForest", *shared_splits)
svm_model, svm_aucpr = train_and_log_model(svm, "SVM", *shared_splits)
lr_model, lr_aucpr = train_and_log_model(lr, "LogisticRegression", *shared_splits)



RandomForest AUCPR: 0.9642




SVM AUCPR: 0.9662




LogisticRegression AUCPR: 0.9682


In [30]:
# Print results: a header followed by one line per model, scores shown to four decimals.
summary_rows = (
    ("Random Forest", rf_aucpr),
    ("SVM", svm_aucpr),
    ("Logistic Regression", lr_aucpr),
)
print("\nModel Selection Metrics (AUCPR):")
for display_name, score in summary_rows:
    print(f"{display_name}: {score:.4f}")


Model Selection Metrics (AUCPR):
Random Forest: 0.9642
SVM: 0.9662
Logistic Regression: 0.9682
