In [23]:
!pip install mlflow





[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [24]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score

In [25]:
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

# Suppress specific warnings (e.g., DeprecationWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the preprocessed dataset and discard every row that contains a missing value.
data = pd.read_csv('processed_data.csv').dropna()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)


## Loading data and Vectorizing it

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import pickle

# Define a named function instead of lambda
from auxilary import simple_split

# Bag-of-words stage: tokenizes with the `simple_split` helper imported from `auxilary`.
vectorizer = CountVectorizer(analyzer=simple_split)
# TF-IDF stage: re-weights the raw token counts produced by the vectorizer.
tfidf_converter = TfidfTransformer()

# Fit both stages on the training messages only...
train_bow = vectorizer.fit_transform(train_data['processed msg'])
train_tfidf = tfidf_converter.fit_transform(train_bow)

# ...then apply the already-fitted stages to the held-out messages.
test_bow = vectorizer.transform(test_data['processed msg'])
test_tfidf = tfidf_converter.transform(test_bow)

# Encode the string labels as integers: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
y_train = train_data['Label'].map(label_codes).values
y_test = test_data['Label'].map(label_codes).values

# Persist the fitted CountVectorizer to disk.
# NOTE(review): only the CountVectorizer is pickled in this cell; `tfidf_converter`
# is not saved here. Confirm whether inference code needs it persisted as well.
with open("custom_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

## Starting MLflow

In [22]:
# Make "spam_detection" the active MLflow experiment for the runs started below.
mlflow.set_experiment("spam_detection")

# Function to train and log models
def train_and_log_model(model, model_name, X_train, y_train, X_test, y_test):
    """Fit ``model``, score it with AUCPR on the test split, and record the run in MLflow.

    Args:
        model: Unfitted estimator exposing ``fit`` and ``get_params`` plus either
            ``predict_proba`` or ``decision_function``.
        model_name: Label used as the MLflow run name, as the artifact path of
            the logged model, and in the printed summary line.
        X_train: Training features passed to ``model.fit``.
        y_train: Training labels passed to ``model.fit``.
        X_test: Features the fitted model is scored on.
        y_test: True labels corresponding to ``X_test``.

    Returns:
        A ``(model, aucpr)`` tuple: the fitted estimator and the value returned
        by ``average_precision_score`` on the test split.
    """
    # Name the run after the model so the runs can be told apart in the MLflow UI.
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)

        # Prefer the second column of predict_proba (class 1 for 0/1 labels);
        # fall back to raw decision scores when probabilities are unavailable.
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_test)[:, 1]
        else:
            y_score = model.decision_function(X_test)
        aucpr = average_precision_score(y_test, y_score)

        # Record hyperparameters, the selection metric, and the fitted model itself.
        mlflow.log_params(model.get_params())
        mlflow.log_metric("AUCPR", aucpr)
        mlflow.sklearn.log_model(model, model_name)

        print(f"{model_name} AUCPR: {aucpr:.4f}")

        return model, aucpr

In [16]:
# Candidate classifiers with fixed hyperparameters
# (presumably carried over from an earlier tuning step; confirm).
# A shared seed makes the stochastic parts of training (forest bootstrapping,
# SVC probability calibration, the saga solver) repeatable across re-runs.
RANDOM_STATE = 42
rf = RandomForestClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200, random_state=RANDOM_STATE)
svm = SVC(C=1, gamma='scale', kernel='linear', probability=True, random_state=RANDOM_STATE)
lr = LogisticRegression(C=20, penalty='l2', solver='saga', random_state=RANDOM_STATE)

In [17]:
# Fit each candidate on the TF-IDF features via train_and_log_model, in a fixed order.
candidates = [("RandomForest", rf), ("SVM", svm), ("LogisticRegression", lr)]
results = [
    train_and_log_model(estimator, run_label, train_tfidf, y_train, test_tfidf, y_test)
    for run_label, estimator in candidates
]
# Unpack into the per-model names used by the cells below.
(rf_model, rf_aucpr), (svm_model, svm_aucpr), (lr_model, lr_aucpr) = results

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



RandomForest AUCPR: 0.9698




SVM AUCPR: 0.9751




LogisticRegression AUCPR: 0.9724


In [20]:
import pickle

# Serialize the fitted SVM to svm.pkl in the working directory.
with open('svm.pkl', mode='wb') as svm_file:
    pickle.dump(svm_model, svm_file)

In [18]:
# Summarize the AUCPR of every trained model, one line per model.
summary_lines = [
    "\nModel Selection Metrics (AUCPR):",
    f"Random Forest: {rf_aucpr:.4f}",
    f"SVM: {svm_aucpr:.4f}",
    f"Logistic Regression: {lr_aucpr:.4f}",
]
print("\n".join(summary_lines))


Model Selection Metrics (AUCPR):
Random Forest: 0.9698
SVM: 0.9751
Logistic Regression: 0.9724
