# Model version control and experiment tracking

In [24]:
# Import libraries
import csv
import mlflow
import mlflow.sklearn
import pandas as pd
import sklearn
import nltk
import matplotlib.pyplot as plt
nltk.download('punkt_tab')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
import string
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, accuracy_score, precision_recall_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn import metrics

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\11897\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\11897\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\11897\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")
val_data = pd.read_csv("./data/validation.csv")

# Preprocess the dataset

In [4]:
stop_words = set(stopwords.words('english'))
def preprocess_sms(message):
    global stop_words
    message = message.lower() #converting to lowercase
    message = message.translate(str.maketrans('', '', string.punctuation)) # remove punctuation
    words = word_tokenize(message) #tokenize
    words = [word for word in words if word.isalnum() and word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return words

In [5]:
# converting text to vectors
vectorizer = TfidfVectorizer(analyzer=preprocess_sms)
X_train_tfidf = vectorizer.fit_transform(train_data['sms'])
X_val_tfidf = vectorizer.transform(val_data['sms'])
X_test_tfidf = vectorizer.transform(test_data['sms'])

y_train = train_data['class'].map({'spam': 1, 'ham': 0})
y_val = val_data['class'].map({'spam': 1, 'ham': 0})
y_test = test_data['class'].map({'spam': 1, 'ham': 0})

In [6]:
sms_tfidf_train = pd.DataFrame(X_train_tfidf.todense())
sms_tfidf_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7131,7132,7133,7134,7135,7136,7137,7138,7139,7140
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.270525,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.234342,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
sms_tfidf_test = pd.DataFrame(X_test_tfidf.todense())
sms_tfidf_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7131,7132,7133,7134,7135,7136,7137,7138,7139,7140
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
sms_tfidf_val = pd.DataFrame(X_val_tfidf.todense())
sms_tfidf_val.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7131,7132,7133,7134,7135,7136,7137,7138,7139,7140
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- Build, track, and register 3 benchmark models using MLflow
- Checkout and print the model selection metric AUCPR for each of the three benchmark models

In [22]:
## k-NN 
with mlflow.start_run():
    knn = KNeighborsClassifier() # by default n_neighbours = 5
    knn.fit(X_train_tfidf, y_train)
    
    # Test predictions
    y_test_prob = knn.predict_proba(X_test_tfidf)[:, 1]
    
    # AUCPR 
    aucpr = sklearn.metrics.average_precision_score(y_test, y_test_prob)
    mlflow.log_param("n_neighbors", 5)
    mlflow.log_metric("AUCPR", aucpr)

    print("Model 1: kNN")
    print(f"AUCPR Score: {aucpr}")

    mlflow.sklearn.log_model(knn, "Knn")

Model 1: kNN
AUCPR Score: 0.7699041528050448




In [23]:
## Naive Bayes 
with mlflow.start_run():
    naiveBayes =  MultinomialNB()
    naiveBayes.fit(X_train_tfidf, y_train)
    
    # Test predictions
    y_test_prob = naiveBayes.predict_proba(X_test_tfidf)[:, 1]
    
    # AUCPR Score
    aucpr = sklearn.metrics.average_precision_score(y_test, y_test_prob)
    mlflow.log_metric("AUCPR", aucpr)

    print("Model 2: Naive Bayes")
    print(f"AUCPR Score: {aucpr}")

    mlflow.sklearn.log_model(naiveBayes, "Naive Bayes")

Model 2: Naive Bayes
AUCPR Score: 0.9439122697673518




In [27]:
## SVM
with mlflow.start_run():
    svm = SVC(kernel='linear')
    svm.fit(X_train_tfidf, y_train)
    
    # Test predictions
    y_pred = svm.predict(X_test_tfidf)
    
    # AUCPR Score
    aucpr = sklearn.metrics.average_precision_score(y_test, y_pred)
    mlflow.log_metric("AUCPR", aucpr)

    print("Model 3: SVM")
    print(f"AUCPR Score: {aucpr}")

    mlflow.sklearn.log_model(svm, "SVM")

Model 3: SVM
AUCPR Score: 0.8703227117693545




In [None]:
! mlflow ui