# Develop a simple Message Classifier

### Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import joblib
from pathlib import Path

In [2]:
# Paths are resolved from the parent of the current working directory.
cwd = Path.cwd().parent
csv_data = cwd.joinpath('data', 'spam.csv')

# The CSV is decoded as latin-1 instead of pandas' default utf-8.
spam_data = pd.read_csv(filepath_or_buffer=csv_data, encoding="latin-1")

# Peek at three randomly chosen rows.
spam_data.sample(n=3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
3266,ham,Ok then i come n pick u at engin?,,,
1212,ham,"Yo, the game almost over? Want to go to walmar...",,,
1066,ham,Once free call me sir. I am waiting for you.,,,


In [3]:
# Show the complete message text (column v2) of the third row
spam_data['v2'].iloc[2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [4]:
# List the distinct class labels stored in column v1
spam_data['v1'].unique()

array(['ham', 'spam'], dtype=object)

**Observation:** we have two labels, so this is a 2-class (binary) classification problem.  
We then need to encode those two labels in a numerical format for model training.  

In [5]:
# Numeric code assigned to each class label: ham -> 0, spam -> 1
label_codes = {'ham': 0, 'spam': 1}
spam_data['label'] = spam_data['v1'].map(label_codes)

# Preview three random rows, now including the new column
spam_data.sample(n=3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,label
3631,ham,Oooh I got plenty of those!,,,,0
4584,spam,U have a Secret Admirer who is looking 2 make ...,,,,1
938,spam,Urgent! call 09061749602 from Landline. Your c...,,,,1


**Observation:** we created the new `label` column, and from now on we only keep the following two columns:  
- **v2** for X   
- **label**  for y

In [6]:
# Features: message text (v2); target: numeric class code (label)
X, y = spam_data['v2'], spam_data['label']

### Model Training
Since the goal is mainly to deploy the model with Streamlit, we won't spend time searching for the best model; instead we train a very simple one (Naive Bayes in our case) so that we can move on to the deployment process.  

In [7]:
# Split the raw messages BEFORE vectorizing, so the vocabulary is learned from
# the training messages only and nothing from the held-out set leaks into it.
# `X` is deliberately left untouched (raw text), which keeps this cell re-runnable.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=2021
)

# Bag-of-words features
cv = CountVectorizer()
X_train = cv.fit_transform(X_train_text)  # learn the vocabulary and encode the training messages
X_test = cv.transform(X_test_text)        # encode the held-out messages with that same vocabulary

# Naive Bayes Classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Evaluate on the held-out messages (the report already includes accuracy)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1594
           1       0.95      0.93      0.94       245

    accuracy                           0.98      1839
   macro avg       0.97      0.96      0.96      1839
weighted avg       0.98      0.98      0.98      1839



In [8]:
# Persist the trained classifier so it can be loaded outside this notebook
spam_detect_pkl = cwd / 'ml_models' / 'spam_detector_model.pkl'

# joblib.dump does not create missing directories; make sure the target folder exists
spam_detect_pkl.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): only `clf` is saved here. Classifying new raw text also needs the
# fitted CountVectorizer `cv` -- confirm it is persisted elsewhere, or save it too.
joblib.dump(clf, spam_detect_pkl)

['/mnt/sda3/home/marcelbittar/pyproj/spam_ml/app/ml_models/spam_detector_model.pkl']