In [56]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import joblib

In [58]:
# Load the labelled messages from spam.csv (path is relative to the notebook's
# working directory). The preview below shows a text label column "Category"
# and the raw message text in "Message".
df = pd.read_csv("spam.csv")
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [60]:
# Encode the text labels as integers for the classifier: spam -> 1, ham -> 0.
LABEL_TO_ID = {"spam": 1, "ham": 0}

# Series.map turns every value that is missing from the mapping into NaN, so
# re-running this cell on an already-encoded column would silently wipe all
# labels. Only encode while the column still holds something other than 0/1.
if not df["Category"].isin(list(LABEL_TO_ID.values())).all():
    encoded = df["Category"].map(LABEL_TO_ID)
    # Fail loudly on labels outside the mapping (typos, different casing,
    # missing values) instead of passing NaN targets on to training.
    unknown = df.loc[encoded.isna(), "Category"].unique()
    if len(unknown) > 0:
        raise ValueError(f"Unexpected Category labels: {unknown.tolist()}")
    # Explicit int64 keeps the dtype identical on every platform.
    df["Category"] = encoded.astype("int64")
df

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [62]:
# Features are the raw message texts; the target is the 0/1 spam label.
x, y = df["Message"], df["Category"]

In [64]:
# Hold out 30% of the messages for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=42,
)

In [66]:
# Build the bag-of-words vocabulary from the training messages only and turn
# each message into a row of token counts (returned as a sparse matrix).
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
# NOTE(review): toarray() materialises the full dense matrix just for display;
# at 3900 x 7262 int64 that is roughly 225 MB. The sparse repr in the next cell
# conveys the shape without the allocation.
X_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [68]:
# Show the sparse matrix summary: shape, dtype and number of stored (non-zero) counts.
X_train_count

<3900x7262 sparse matrix of type '<class 'numpy.int64'>'
	with 52129 stored elements in Compressed Sparse Row format>

In [70]:
# Train a multinomial Naive Bayes classifier on the token counts.
# fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(X_train_count, Y_train)
model

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [82]:
# Two hand-written probes: an ordinary personal note and a promotional offer.
emails = [
    "Hey Mohamed, can we get together to watch football game tomorrow?",
    "Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!",
]
# Use transform (not fit_transform) so the probes are encoded with the
# vocabulary learned from the training set.
emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

In [84]:
# Encode the held-out messages with the already-fitted vocabulary, then report
# the classifier's mean accuracy on them.
X_test_count = v.transform(X_test)
model.score(X=X_test_count, y=Y_test)

0.9904306220095693

In [86]:
# Predicted labels (0 = ham, 1 = spam) for every held-out message.
y_pre = model.predict(X_test_count)
y_pre

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [88]:
# Persist both fitted objects: the app needs the vectorizer (to encode new
# text with the training vocabulary) as well as the classifier.
# Files are written to the current working directory.
joblib.dump(v, "vectorizer.joblib")
joblib.dump(model, "spam_classifier.joblib")

['spam_classifier.joblib']

In [90]:
%%writefile app.py
import streamlit as st
import joblib

st.title("Spam Classifier")
st.write("Enter a text message and check if it's spam or ham")

v = joblib.load("vectorizer.joblib")
model = joblib.load("spam_classifier.joblib")
user_input = st.text_area("Write the message here: ")

if st.button("Result"):
    if user_input.strip() != "":
        input_count = v.transform([user_input])
        prediction = model.predict(input_count)[0]
        
        if prediction == 1:
            st.error("This message is SPAM")
        else:
            st.success("This message is HAM")
    else:
        st.warning("Please write a message first")

Overwriting app.py
