In [26]:
# Use case: SMS spam classification
# Goal: build a model that predicts whether a given SMS message is spam or ham (i.e. not spam).

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
import re

# Make sure NLTK's stop-word corpus is available locally; preprocess_text
# reads it through stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oysterable/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
# The raw file is tab-separated and has no header row, so the two column
# names are supplied explicitly.
# NOTE(review): pandas' default quote handling is active here; a stray double
# quote inside a message could merge rows - confirm the row count against the
# dataset's documentation.
data = pd.read_csv(
    './Datasets/SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
)

In [29]:
# Peek at the first five rows to check that labels and messages were parsed into the right columns.
data.head(n=5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [30]:
# Column dtypes and non-null counts, to check for missing labels or messages.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [31]:
# Class balance: number of messages per label.
data['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [32]:
#Preprocess the text data
def preprocess_text(text):
    """Normalise a raw SMS string into space-separated, stop-word-free tokens.

    Steps, in order: lowercase; replace every non-word character with a
    space; collapse whitespace runs; delete digits; drop English stop words.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The remaining tokens joined by single spaces (an empty string when
        nothing survives the filtering).
    """
    # This function is applied to every message, but the stop-word list never
    # changes: build the set on the first call and keep it on the function
    # object instead of calling stopwords.words('english') on every invocation.
    stop_words = getattr(preprocess_text, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        preprocess_text._stop_words = stop_words

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Non-word characters become spaces ('_' is a word character and is kept)
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'\d', '', text)  # Digits are deleted, not replaced: "a1b" becomes "ab"
    # split() also discards the empty gaps left behind by removed digits.
    return ' '.join(word for word in text.split() if word not in stop_words)

In [33]:
# Clean every message with preprocess_text.
data['message'] = data['message'].apply(preprocess_text)

# Encode the labels as integers: spam -> 1, ham -> 0.
# A plain dict passed to `map` turns every value that is not one of its keys
# into NaN, so running this cell a second time (when the column already holds
# 0/1) would wipe all labels. Falling back to the value itself keeps
# already-encoded labels intact and makes the cell safe to re-run.
label_codes = {'spam': 1, 'ham': 0}
data['label'] = data['label'].map(lambda value: label_codes.get(value, value))

In [34]:
# Hold out 20% of the messages for testing; the fixed random_state keeps the
# split the same from run to run.
features = data['message']
targets = data['label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)



In [35]:
# Turn the cleaned messages into TF-IDF (Term Frequency-Inverse Document
# Frequency) vectors, capped at 3000 features.
vectorizer = TfidfVectorizer(max_features=3000)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely transformed with them.
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = vectorizer.transform(raw_documents=X_test)

In [36]:
# Fit a logistic-regression classifier (library defaults) on the TF-IDF
# training vectors.
model = LogisticRegression()
model.fit(X=X_train_tfidf, y=y_train)

In [37]:
# Mean accuracy on the held-out split and on the training split; comparing
# the two gives a quick check for overfitting.
test_accuracy = model.score(X_test_tfidf, y_test)
train_accuracy = model.score(X_train_tfidf, y_train)
print(f"Testing score is {test_accuracy} and TrainingScore is {train_accuracy} ")

Testing score is 0.9721973094170404 and TrainingScore is 0.9739735247924612 


In [38]:
# Classify one message typed by the user. It goes through exactly the same
# cleaning and the already-fitted vectorizer as the training data.
user_input = input("Enter an SMS message to classify: ")
processed_input = preprocess_text(user_input)
input_tfidf = vectorizer.transform([processed_input])
prediction = model.predict(input_tfidf)

# Label 1 was assigned to spam when the labels were encoded.
is_spam = prediction[0] == 1
print("This message is SPAM." if is_spam else "This message is NOT SPAM.")


This message is NOT SPAM.


In [39]:
# Raw output of model.predict for the message above: one element, 1 for spam and 0 for ham.
prediction

array([0])

In [40]:
import joblib
# Persist both fitted objects: the app needs the vectorizer as well as the
# model to turn new text into the feature space the model was trained on.
joblib.dump(value=model, filename='spam_classifier_model.pkl')
joblib.dump(value=vectorizer, filename='tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [41]:
!pip install -q streamlit

In [42]:
%%writefile app.py

import streamlit as st
import joblib
import re
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is present on the machine running the
# app; preprocess_text below reads it via stopwords.words('english').
nltk.download('stopwords')

# Load the pre-trained model and vectorizer
@st.cache_resource
def load_artifacts():
    """Read the fitted classifier and TF-IDF vectorizer from disk.

    Streamlit re-executes this script on every user interaction; caching the
    result means the two pickle files are read only once instead of on every
    rerun.

    Returns:
        A ``(model, vectorizer)`` tuple as saved by the training notebook.
    """
    return (
        joblib.load('spam_classifier_model.pkl'),
        joblib.load('tfidf_vectorizer.pkl'),
    )


model, vectorizer = load_artifacts()

# Preprocess the text similar to how the model was trained
def preprocess_text(text):
    """Clean a raw message with the same steps as the training-time preprocessing.

    The message is lowercased, non-word characters are replaced by spaces,
    whitespace runs are collapsed, digits are deleted and English stop words
    are dropped.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The surviving tokens joined by single spaces.
    """
    cleaned = text.lower()

    # Applied strictly in this order: punctuation -> space, squeeze
    # whitespace, then strip digits.
    substitutions = (
        (r'\W', ' '),
        (r'\s+', ' '),
        (r'\d', ''),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    ignored = set(stopwords.words('english'))
    kept_tokens = [token for token in cleaned.split() if token not in ignored]
    return ' '.join(kept_tokens)

# Streamlit App UI
st.title("SMS Spam Classifier")

# Widgets are created in the same order as they appear on the page:
# the text box first, then the button.
user_input = st.text_input("Enter an SMS message:")
predict_clicked = st.button("Predict")

if predict_clicked and not user_input:
    # Button pressed with an empty box: ask for input instead of classifying.
    st.write("Please enter a message to classify.")
elif predict_clicked:
    # Same pipeline as in training: clean -> TF-IDF -> predict.
    processed_input = preprocess_text(user_input)
    input_tfidf = vectorizer.transform([processed_input])
    prediction = model.predict(input_tfidf)

    # Label 1 means spam.
    is_spam = prediction[0] == 1
    st.write("This message is **SPAM**." if is_spam else "This message is **NOT SPAM**.")


Overwriting app.py


In [43]:
# Install localtunnel, used further down to expose the Streamlit port.
# Needs the `npm` command on the PATH; the shell reports "command not found" otherwise.
!npm install localtunnel

zsh:1: command not found: npm


In [44]:
# `import urllib` alone does not guarantee that the `urllib.request`
# submodule is loaded, so it is imported explicitly.
import urllib.request

# localtunnel asks visitors for the public IP of the machine running the
# tunnel as its password, so look it up and print it.
# The `with` block closes the HTTP response once the body has been read.
with urllib.request.urlopen('https://ipv4.icanhazip.com') as response:
    public_ip = response.read().decode('utf8').strip("\n")

print("Password/Enpoint IP for localtunnel is:", public_ip)

Password/Enpoint IP for localtunnel is: 172.56.234.148


In [45]:
!streamlit run app.py &>/content/logs.txt & npx localtunnel --port 8501

zsh:1: no such file or directory: /content/logs.txt
zsh:1: command not found: npx


In [25]:
# https://chatty-crabs-push.loca.lt
# Password/Endpoint IP for localtunnel is: 35.229.159.74