# **TEXT SPAM CLASSIFIER**

In [1]:
import string

import pandas as pd
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import  PorterStemmer

from wordcloud import WordCloud

from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix
from sklearn.preprocessing import MinMaxScaler

import pickle

In [2]:
# Load the e-mail spam/ham dataset from the Kaggle input directory.
EMAIL_CSV_PATH = "/kaggle/input/datasets/venky73/spam-mails-dataset/spam_ham_dataset.csv"
df_email = pd.read_csv(EMAIL_CSV_PATH)

df_email.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [3]:
df_email.columns

Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')

In [4]:
# Load the SMS spam collection, decoding the file as latin-1.
df = pd.read_csv(
    "/kaggle/input/sms-spam-collection-dataset/spam.csv",
    encoding="latin-1",
)

In [14]:
# Keep only the label and message-body columns of the e-mail data.
df_email_clean = df_email.loc[:, ["label", "text"]].copy()
# Explicitly (re)assign the shared column names used by both datasets.
df_email_clean.columns = ["label", "text"]

In [9]:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [15]:
# Bring the SMS data into the same two-column (label, text) layout.
sms_column_names = {"v1": "label", "v2": "text"}
df_sms_clean = df[["v1", "v2"]].rename(columns=sms_column_names)

In [17]:
# Stack the SMS rows on top of the e-mail rows and renumber the index.
frames_to_merge = [df_sms_clean, df_email_clean]
df_all = pd.concat(frames_to_merge, axis=0, ignore_index=True)

In [20]:
# Encode the text labels numerically: ham -> 0, spam -> 1.
# NOTE: values not present in the mapping become NaN, so re-running this cell
# on already-numeric labels would blank the whole column.
label_codes = {"ham": 0, "spam": 1}
df_all["label"] = df_all["label"].map(label_codes)

In [22]:
# Discard rows that lack either the message text or a label.
required_columns = ["text", "label"]
df_all = df_all.dropna(subset=required_columns)


In [23]:
# Shuffle every row with a fixed seed, then renumber the index from zero.
shuffled_rows = df_all.sample(frac=1, random_state=42)
df_all = shuffled_rows.reset_index(drop=True)

In [24]:
df_all.head()

Unnamed: 0,label,text
0,0,Subject: fw : epgt\r\ndaren - can you please l...
1,0,Should i send you naughty pix? :)
2,0,Going to take your babe out ?
3,0,Subject: meter # : 6599\r\ndeal 138049 is comm...
4,1,Knock Knock Txt whose there to 80082 to enter ...


In [27]:
# Continue on a copy so later column additions do not modify df_all.
df = df_all.copy()

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10743 entries, 0 to 10742
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   10743 non-null  int64 
 1   text    10743 non-null  object
dtypes: int64(1), object(1)
memory usage: 168.0+ KB


In [32]:
# The modelling cells refer to the class column as "target".
column_renames = {"label": "target"}
df = df.rename(columns=column_renames)

In [33]:
# Encode the target classes as consecutive integer ids.
encoder = LabelEncoder()
encoded_target = encoder.fit_transform(df["target"])
df["target"] = encoded_target

In [30]:
# The class column was renamed from "label" to "target" earlier in the
# notebook, so it must be read under its new name (the old name raises
# KeyError when the notebook is run top to bottom).
df["target"]

0        0
1        0
2        0
3        0
4        1
        ..
10738    1
10739    0
10740    0
10741    0
10742    1
Name: label, Length: 10743, dtype: int64

In [None]:
#Missing values
df.isnull().sum()

In [None]:
# Check for duplicates
df.duplicated().sum()

In [None]:
# Keep the first occurrence of each repeated row, then confirm none remain.
deduplicated = df.drop_duplicates(keep="first")
df = deduplicated
df.duplicated().sum()

In [None]:
df.shape

# EDA

In [None]:
# Class distribution: ham is more frequent than spam.
df["target"].value_counts()

In [None]:
# Derive each slice label from its class id (0 = ham, 1 = spam) instead of
# relying on value_counts() happening to list ham first; hard-coded labels
# would be silently swapped if spam ever became the majority class.
class_counts = df["target"].value_counts()
class_names = {0: "ham", 1: "spam"}
slice_labels = [class_names.get(class_id, str(class_id)) for class_id in class_counts.index]
plt.pie(class_counts, labels=slice_labels, autopct="%0.2f")
plt.show()

# Conclusion - The data is imbalanced

In [None]:
# Character count of every message.
df["num_characters"] = df["text"].map(len)
df.head()

In [None]:
# Token count of every message, using NLTK's word tokenizer.
df["num_words"] = [len(nltk.word_tokenize(message)) for message in df["text"]]
df.head()

In [None]:
# Sentence count of every message, using NLTK's sentence tokenizer.
df["num_sentences"] = df["text"].map(
    lambda message: len(nltk.sent_tokenize(message))
)
df.head()

In [None]:
df.describe()

In [None]:
# Summary statistics for ham messages (target == 0).
stat_columns = ["target", "num_characters", "num_words", "num_sentences"]
df.loc[df["target"] == 0, stat_columns].describe()

In [None]:
# Summary statistics for spam messages (target == 1).
stat_columns = ["target", "num_characters", "num_words", "num_sentences"]
df.loc[df["target"] == 1, stat_columns].describe()


> From the analysis above, spam messages average 27 words versus 17 for ham, and 137 characters versus 27 for ham, which means spam messages tend to be longer.


In [None]:
# Overlay the character-length distributions: ham (default colour) vs spam (red).
ham_char_counts = df.loc[df["target"] == 0, "num_characters"]
spam_char_counts = df.loc[df["target"] == 1, "num_characters"]
sns.histplot(ham_char_counts)
sns.histplot(spam_char_counts, color="red")

In [None]:
# Overlay the word-count distributions: ham (default colour) vs spam (red).
ham_word_counts = df.loc[df["target"] == 0, "num_words"]
spam_word_counts = df.loc[df["target"] == 1, "num_words"]
sns.histplot(ham_word_counts)
sns.histplot(spam_word_counts, color="red")

In [None]:
# Pairwise feature relationships, coloured by class.
sns.pairplot(data=df, hue="target")

In [None]:
df.info()

In [None]:
# Correlation can only be computed on numeric columns.
num_df = df.select_dtypes(include="number")

# Compute the correlation matrix once and reuse it for the plot (the original
# evaluated it twice and discarded the first result).
corr_matrix = num_df.corr()

sns.heatmap(corr_matrix, annot=True)

> Among the numeric features, num_characters varies most strongly with target, so we can use it when creating a model.

# Data Preprocessing

In [None]:
# Porter stemmer instance used by transform_text; the call below is a quick
# sanity check on a sample word.
ps = PorterStemmer()
ps.stem("Loving")

In [None]:
_STOP_WORDS_CACHE = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE


def transform_text(text):
    """Normalise a raw message into a space-joined string of stemmed tokens.

    The message is lower-cased and tokenized with ``nltk.word_tokenize``.
    Tokens are kept only if they are purely alphanumeric (which drops
    punctuation tokens) and are not English stop words; each kept token is
    stemmed with the module-level Porter stemmer ``ps``.

    Args:
        text: The raw message string.

    Returns:
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filters.
    """
    # The stop-word set is cached: this function runs once per row, and
    # reloading the list from the corpus on every call is wasted work.
    stop_words = _english_stop_words()
    tokens = nltk.word_tokenize(text.lower())
    return " ".join(
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    )


In [None]:
# Spot-check transform_text on a sentence containing punctuation and stop words.
text = "How was your yesterday's meeting? was that good?"
transform_text(text)

In [None]:
# Second spot-check of transform_text on a shorter sentence.
text = "How was your meeting bro?"
transform_text(text)

In [None]:
# Display the English stop-word list (nothing is removed in this cell;
# the filtering itself happens inside transform_text).
stopwords.words('english')

In [None]:
# Display the punctuation characters defined by the string module.
string.punctuation

In [None]:
# Store the cleaned, stemmed version of every message.
df["transformed_text"] = df["text"].map(transform_text)

In [None]:
df.head()

In [None]:
# Word-cloud renderer used to highlight the most important words.
wc = WordCloud(
    width=1000,
    height=1000,
    min_font_size=10,
    background_color="white",
)

In [None]:
# Word cloud built from all transformed spam messages joined into one string.
spam_messages = df.loc[df["target"] == 1, "transformed_text"]
spam_wc = wc.generate(spam_messages.str.cat(sep=" "))
plt.imshow(spam_wc)

In [None]:
# Word cloud built from all transformed ham messages joined into one string.
ham_messages = df.loc[df["target"] == 0, "transformed_text"]
ham_wc = wc.generate(ham_messages.str.cat(sep=" "))
plt.imshow(ham_wc)

In [None]:
# Flatten every transformed spam message into a single list of words.
spam_corpus = [
    word
    for message in df.loc[df["target"] == 1, "transformed_text"]
    for word in message.split()
]
len(spam_corpus)

In [None]:
# Flatten every transformed ham message into a single list of words.
ham_corpus = [
    word
    for message in df.loc[df["target"] == 0, "transformed_text"]
    for word in message.split()
]
len(ham_corpus)

**Top spam words chart**

In [None]:
# The 30 most frequent spam words and how often each occurs.
spam_top_words = Counter(spam_corpus).most_common(30)
top30Spam = pd.DataFrame(spam_top_words, columns=["word", "count"])

plt.figure(figsize=(10, 6))
sns.barplot(x="word", y="count", data=top30Spam)
plt.xticks(rotation=90)
plt.show()

In [None]:
# The 30 most frequent ham words and how often each occurs.
ham_top_words = Counter(ham_corpus).most_common(30)
top30Ham = pd.DataFrame(ham_top_words, columns=["word", "count"])

plt.figure(figsize=(10, 6))
sns.barplot(x="word", y="count", data=top30Ham)
plt.xticks(rotation=90)
plt.show()

# Building Model

# 1.Naive Bayes

In [None]:
# TF-IDF vectorizer capped at 3000 features (CountVectorizer and an uncapped
# TfidfVectorizer were tried earlier; see the notes further down).
# NOTE(review): the vectorizer is fitted on every row, i.e. before the
# train/test split performed later in the notebook.
tfidf = TfidfVectorizer(max_features=3000)
# Dense feature matrix.
tfidf_matrix = tfidf.fit_transform(df["transformed_text"])
X = tfidf_matrix.toarray()

In [None]:
# Min-max scale every feature column; the fitted scaler stays in `scalar`.
scalar = MinMaxScaler().fit(X)
X = scalar.transform(X)

In [None]:
# Target vector as a NumPy array.
y = df["target"].to_numpy()

In [None]:
X.shape

In [None]:
y.shape

In [None]:
# Stratified 80/20 split (a plain random split with the same seed was used
# before; stratify=y is the only difference).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
    stratify=y,
)

> BernoulliNB precision was 97% with a random split and 98% with a stratified split.

In [None]:
# One instance of each Naive Bayes variant to compare.
gnb, mnb, bnb = GaussianNB(), MultinomialNB(), BernoulliNB()

In [None]:
# Train the Gaussian Naive Bayes model
gnb.fit(X_train,y_train)

In [None]:
# Evaluate GaussianNB on the held-out test split.
y_pred1 = gnb.predict(X_test)
gnb_accuracy = accuracy_score(y_test, y_pred1)
gnb_confusion = confusion_matrix(y_test, y_pred1)
gnb_precision = precision_score(y_test, y_pred1)
print(f"accuracy_score -- {gnb_accuracy}")
print(f"confusion_matrix--{gnb_confusion}")
print(f"precision_score -- {gnb_precision}")

In [None]:
# Train the Multinomial Naive Bayes model
mnb.fit(X_train,y_train)

In [None]:
# Evaluate MultinomialNB on the held-out test split.
y_pred2 = mnb.predict(X_test)
mnb_accuracy = accuracy_score(y_test, y_pred2)
mnb_confusion = confusion_matrix(y_test, y_pred2)
mnb_precision = precision_score(y_test, y_pred2)
print(f"accuracy_score -- {mnb_accuracy}")
print(f"confusion_matrix--{mnb_confusion}")
print(f"precision_score -- {mnb_precision}")

> MNB precision was 87% with CountVectorizer but 100% with TF-IDF, so we will keep TF-IDF with MNB.

precision_score -- 1.0

> MNB accuracy was 96% with all TF-IDF features but is now 98% with the top 3,000 features, so we will keep `max_features=3000`.

accuracy_score -- 0.9825918762088974

> MNB accuracy improved further after min-max scaling the features.

accuracy_score -- 0.988394584139265


In [None]:
# Train the Bernoulli Naive Bayes model
bnb.fit(X_train,y_train)

In [None]:
# Evaluate BernoulliNB on the held-out test split.
y_pred3 = bnb.predict(X_test)
bnb_accuracy = accuracy_score(y_test, y_pred3)
bnb_confusion = confusion_matrix(y_test, y_pred3)
bnb_precision = precision_score(y_test, y_pred3)
print(f"accuracy_score -- {bnb_accuracy}")
print(f"confusion_matrix--{bnb_confusion}")
print(f"precision_score -- {bnb_precision}")

In [None]:

# Persist every fitted artefact needed at inference time.
# The model was trained on TF-IDF features that were then min-max scaled, so
# the fitted scaler is saved alongside the vectorizer and the model; a consumer
# must apply vectorizer -> scaler -> model in that order to match training.
# Context managers ensure each file is flushed and closed.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scalar, scaler_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)