In [None]:
pip install nltk

In [None]:
pip install seaborn

In [None]:
pip install scikit-learn

In [None]:
pip install wordcloud

In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import nltk
import seaborn as sns
import string
from wordcloud import WordCloud
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
import pickle

In [3]:
# Load spam.csv into a DataFrame; the file is decoded as latin-1 instead of
# pandas' default UTF-8.
df = pd.read_csv('spam.csv', encoding='latin-1')

In [4]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [12]:
df.shape

(5572, 5)

In [13]:
# 1. data cleaning 
# 2. EDA
# 3. Text preprocessing 
# 4. Model building 
# 5. Evaluation 
# 6. Improvement 
# 7. Website
# 8. Deployment 

## 1. Data Cleaning 

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [15]:
# Drop the three mostly-empty "Unnamed" columns reported by df.info().
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df = df.drop(columns=unused_columns)

In [16]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
# Give the two remaining columns descriptive names, then preview the frame.
column_names = {'v1': 'target', 'v2': 'text'}
df = df.rename(columns=column_names)
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [21]:
# Boolean mask of rows whose label contains "spam" (case-insensitive),
# then the matching subset of the frame.
is_spam = df['target'].str.contains('spam', case=False)
spam_messages = df[is_spam]


In [27]:
spam_messages

Unnamed: 0,target,text
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
5537,spam,Want explicit SEX in 30 secs? Ring 02073162414...
5540,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,spam,Had your contract mobile 11 Mnths? Latest Moto...
5566,spam,REMINDER FROM O2: To get 2.50 pounds free call...


In [None]:
# Replace the string labels with integers. LabelEncoder orders its classes
# alphabetically, so with the 'ham'/'spam' labels shown above: ham -> 0, spam -> 1.
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

In [None]:
df.head()

In [None]:
# missing values
df.isnull().sum()

In [None]:
# Check for duplicate rows
df.duplicated().sum()

In [None]:
# Remove duplicate rows, keeping the first occurrence of each.
# The index is not reset here, so surviving rows keep their original labels.
df = df.drop_duplicates(keep= 'first')

In [None]:
df.duplicated().sum()

In [None]:
df.shape

## 2. EDA

In [None]:
df.head()

In [None]:
# Share of each class. The wedge labels are derived from the index of the
# counts, so every wedge is named after the class it actually represents
# regardless of the order value_counts() returns them in.
class_names = {0: 'ham', 1: 'spam'}  # integer codes assigned by the LabelEncoder cell
class_counts = df['target'].value_counts()
wedge_labels = [class_names.get(code, code) for code in class_counts.index]
plt.pie(class_counts, labels=wedge_labels, autopct = '%0.2f')
plt.show()

In [None]:
# The data is imbalanced

In [None]:
# Length of each message in characters.
df['num_characters'] = df['text'].str.len()

In [None]:
# Number of tokens per message according to NLTK's word tokenizer.
df['num_words'] = df['text'].map(nltk.word_tokenize).map(len)

In [None]:
df.head()

In [None]:
# Number of sentences per message according to NLTK's sentence tokenizer.
df['num_sent'] = df['text'].map(nltk.sent_tokenize).map(len)

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
# Ham messages (target == 0): descriptive statistics of the length features
length_features = ['num_characters', 'num_words', 'num_sent']
df.loc[df['target'] == 0, length_features].describe()

In [None]:
# Spam messages (target == 1): descriptive statistics of the length features
length_features = ['num_characters', 'num_words', 'num_sent']
df.loc[df['target'] == 1, length_features].describe()

In [None]:
# Overlay the character-count distributions of ham (target 0) and spam
# (target 1). The labels plus legend identify which histogram is which.
plt.figure(figsize = (15,10))
sns.histplot(df[df['target'] == 0]['num_characters'], label='ham')
sns.histplot(df[df['target'] == 1]['num_characters'], color = 'red', label='spam')
plt.legend()

In [None]:
# Overlay the word-count distributions of ham (target 0) and spam (target 1).
# Spam is drawn in red, matching the character-count plot, and the legend
# identifies which histogram is which.
plt.figure(figsize = (15,10))
sns.histplot(df[df['target'] == 0]['num_words'], label='ham')
sns.histplot(df[df['target'] == 1]['num_words'], color = 'red', label='spam')
plt.legend()

In [None]:
# Overlay the sentence-count distributions of ham (target 0) and spam
# (target 1). Spam is drawn in red, matching the character-count plot, and
# the legend identifies which histogram is which.
plt.figure(figsize = (15,10))
sns.histplot(df[df['target'] == 0]['num_sent'], label='ham')
sns.histplot(df[df['target'] == 1]['num_sent'], color = 'red', label='spam')
plt.legend()

In [None]:
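# Most spam messages have fewer characters, words and sentences.
# NOTE(review): confirm this against the per-class describe() tables above; the spam bars are also lower simply because there are fewer spam messages.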

In [None]:
sns.pairplot(df, hue = 'target')

In [None]:
corr = df[['target','num_characters','num_words','num_sent']]

In [None]:
sns.heatmap(corr.corr(), annot= True)

In [None]:
# It seems like there is multicollinearity in the variables, so we will proceed with only one column.

## 3. Text Preprocessing

In [None]:
# 1. Lower Case
# 2. Tokenization
# 3. Removing Special Characters
# 4. Removing Stop Words and Punctuation
# 5. Stemming (transform_text below applies the Porter stemmer, not the lemmatizer)

In [None]:
from nltk.corpus import stopwords
stopwords.words('english')

In [None]:
string.punctuation

In [None]:
from nltk.stem.porter import PorterStemmer

In [None]:
ps = PorterStemmer()

In [None]:
ps.stem('dancing')

In [None]:
from nltk.stem import WordNetLemmatizer

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
# Fetch only the NLTK resources this notebook uses. A bare nltk.download()
# opens the interactive downloader, which stalls an unattended "Run All".
# 'punkt_tab' is the tokenizer data newer NLTK releases look for; on older
# releases that request just reports an unknown package and returns False.
# NOTE(review): the tokenizers and the stop-word list are already used in
# earlier cells, so on a fresh environment this cell must run before them.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
lemmatizer.lemmatize('loving')

In [None]:
def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps, in order: lower-case the text, tokenize it with
    ``nltk.word_tokenize``, keep only alphanumeric tokens, drop English stop
    words, and stem what is left with the module-level Porter stemmer ``ps``.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving stems joined by single spaces; an empty string when no
        token survives the filters.
    """
    # Load the stop words once, outside the token loop, and keep them in a
    # set so each membership test is O(1) instead of a scan of a freshly
    # read list.
    stop_words = set(stopwords.words('english'))

    tokens = nltk.word_tokenize(text.lower())

    # A token that passes isalnum() contains no punctuation characters, so no
    # separate string.punctuation check is needed.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

    return " ".join(ps.stem(tok) for tok in kept)
            

In [None]:
transform_text('Is that seriously how you spell his name?')

In [None]:
# Raw text of the row labelled 20 (label-based lookup, single indexing step).
df.loc[20, 'text']

In [None]:
df['transform_text'] = df['text'].apply(transform_text)

In [None]:
df.head()

In [None]:
# Word cloud built from all preprocessed spam messages (target == 1).
wc = WordCloud(width=500,height=500,min_font_size=10,background_color='white')
spam_text = df.loc[df['target'] == 1, 'transform_text'].str.cat(sep=" ")
span_wc = wc.generate(spam_text)
plt.figure(figsize=(15,6))
plt.imshow(span_wc)

In [None]:

# Word cloud built from all preprocessed ham messages (target == 0).
# imshow is given hamp_wc, the cloud generated in this cell, so the picture
# does not depend on the spam cloud's variable pointing at the same object.
ham_text = df.loc[df['target'] == 0, 'transform_text'].str.cat(sep=" ")
hamp_wc = wc.generate(ham_text)
plt.figure(figsize=(15,6))
plt.imshow(hamp_wc)

In [None]:
# Flatten every preprocessed spam message into one list of tokens.
spam_corpus = [
    token
    for message in df[df['target']==1]['transform_text'].tolist()
    for token in message.split()
]
        

In [None]:
len(spam_corpus)

In [None]:
# The 30 most frequent spam tokens. The counts are computed once and stored
# in a frame with named columns, so the axes read "word"/"count" instead of 0/1.
top_spam_words = pd.DataFrame(Counter(spam_corpus).most_common(30), columns=['word', 'count'])
sns.barplot(x='word', y='count', data=top_spam_words)
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Flatten every preprocessed ham message into one list of tokens.
ham_corpus = [
    token
    for message in df[df['target']==0]['transform_text'].tolist()
    for token in message.split()
]

In [None]:
len(ham_corpus)

In [None]:
# The 30 most frequent ham tokens. The counts are computed once and stored
# in a frame with named columns, so the axes read "word"/"count" instead of 0/1.
top_ham_words = pd.DataFrame(Counter(ham_corpus).most_common(30), columns=['word', 'count'])
sns.barplot(x='word', y='count', data=top_ham_words)
plt.xticks(rotation = 90)
plt.show()

## 4. Model Building

In [None]:
# NOTE(review): cv is not referenced by any later cell visible in this notebook.
cv = CountVectorizer()

# TF-IDF vectorizer whose vocabulary is capped at max_features=3000 terms.
tfidf = TfidfVectorizer(max_features=3000)

In [None]:
# Fit the TF-IDF vectorizer on the preprocessed text and convert the sparse
# result to a dense array.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so vocabulary and IDF statistics from the test rows leak into the features.
X = tfidf.fit_transform(df['transform_text']).toarray()

In [None]:
X

In [None]:
y = df['target']. values

In [None]:
y

In [None]:
# Hold out 20% of the rows for testing; random_state=2 fixes the shuffle.
# NOTE(review): the split is not stratified although the data is noted as
# imbalanced earlier in the notebook; consider stratify=y.
X_train,X_test,y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [None]:
gnb =  GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [None]:
# Gaussian naive Bayes: fit, predict on the held-out rows, then report
# accuracy, confusion matrix and precision in that order.
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred))

In [None]:
# Multinomial naive Bayes: fit, predict on the held-out rows, then report
# accuracy, confusion matrix and precision in that order.
mnb.fit(X_train, y_train)
y_pred1 = mnb.predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred1))

In [None]:
# Bernoulli naive Bayes: fit, predict on the held-out rows, then report
# accuracy, confusion matrix and precision in that order.
bnb.fit(X_train, y_train)
y_pred2 = bnb.predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred2))

In [None]:
# Persist the fitted vectorizer and the MultinomialNB model. The context
# managers flush and close each file even if pickling raises.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)
            

In [None]:
# Pipeline
# 1. Preprocessing (transform)
# 2. Vectorize
# 3. Run the algorithm