In [None]:
# Notebook shell escapes: install the third-party packages imported further down
# (nltk for text processing, wordcloud for the word-cloud plots).
! pip install nltk
! pip install wordcloud

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the spam dataset; the file is decoded as latin-1.
data = pd.read_csv("/content/spam.csv", encoding='latin-1')
print(data.head())
data.shape

# Keep only the label ('v1') and message ('v2') columns by dropping the three
# unnamed ones (presumably empty padding columns - confirm against the CSV).
unnamed_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
data = data.drop(columns=unnamed_columns)
print(data)

import nltk
# Fetch the NLTK stop-word corpus read later through stopwords.words('english').
nltk.download('stopwords')

import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk import word_tokenize,sent_tokenize
from nltk.stem import WordNetLemmatizer

# Two-colour palette shared by the class-wise plots in this notebook.
cols = ["#98568d", "#53fca1"]

# Bar chart of how many messages fall in each class of 'v1'.
# `palette` only takes effect through a `hue` mapping; passing it without
# `hue` is deprecated in seaborn 0.13, so the class column is assigned to both
# `x` and `hue`, and the redundant legend is suppressed.
plt.figure(figsize=(10, 5))
fg = sns.countplot(data=data, x='v1', hue='v1', palette=cols, legend=False)
fg.set_title("Count plot of classes")
fg.set_xlabel("Classes")
fg.set_ylabel("Number of data points")

# Tokenizer data for word_tokenize / sent_tokenize. Recent NLTK releases load
# the 'punkt_tab' resource instead of the pickled 'punkt' models, so both are
# fetched to keep the notebook working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Per-message size features: characters, word tokens and sentences.
# Only the 'v2' column is needed, so apply over that column rather than
# iterating over whole rows with DataFrame.apply(axis=1).
data["No_of_Characters"] = data['v2'].apply(len)
data["No_of_Words"] = data['v2'].apply(lambda text: len(nltk.word_tokenize(text)))
data["No_of_sentence"] = data['v2'].apply(lambda text: len(nltk.sent_tokenize(text)))
data.describe().T

# Pairwise relationships between the numeric features, coloured by class.
# pairplot builds its own figure, so a preceding plt.figure() would only leave
# an empty extra figure behind; plt.show() does not take a figure/grid handle,
# so it is called without arguments.
fg = sns.pairplot(data=data, hue="v1", palette=cols)
plt.show()

# Drop outliers: keep only messages shorter than MAX_MESSAGE_CHARS characters.
# The explicit .copy() makes the filtered frame independent of the original,
# so the columns added to it later cannot trigger SettingWithCopyWarning.
MAX_MESSAGE_CHARS = 350
data = data[data['No_of_Characters'] < MAX_MESSAGE_CHARS].copy()
data.shape

# Same pair plot as before, now on the outlier-filtered data.
# pairplot creates its own figure (no plt.figure() needed) and plt.show()
# takes no figure/grid argument.
fg = sns.pairplot(data=data, hue='v1', palette=cols)
plt.show()

# English stop words are excluded from the clouds.
stop_word = set(stopwords.words('english'))
word_cloud = WordCloud(
    width=800,
    height=800,
    max_words=200,
    stopwords=stop_word,
    background_color='black',
    max_font_size=200,
)

# Concatenate every message of each class into one space-separated string.
spam = data.loc[data['v1'] == 'spam', 'v2'].str.cat(sep=' ')
ham = data.loc[data['v1'] == 'ham', 'v2'].str.cat(sep=' ')

# Render one cloud per class, reusing the same WordCloud object.
for heading, text in (("Spam", spam), ("ham", ham)):
    print(heading)
    word_cloud.generate(text)
    plt.figure(figsize=(16, 8))
    plt.imshow(word_cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

# Preview the first five raw messages, one per line.
print("The first 5 texts:", *data['v2'].iloc[:5], sep="\n")

#defining a function to clean the text
import re
def Cleaning(v2):
    """Normalise one message to lowercase, space-separated alphabetic words.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lowercased, and runs of whitespace are collapsed to single spaces.

    Args:
        v2: The raw message text (a string).

    Returns:
        The cleaned text as a single string.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', v2)
    words = letters_only.lower().split()
    return ' '.join(words)
# Store the normalised version of every message in a new column and preview it.
data["clean_text"] = data["v2"].map(Cleaning)
print("The first 5 text after cleaning ", *data["clean_text"].iloc[:5], sep='\n')

# Tokenization: split each cleaned message into a list of word tokens.
data["Tokenize_text"] = data["clean_text"].apply(nltk.word_tokenize)
print(data["Tokenize_text"])
print("The first 5 text after tokenizing: ", *data["Tokenize_text"].iloc[:5], sep='\n')

#Removing stopwords function
# English stop words, loaded from the NLTK corpus on first use and then reused.
_ENGLISH_STOP_WORDS = None


def remove_stopword(v2):
    """Return the tokens of *v2* that are not English stop words.

    The stop-word set is loaded once and cached at module level; rebuilding it
    from the corpus on every call is loop-invariant work when this function is
    applied to every row of a column.

    Args:
        v2: An iterable of word tokens for one message.

    Returns:
        A new list with the non-stop-word tokens, in their original order.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return [word for word in v2 if word not in _ENGLISH_STOP_WORDS]
# Filter the stop words out of every token list and preview the result.
data["Nostopwords"] = data["Tokenize_text"].map(remove_stopword)
print("First 5 text after removing stopword: ", *data["Nostopwords"].iloc[:5], sep='\n')

# Fetch the WordNet corpus before creating the lemmatizer that relies on it.
nltk.download('wordnet')

# Single shared lemmatizer instance, used by lemmatize_word.
lemmatizer = WordNetLemmatizer()
def lemmatize_word(v2):
    """Lemmatize every token of one message, treating each token as a verb.

    Args:
        v2: An iterable of word tokens.

    Returns:
        A list with the lemma of each token, in the original order.
    """
    return [lemmatizer.lemmatize(token, pos='v') for token in v2]

# Lemmatize the stop-word-free tokens of every message and preview the result.
data["Lemmatized_text"] = data["Nostopwords"].map(lemmatize_word)
print("The First 5 Texts after lemitization: ", *data["Lemmatized_text"].iloc[:5], sep="\n")

# Build the corpus: one space-joined string of lemmas per message, ready to be
# vectorized. str.join accepts the token list directly, so no intermediate
# copy of each list is needed.
corpus = [' '.join(tokens) for tokens in data['Lemmatized_text']]
print("The first 5 lines in corpus : ", *corpus[:5], sep='\n')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Turn the corpus into TF-IDF features and materialise them as a dense array.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(corpus)
x = tfidf_matrix.toarray()
x.dtype

# Replace the string class labels in 'v1' with integer codes, in place.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(data['v1'])
data['v1'] = encoded_labels

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
y = data['v1']
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
print("Traning set shape ", x_train.shape)
print("testing set shape ", x_test.shape)

from sklearn.metrics import classification_report, accuracy_score,confusion_matrix
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay

# Fit a linear-kernel SVM on the training split and predict the held-out rows.
svm_model = SVC(kernel='linear', random_state=42)
y_pred = svm_model.fit(x_train, y_train).predict(x_test)

# Overall accuracy plus the per-class report.
# NOTE(review): target_names assumes code 0 is ham and code 1 is spam - confirm
# against label_encoder.classes_.
accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred)
classification = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=['Ham', 'Spam'],
)
print("Accuracy", accuracy_svm)
print("Classification Report :\n ", classification)

# Confusion matrix of true vs. predicted labels on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# A ROC curve is traced by sweeping a threshold over a continuous score.
# Feeding it the hard 0/1 predictions yields a single operating point and a
# misleading AUC, so the SVM's signed distance to the separating hyperplane
# (decision_function) is used as the score instead.
y_score = svm_model.decision_function(x_test)
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)  # computed once, reused for the legend and the print

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()
print("AUC:", roc_auc)
