**Import Necessary Library**

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
from sklearn.preprocessing import LabelEncoder
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Load the Dataset**

In [3]:
df=pd.read_csv("/content/spam.csv",encoding="latin-1")

In [11]:
df.head() # it show only first five rows

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [27]:
df.isnull().sum() #check for missing values

Unnamed: 0,0
label,0
text,0
cleaned_text,0


In [7]:
df.shape # show no. of rows and columns in dataset

(5572, 5)

**Clean dataset**

In [12]:
# Remove other columns because that have many NAN values and columns are not useful
df=df[['v1','v2']]
df.columns=['label','text'] # rename the columns to label and text

In [37]:
df['label'].value_counts() # count the label in each class

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,4825
1,747


In [38]:
df.describe()

Unnamed: 0,label
count,5572.0
mean,0.134063
std,0.340751
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


**Preprocess the text data**

In [17]:
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
  text=text.lower()
  text = re.sub(r'[^0-9a-zA-Z]',' ',text)
  text = re.sub(r'\s+',' ',text)
  text = " ".join(word for word in text.split() if word not in STOPWORDS)
  return text

df['cleaned_text']=df['text'].apply(clean_text)

In [18]:
# labelencoder used to convert label into 0 or 1
la=LabelEncoder()
df['label']=la.fit_transform(df['label'])

In [19]:
df.head()

Unnamed: 0,label,text,cleaned_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though


**TfidfVectorizer**

In [22]:
tfidf=TfidfVectorizer(stop_words='english',max_features=3000)
X=tfidf.fit_transform(df['cleaned_text']).toarray()
y=df['label']

**Train_Test split**

In [28]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

**Ensemble Model: Voting Classifier**

In [30]:
m1=MultinomialNB()
m2=LogisticRegression(max_iter=1000) # logisitic Regression model
m3=SVC(probability=True) # support vector classifier

In [32]:
# voting model that combine multiple base model
voting_m=VotingClassifier(estimators=[('nb',m1),('lr',m2),('svc',m3)],voting='soft')
voting_m.fit(X_train,y_train)

**Evaluation**

In [34]:
y_pred=voting_m.predict(X_test)

In [36]:
print("Accuracy:",accuracy_score(y_test,y_pred))
print("Confusion Matrix:\n",confusion_matrix(y_test,y_pred))
print("Classification Report:\n",classification_report(y_test,y_pred))

Accuracy: 0.9775784753363229
Confusion Matrix:
 [[962   3]
 [ 22 128]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.98      0.85      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

