In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [137]:
# Import essential libraries for data manipulation, visualization, natural language processing, model training, and evaluation.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from  sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
 


In [8]:

# Load the SMS spam collection dataset into a pandas DataFrame from a CSV file with ISO-8859-1 encoding.
df = pd.read_csv('/kaggle/input/sms-spam-collection-dataset/spam.csv', encoding='ISO-8859-1')


In [9]:
df.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [15]:
df.size

27860

In [16]:
df.shape

(5572, 5)

In [10]:
df['v1'].value_counts()

v1
ham     4825
spam     747
Name: count, dtype: int64

In [11]:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [14]:
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [27]:
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
5567    False
5568    False
5569    False
5570    False
5571    False
Length: 5572, dtype: bool

In [23]:
df['Unnamed: 2'].unique

<bound method Series.unique of 0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
5567    NaN
5568    NaN
5569    NaN
5570    NaN
5571    NaN
Name: Unnamed: 2, Length: 5572, dtype: object>

In [28]:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [29]:
# Remove unnecessary columns ('Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4') from the DataFrame to clean the dataset.
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],axis=1,inplace=True)

In [30]:
df.isnull().sum()

v1    0
v2    0
dtype: int64

In [31]:
df['v2'].size

5572

In [34]:
df['v2'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [35]:
df.columns

Index(['v1', 'v2'], dtype='object')

In [36]:
pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0
Note: you may need to restart the kernel to use updated packages.


In [37]:
df.head(10)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [39]:
# Encode the 'v1' column (ham/spam labels) into numerical format (0 for ham, 1 for spam) using LabelEncoder.
encoding = LabelEncoder()
df['v1'] = encoding.fit_transform(df['v1'])

In [40]:
df.head(10)

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [41]:
# Convert all text in the 'v2' column (SMS messages) to lowercase for uniformity in text processing.
df['v2'].str.lower()

0       go until jurong point, crazy.. available only ...
1                           ok lar... joking wif u oni...
2       free entry in 2 a wkly comp to win fa cup fina...
3       u dun say so early hor... u c already then say...
4       nah i don't think he goes to usf, he lives aro...
                              ...                        
5567    this is the 2nd time we have tried 2 contact u...
5568                will ì_ b going to esplanade fr home?
5569    pity, * was in mood for that. so...any other s...
5570    the guy did some bitching but i acted like i'd...
5571                           rofl. its true to its name
Name: v2, Length: 5572, dtype: object

In [42]:
# Define a function to remove URLs from the given text using a regular expression.

def removeurl(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r' ',text)

In [43]:
# Apply the removeurl function to the 'v2' column to remove any URLs from the SMS messages.
df['v2']=df['v2'].apply(removeurl)

In [44]:

# Define a function to remove HTML tags from the given text using a regular expression.

def removehtml(text):
    html = re.compile(r'<.*?>')
    return html.sub(r' ',text)
    

In [47]:
# Apply the removehtml function to the 'v2' column to remove any HTML tags from the SMS messages.
df['v2']= df['v2'].apply(removehtml)

In [46]:
# Define a function to remove punctuation from the given text using the string module.
def removepunctuation(text):
    punc = string.punctuation
    return text.translate(str.maketrans('','',punc))

In [48]:
# Apply the removepunctuation function to the 'v2' column to remove any punctuation from the SMS messages.
df['v2']= df['v2'].apply(removepunctuation)

In [52]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [55]:
stop_word = set(stopwords.words("english"))

In [61]:
# Define a function to remove stopwords from the given text using NLTK's word_tokenize and a predefined list of stopwords.
def remove_stopwords(text):
    token = word_tokenize(text)
    return " ".join([word for word in token if word.lower() not in stop_word])
    
    
            
            

In [62]:
# Apply the remove_stopwords function to the 'v2' column to eliminate stopwords from the SMS messages.
df['v2'] = df['v2'].apply(remove_stopwords)

In [67]:
ps = PorterStemmer()

In [68]:
# Define a function to perform stemming on the given text using NLTK's PorterStemmer.
def stemming(text):
    token = word_tokenize(text)
    return " ".join([ps.stem(word) for word in token])

In [69]:
# Apply the stemming function to the 'v2' column to reduce words to their root forms in the SMS messages.
df['v2'] = df['v2'].apply(stemming)

In [70]:
df['v2'][4]

'nah dont think goe usf live around though'

In [101]:
# Assign the preprocessed SMS messages to the feature variable X and the encoded labels (ham/spam) to the target variable y.
X = df['v2']
y = df['v1']

In [94]:
X.shape

(5572,)

In [102]:
torizer = TfidfVectorizer()  # Adjust max_features as needed
X = vectorizer.fit_transform(X)

# X_train_tfidf will be a sparse matrix, which is acceptable for most ML models

In [103]:
# Apply SMOTE to oversample the minority class in the dataset, balancing the feature variable X and target variable y.
smote = SMOTE(sampling_strategy="auto")
X_resample, y_resample = smote.fit_resample(X,y)

In [79]:
X_resample.size

140561

In [80]:
y_resample.value_counts()

v1
0    4825
1    4825
Name: count, dtype: int64

In [104]:
# Update the feature variable X and target variable y with the resampled data from SMOTE.
X = X_resample
y = y_resample

In [105]:
# Split the dataset into training and testing sets, using 25% of the data for testing 
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=42)

In [106]:
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 105434 stored elements and shape (7237, 7971)>

In [107]:
X_train.shape

(7237, 7971)

In [108]:
y_train.shape

(7237,)

In [125]:
# Initialize a Support Vector Classifier and fit it to the training data (X_train) and labels (y_train).
svc = SVC()
svc.fit(X_train,y_train)

In [134]:
# Use the trained Support Vector Classifier to make predictions on the test data (X_test).
test_predict= svc.predict(X_test)

In [135]:
# Calculate the accuracy of the Support Vector Classifier by comparing predicted labels with true labels (y_test) and convert it to a percentage.
accuracy = accuracy_score(y_test,test_predict)
accuracy  = accuracy*100
print(f"Test accuracy for SupportVectorMachine is", accuracy,"%")

Test accuracy for SupportVectorMachine is 99.70990468296726 %


In [140]:
print(classification_report(y_test,test_predict))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1226
           1       1.00      1.00      1.00      1187

    accuracy                           1.00      2413
   macro avg       1.00      1.00      1.00      2413
weighted avg       1.00      1.00      1.00      2413



In [128]:
# Initialize a Multinomial Naive Bayes classifier and fit it to the training data (X_train) and labels (y_train).
NB = MultinomialNB()
NB.fit(X_train,y_train)

In [129]:
# Use the trained Multinomial Naive Bayes classifier to make predictions on the test data (X_test).
predict_NB = NB.predict(X_test)

In [130]:
# Calculate the accuracy of the Multinomial Naive Bayes classifier by comparing predicted labels with true labels (y_test) and convert it to a percentage.
accuracy = accuracy_score(y_test,predict_NB)
accuracy  = accuracy*100
print(f"Test accuracy for Naive_bayes is", accuracy,"%")

accuracy for Naive_bayes is 98.63240779113137 %


In [131]:
lr = LogisticRegression()
lr.fit(X_train,y_train)

In [132]:
predict_lr = lr.predict(X_test)

In [133]:
accuracy = accuracy_score(y_test,predict_lr)
accuracy  = accuracy*100
print(f" Test accuracy for LogisticRegression is", accuracy,"%")

accuracy for LogisticRegression is 98.4251968503937 %
