In [1]:
# Importing necessary libraries from the NLTK toolkit
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize  # For tokenizing text into words and sentences

# Importing stopwords from NLTK to remove common words that add little value
from nltk.corpus import stopwords

# Downloading required NLTK datasets
nltk.download('punkt')  # Tokenizer models for sentence and word tokenization
nltk.download('punkt_tab')  # Optional: Extra support for tokenization
nltk.download('stopwords')  # Predefined stopword lists for various languages

# Downloading the dataset from Kaggle using Kaggle CLI
!kaggle datasets download -d abdallahwagih/spam-emails  # Dataset containing spam emails
!unzip spam-emails.zip  # Extracting the downloaded dataset

# Importing pandas for working with data in tabular format
import pandas as pd

# Loading the CSV dataset into a DataFrame
df = pd.read_csv("spam.csv")  # CSV contains columns like 'Message' and labels indicating spam or not

# Step to clean the text data:
# - Removing punctuation, special characters, and multiple spaces
# - Preparing data for tokenization and further text processing

import re  # Regular expressions for text cleaning

cleaned = []  # List to store cleaned text
for text in df['Message']:  # Looping through each message in the 'Message' column
    cleaned_text = re.sub(r'[^\w\s]', '', text)  # Removing all characters except words and spaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Replacing multiple spaces with a single space
    cleaned_data = cleaned_text.strip()  # Stripping leading and trailing whitespace
    cleaned.append(cleaned_data)  # Adding the cleaned text to the list

# Tokenizing the cleaned text into words
# This step splits each cleaned text into a list of words
tokens = [word_tokenize(x) for x in cleaned]

# Removing stopwords from tokenized words
# Stopwords are commonly used words like "is", "the", "and", etc., which are removed to reduce noise
stop = set(stopwords.words('english'))  # Fetching the list of English stopwords
stpktn = []  # List to store stopword-removed tokens
for k in range(len(df['Message'])):  # Loop through the tokenized text
    p = [i for i in tokens[k] if i not in stop]  # Filter out tokens that are in the stopword list
    stpktn.append(p)  # Append the filtered tokens to the list

# Summary of steps:
# 1. Dataset is downloaded and loaded into a pandas DataFrame.
# 2. Text messages are cleaned by removing punctuation, special characters, and extra spaces.
# 3. The cleaned text is tokenized into words.
# 4. Stopwords are removed to focus on meaningful words.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Dataset URL: https://www.kaggle.com/datasets/abdallahwagih/spam-emails
License(s): apache-2.0
Downloading spam-emails.zip to /content
  0% 0.00/207k [00:00<?, ?B/s]
100% 207k/207k [00:00<00:00, 55.9MB/s]
Archive:  spam-emails.zip
  inflating: spam.csv                


In [2]:
from nltk.stem import PorterStemmer
ps=PorterStemmer()


In [3]:
stemmed_tokens = []
for message in stpktn:
    st=[ps.stem(word) for word in message]
    stemmed_tokens.append(st)

In [4]:
stemmed_tokens

[['go',
  'jurong',
  'point',
  'crazi',
  'avail',
  'bugi',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'cine',
  'got',
  'amor',
  'wat'],
 ['ok', 'lar', 'joke', 'wif', 'u', 'oni'],
 ['free',
  'entri',
  '2',
  'wkli',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkt',
  '21st',
  'may',
  '2005',
  'text',
  'fa',
  '87121',
  'receiv',
  'entri',
  'questionstd',
  'txt',
  'ratetc',
  'appli',
  '08452810075over18'],
 ['u', 'dun', 'say', 'earli', 'hor', 'u', 'c', 'alreadi', 'say'],
 ['nah', 'i', 'dont', 'think', 'goe', 'usf', 'live', 'around', 'though'],
 ['freemsg',
  'hey',
  'darl',
  '3',
  'week',
  'word',
  'back',
  'id',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chg',
  'send',
  '150',
  'rcv'],
 ['even',
  'brother',
  'like',
  'speak',
  'they',
  'treat',
  'like',
  'aid',
  'patent'],
 ['as',
  'per',
  'request',
  'mell',
  'mell',
  'oru',
  'minnaminungint',
  'nurungu',
  'vettam',
  'set',
  'callertun',
  'call

In [5]:
from nltk.tag import pos_tag
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [6]:
pos_tag(['studying','running','aiml'])

[('studying', 'VBG'), ('running', 'VBG'), ('aiml', 'NN')]

In [7]:
pos_token=[pos_tag(message) for message in stpktn ]
pos_token

[[('Go', 'VB'),
  ('jurong', 'JJ'),
  ('point', 'NN'),
  ('crazy', 'NN'),
  ('Available', 'NNP'),
  ('bugis', 'NN'),
  ('n', 'RB'),
  ('great', 'JJ'),
  ('world', 'NN'),
  ('la', 'NN'),
  ('e', 'VBP'),
  ('buffet', 'JJ'),
  ('Cine', 'NNP'),
  ('got', 'VBD'),
  ('amore', 'RB'),
  ('wat', 'JJ')],
 [('Ok', 'NNP'),
  ('lar', 'JJ'),
  ('Joking', 'NNP'),
  ('wif', 'NN'),
  ('u', 'NN'),
  ('oni', 'NN')],
 [('Free', 'JJ'),
  ('entry', 'NN'),
  ('2', 'CD'),
  ('wkly', 'JJ'),
  ('comp', 'NN'),
  ('win', 'VBP'),
  ('FA', 'NNP'),
  ('Cup', 'NNP'),
  ('final', 'JJ'),
  ('tkts', 'NN'),
  ('21st', 'CD'),
  ('May', 'NNP'),
  ('2005', 'CD'),
  ('Text', 'NNP'),
  ('FA', 'NNP'),
  ('87121', 'CD'),
  ('receive', 'JJ'),
  ('entry', 'NN'),
  ('questionstd', 'NN'),
  ('txt', 'NN'),
  ('rateTCs', 'NN'),
  ('apply', 'VBP'),
  ('08452810075over18s', 'CD')],
 [('U', 'JJ'),
  ('dun', 'NNS'),
  ('say', 'VBP'),
  ('early', 'JJ'),
  ('hor', 'NN'),
  ('U', 'NNP'),
  ('c', 'NN'),
  ('already', 'RB'),
  ('say', 'VB')],

In [8]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer=WordNetLemmatizer()


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [9]:
lemmatizer.lemmatize('studying','v')

'study'

In [10]:
from nltk.corpus import wordnet
nltk.download('wordnet')
def get_wordnet(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
pos_token[0][0]

('Go', 'VB')

In [12]:
get_wordnet(pos_token[0][0][1])

'v'

In [13]:
lem_data=[]
for x in range(len(pos_token)):
    lem=[lemmatizer.lemmatize(pos_token[x][i][0],get_wordnet(pos_token[x][i][1])) for i in range(len(pos_token[x]))]
    lem_data.append(lem)

In [14]:
lem_data

[['Go',
  'jurong',
  'point',
  'crazy',
  'Available',
  'bugis',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'Cine',
  'get',
  'amore',
  'wat'],
 ['Ok', 'lar', 'Joking', 'wif', 'u', 'oni'],
 ['Free',
  'entry',
  '2',
  'wkly',
  'comp',
  'win',
  'FA',
  'Cup',
  'final',
  'tkts',
  '21st',
  'May',
  '2005',
  'Text',
  'FA',
  '87121',
  'receive',
  'entry',
  'questionstd',
  'txt',
  'rateTCs',
  'apply',
  '08452810075over18s'],
 ['U', 'dun', 'say', 'early', 'hor', 'U', 'c', 'already', 'say'],
 ['Nah', 'I', 'dont', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['FreeMsg',
  'Hey',
  'darling',
  '3',
  'week',
  'word',
  'back',
  'Id',
  'like',
  'fun',
  'still',
  'Tb',
  'ok',
  'XxX',
  'std',
  'chgs',
  'send',
  '150',
  'rcv'],
 ['Even',
  'brother',
  'like',
  'speak',
  'They',
  'treat',
  'like',
  'aid',
  'patent'],
 ['As',
  'per',
  'request',
  'Melle',
  'Melle',
  'Oru',
  'Minnaminunginte',
  'Nurungu',
  'Vettam',
  'set',
  'c

In [15]:
stemmed_tokens

[['go',
  'jurong',
  'point',
  'crazi',
  'avail',
  'bugi',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'cine',
  'got',
  'amor',
  'wat'],
 ['ok', 'lar', 'joke', 'wif', 'u', 'oni'],
 ['free',
  'entri',
  '2',
  'wkli',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkt',
  '21st',
  'may',
  '2005',
  'text',
  'fa',
  '87121',
  'receiv',
  'entri',
  'questionstd',
  'txt',
  'ratetc',
  'appli',
  '08452810075over18'],
 ['u', 'dun', 'say', 'earli', 'hor', 'u', 'c', 'alreadi', 'say'],
 ['nah', 'i', 'dont', 'think', 'goe', 'usf', 'live', 'around', 'though'],
 ['freemsg',
  'hey',
  'darl',
  '3',
  'week',
  'word',
  'back',
  'id',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chg',
  'send',
  '150',
  'rcv'],
 ['even',
  'brother',
  'like',
  'speak',
  'they',
  'treat',
  'like',
  'aid',
  'patent'],
 ['as',
  'per',
  'request',
  'mell',
  'mell',
  'oru',
  'minnaminungint',
  'nurungu',
  'vettam',
  'set',
  'callertun',
  'call

In [16]:
' '.join(stemmed_tokens[0])

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [17]:
stem_vec=[''.join(message) for message in stemmed_tokens]

In [18]:
stem_vec

['gojurongpointcraziavailbugingreatworldlaebuffetcinegotamorwat',
 'oklarjokewifuoni',
 'freeentri2wklicompwinfacupfinaltkt21stmay2005textfa87121receiventriquestionstdtxtratetcappli08452810075over18',
 'udunsayearlihorucalreadisay',
 'nahidontthinkgoeusflivearoundthough',
 'freemsgheydarl3weekwordbackidlikefunstilltbokxxxstdchgsend150rcv',
 'evenbrotherlikespeaktheytreatlikeaidpatent',
 'asperrequestmellmelloruminnaminungintnurunguvettamsetcallertuncallerpress9copifriendcallertun',
 'winnerasvalunetworkcustomselectreceivea900prizerewardtoclaimcall09061701461claimcodekl341valid12hour',
 'hadmobil11monthurentitlupdatlatestcolourmobilcamerafreecallthemobilupdatcofree08002986030',
 'imgonnahomesoondontwanttalkstuffanymortonightkivecrienoughtoday',
 'sixchancwincashfrom10020000poundtxtcsh11send87575cost150pday6day16tsandcapplireplihl4info',
 'urgentyou1weekfreemembership100000prizejackpottxtwordclaimno81010tcwwwdbuknetlccltdpobox4403ldnw1a7rw18',
 'ivesearchrightwordthankbreatheripromiswont

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()

In [20]:
x=cv.fit_transform(stem_vec)

In [21]:
x=cv.fit_transform(stem_vec).toarray()

In [22]:
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [23]:
#ML ALGORITHMS(IMPORTANT IN PERSPECTIVE OF INTERVIEWS)
#SUPERVISED ML
#1.Linear Regression
#2.Logistic regression
#3.Ridge Regression
#4.Lasso Regression
#5.KNN regression
#6.KNN Classifier
#7.Naive Bayes
#8.SVM
#9.SVC
#10.Decision Tree Regression
#11.Decision Tree Classifier
#12.Ensemble(Bagging)
#13.Random forest Regression
#14.Random forest classifier
#15.Ensemble(Boosting)
#16.Gradient Boost Regression
#17.Gradient Boost Classifier
#18.XG boost Regression
#19.XG Boost Classifier
#20.Light BGM
#UNSUPERVISED ML
#1.K means clustering
#2.Hierarchical Clustering - Agglomerative
#3.DB SCAN
#4.Dimensionality reduction
#5.PCA(Unsupervised)
#6.LDA(Supervised)

In [24]:
y = df['Category']

In [25]:
#IMPLEMENTING SUPERVISED ML (WE USE NAIVE BAYES HERE WHICH PREDICTS THE MAIL IS SPAM OR NOT)
#importing multinomialnb from sklearn(naiv_bayes)
from sklearn.naive_bayes import MultinomialNB

In [26]:
mb=MultinomialNB()

In [27]:
mb.fit(x,y)

In [28]:
df['Message'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [29]:
mb.predict([x[0]])

array(['ham'], dtype='<U4')