In [20]:
import pandas as pd
import numpy as np
from collections import defaultdict
import re



df = pd.read_csv('C:/Users/lenovo/Downloads/SMSSpamCollection.txt', sep='\t', header=None, names=['Category', 'SMS'])

def preprocess(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    return text

df['SMS'] = df['SMS'].apply(preprocess)


ham_messages = df[df['Category'] == 'ham']['SMS']
spam_messages = df[df['Category'] == 'spam']['SMS']

# Step 3: Calculate prior probabilities
p_ham = len(ham_messages) / len(df)
p_spam = len(spam_messages) / len(df)

# Step 4: Calculate word frequencies and likelihoods
ham_word_counts = defaultdict(int)
spam_word_counts = defaultdict(int)

# Count word occurrences
for message in ham_messages:
    for word in message.split():
        ham_word_counts[word] += 1

for message in spam_messages:
    for word in message.split():
        spam_word_counts[word] += 1

# Total number of words in each category
total_ham_words = sum(ham_word_counts.values())
total_spam_words = sum(spam_word_counts.values())

# Vocabulary size
vocab = set(list(ham_word_counts.keys()) + list(spam_word_counts.keys()))
vocab_size = len(vocab)

# Calculate likelihoods with Laplace smoothing
def calculate_likelihood(word_counts, total_words, word, vocab_size):
    return (word_counts[word] + 1) / (total_words + vocab_size)

# Step 5: Define function for Naive Bayes prediction
def classify_message(message):
    message = preprocess(message)
    ham_prob = np.log(p_ham)
    spam_prob = np.log(p_spam)
    
    for word in message.split():
        ham_prob += np.log(calculate_likelihood(ham_word_counts, total_ham_words, word, vocab_size))
        spam_prob += np.log(calculate_likelihood(spam_word_counts, total_spam_words, word, vocab_size))
    
    return 'ham' if ham_prob > spam_prob else 'spam'

# Test the classifier on sample messages
df['Predicted'] = df['SMS'].apply(classify_message)
print(df[['Category', 'Predicted']])


     Category Predicted
0         ham       ham
1         ham       ham
2        spam      spam
3         ham       ham
4         ham       ham
...       ...       ...
5567     spam      spam
5568      ham       ham
5569      ham       ham
5570      ham       ham
5571      ham       ham

[5572 rows x 2 columns]


In [21]:
import pandas as pd
import nltk
import re

In [22]:
import pandas as pd


with open("C:/Users/lenovo/Downloads/SMSSpamCollection.txt", "r") as f:
    lines = f.readlines()

data = [line.strip().split('\t') for line in lines]
df = pd.DataFrame(data, columns=['label', 'message'])



In [23]:
testing_data=df.iloc[int(0.75*len(df)):-1]
testing_data
training_data=df.iloc[:int(0.75*len(df))]


In [24]:
ham_mails=df[df["label"]=="ham"]
spam_mails=df[df["label"]=="spam"]
ham_mails

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
6,ham,Even my brother is not like to speak with me. ...
...,...,...
5567,ham,Huh y lei...
5570,ham,Will Ã¼ b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other s..."
5572,ham,The guy did some bitching but I acted like i'd...


In [25]:

text=""
for i in training_data["message"]:
    text+=i

cleaned_text = re.sub(r'[^\w\s]', '', text)
words=cleaned_text.split()
words

['Go',
 'until',
 'jurong',
 'point',
 'crazy',
 'Available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'Cine',
 'there',
 'got',
 'amore',
 'watOk',
 'lar',
 'Joking',
 'wif',
 'u',
 'oniFree',
 'entry',
 'in',
 '2',
 'a',
 'wkly',
 'comp',
 'to',
 'win',
 'FA',
 'Cup',
 'final',
 'tkts',
 '21st',
 'May',
 '2005',
 'Text',
 'FA',
 'to',
 '87121',
 'to',
 'receive',
 'entry',
 'questionstd',
 'txt',
 'rateTCs',
 'apply',
 '08452810075over18sU',
 'dun',
 'say',
 'so',
 'early',
 'hor',
 'U',
 'c',
 'already',
 'then',
 'sayNah',
 'I',
 'dont',
 'think',
 'he',
 'goes',
 'to',
 'usf',
 'he',
 'lives',
 'around',
 'here',
 'thoughFreeMsg',
 'Hey',
 'there',
 'darling',
 'its',
 'been',
 '3',
 'weeks',
 'now',
 'and',
 'no',
 'word',
 'back',
 'Id',
 'like',
 'some',
 'fun',
 'you',
 'up',
 'for',
 'it',
 'still',
 'Tb',
 'ok',
 'XxX',
 'std',
 'chgs',
 'to',
 'send',
 'Â150',
 'to',
 'rcvEven',
 'my',
 'brother',
 'is',
 'not',
 'like',
 'to',
 'speak',


In [26]:
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer, LancasterStemmer
from nltk.corpus import wordnet

# Download necessary data
nltk.download('wordnet')
nltk.download('omw-1.4')  # for additional WordNet data

# Initialize lemmatizer and stemmers
lemmatizer = WordNetLemmatizer()
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()

# Example words
# words = ["running", "ran", "easily", "fairly", "better"]

# Applying lemmatization

# Applying stemming
porter_stemmed_words = [porter_stemmer.stem(word) for word in words]
print("Porter Stemmed words:", porter_stemmed_words)

lancaster_stemmed_words = [lancaster_stemmer.stem(word) for word in porter_stemmed_words]
print("Lancaster Stemmed words:", lancaster_stemmed_words)
lemmatized_words = [lemmatizer.lemmatize(word, pos=wordnet.VERB) for word in lancaster_stemmed_words]
print("Lemmatized words:", lemmatized_words)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Porter Stemmed words: ['go', 'until', 'jurong', 'point', 'crazi', 'avail', 'onli', 'in', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amor', 'watok', 'lar', 'joke', 'wif', 'u', 'onifre', 'entri', 'in', '2', 'a', 'wkli', 'comp', 'to', 'win', 'fa', 'cup', 'final', 'tkt', '21st', 'may', '2005', 'text', 'fa', 'to', '87121', 'to', 'receiv', 'entri', 'questionstd', 'txt', 'ratetc', 'appli', '08452810075over18su', 'dun', 'say', 'so', 'earli', 'hor', 'u', 'c', 'alreadi', 'then', 'saynah', 'i', 'dont', 'think', 'he', 'goe', 'to', 'usf', 'he', 'live', 'around', 'here', 'thoughfreemsg', 'hey', 'there', 'darl', 'it', 'been', '3', 'week', 'now', 'and', 'no', 'word', 'back', 'id', 'like', 'some', 'fun', 'you', 'up', 'for', 'it', 'still', 'tb', 'ok', 'xxx', 'std', 'chg', 'to', 'send', 'â150', 'to', 'rcveven', 'my', 'brother', 'is', 'not', 'like', 'to', 'speak', 'with', 'me', 'they', 'treat', 'me', 'like', 'aid', 'patenta', 'per', 'your', 'request', 'mell', 'mell', 'oru

In [27]:
vocubulary=list(set(lemmatized_words))

In [28]:
n=len(vocubulary)
n

9144

In [29]:
zero_dict=pd.DataFrame()
for i in vocubulary:
    zero_dict[i]=[0]*n
zero_dict

  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i]=[0]*n
  zero_dict[i

Unnamed: 0,messageforward,wwwldewcomsubs161win150ppmx3th,um,landlineonlyhad,doc,09094100151,thk,grant,shortnew,kis,...,unsoldh,knowth,homecheck,porn,friendssorr,symbol,pub,appt,persay,housegim
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9139,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9140,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9141,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9142,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
c=zero_dict[:4181]
c

Unnamed: 0,messageforward,wwwldewcomsubs161win150ppmx3th,um,landlineonlyhad,doc,09094100151,thk,grant,shortnew,kis,...,unsoldh,knowth,homecheck,porn,friendssorr,symbol,pub,appt,persay,housegim
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4176,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4177,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4178,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4179,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:

training_data.loc[:, 'w'] = training_data['message'].str.split()
d=pd.DataFrame(training_data,c)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data.loc[:, 'w'] = training_data['message'].str.split()


ValueError: Index data must be 1-dimensional

In [None]:
training_data

Unnamed: 0,label,message,w
0,ham,"Go until jurong point, crazy.. Available only ...","[Go, until, jurong, point,, crazy.., Available..."
1,ham,Ok lar... Joking wif u oni...,"[Ok, lar..., Joking, wif, u, oni...]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, in, 2, a, wkly, comp, to, win, F..."
3,ham,U dun say so early hor... U c already then say...,"[U, dun, say, so, early, hor..., U, c, already..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[Nah, I, don't, think, he, goes, to, usf,, he,..."
...,...,...,...
4175,ham,And pls pls drink plenty plenty water,"[And, pls, pls, drink, plenty, plenty, water]"
4176,ham,How are you doing. How's the queen. Are you go...,"[How, are, you, doing., How's, the, queen., Ar..."
4177,ham,He's in lag. That's just the sad part but we k...,"[He's, in, lag., That's, just, the, sad, part,..."
4178,ham,Ok lor then we go tog lor...,"[Ok, lor, then, we, go, tog, lor...]"


In [None]:
def clean(text):
    return re.sub(r'[^\w\s]', '', text)