In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re


In [None]:
# Load the SMS dataset. The path lives in one constant so both read attempts
# always point at the same file.
DATA_PATH = 'spam.csv'

try:
    df = pd.read_csv(DATA_PATH, encoding='utf-8')
except UnicodeDecodeError:
    # The file is not valid UTF-8; retry with the 'latin1' encoding.
    df = pd.read_csv(DATA_PATH, encoding='latin1')

In [None]:
# Preview the first rows instead of rendering the whole frame.
df.head()

In [None]:
# (rows, columns) of the frame as loaded from the CSV.
df.shape

# Data Cleaning

In [None]:
# Column names, dtypes and non-null counts, used to decide what to clean.
df.info()

# Drop last three cols


In [None]:
# Remove the three 'Unnamed' columns. errors='ignore' keeps the cell
# re-runnable: on a second execution the columns are already gone and the call
# is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

# Renaming cols

In [None]:
# Give the two remaining columns descriptive names.
df = df.rename(
    columns={
        'v1': 'target',
        'v2': 'text',
    }
)

In [None]:
# Preview the first rows after dropping and renaming columns.
df.head()

## Encoding the Target Column

In [None]:
# Encoder used in the next cell to turn the string labels into integers.
encoder = LabelEncoder()

In [None]:
# Store an integer-coded copy of 'target' next to the original string label.
# NOTE(review): presumably this yields ham=0 / spam=1 — confirm with
# encoder.classes_ before interpreting precision below.
df['target_encoded'] = encoder.fit_transform(df['target'])

## Check for Missing values

In [None]:
# Count of missing entries in each column.
df.isna().sum()

## Check for Duplicate values

In [None]:
# Number of rows that repeat an earlier row across all columns.
df.duplicated().sum()

In [None]:
# Keep the first occurrence of every repeated row (the pandas default).
df = df.drop_duplicates()

In [None]:
# (rows, columns) after duplicate rows were removed.
df.shape

# EDA

In [None]:
# Number of messages per class label.
df['target'].value_counts()

In [None]:
# Take the wedge labels from the value_counts index so every slice is paired
# with its own class; a hard-coded label list silently mislabels the chart if
# the class order ever differs.
target_counts = df['target'].value_counts()
plt.pie(target_counts, labels=target_counts.index, autopct="%0.2f")
plt.show()

In [None]:
## Data is imbalanced

In [None]:
# Character count per message, computed with the vectorised string accessor
# instead of a Python-level apply(len).
df['num_char'] = df['text'].str.len()

In [None]:
# NOTE(review): this cell adds a 'word_tokenize' column that copies 'text' and
# drops it again on the next line, so df ends up with the same columns it
# started with. It looks like leftover experimentation — confirm and delete.
df['word_tokenize'] = df['text']
df = df.drop(columns = ['word_tokenize'])

In [None]:
def word_tokenize(col):
    """Return how many tokens ``nltk.word_tokenize`` produces for the string ``col``."""
    tokens = nltk.word_tokenize(col)
    return len(tokens)

In [None]:
# Token count per message, using the word_tokenize helper defined above.
df['num_words'] = df['text'].map(word_tokenize)

In [None]:
# Preview the first rows with the new length features.
df.head()

In [None]:
def word_tokenize(col):    
    return len(nltk.sent_tokenize(col))

In [None]:
df['num_sentences'] = df['text'].apply(word_tokenize)

In [None]:
# Summary statistics of the numeric columns across all messages.
df.describe()

In [None]:
# Summary statistics restricted to the messages labelled 'ham'.
df.loc[df['target'] == 'ham'].describe()

In [None]:
# Summary statistics restricted to the messages labelled 'spam'.
df.loc[df['target'] == 'spam'].describe()

In [None]:
# Compare the distribution of each length feature between the two classes.
new_features = ['num_char', 'num_words', 'num_sentences']

# Set style for plots
sns.set(style="whitegrid")

# One panel per feature, driven by the list above rather than by three
# copy-pasted plotting calls.
fig, axes = plt.subplots(1, len(new_features), figsize=(15, 5))

for ax, feature in zip(axes, new_features):
    sns.kdeplot(data=df, x=feature, hue='target', fill=True, ax=ax)
    ax.set_title(f'KDE Plot for {feature}')

# Show the plots
plt.tight_layout()
plt.show()

In [None]:
# Correlate only the numeric columns. Selecting by dtype, rather than dropping
# 'target' and 'text' by name, keeps the cell working when it is re-run after
# other text columns have been added to df.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True);

In [None]:
# Preview the first rows before text preprocessing.
df.head()

## Data Preprocessing

## Lowercasing
## Word Tokenize
## Removing characters other than Alphanumeric
## Removing stopwords
## Perform Lemmatization

In [None]:
# ps is not used by transform_text; it is kept in case other cells rely on it.
ps = nltk.stem.porter.PorterStemmer()
lmt = nltk.stem.WordNetLemmatizer()

# Built once up front: the original fetched the stopword list again for every
# token and scanned it linearly. A set gives constant-time membership tests.
STOP_WORDS = set(nltk.corpus.stopwords.words('english'))
NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9]')


def transform_text(text):
    """Normalise a raw message into a space-separated string of lemmas.

    Steps, in order: lowercase the text, tokenize it with
    ``nltk.word_tokenize``, strip every non-alphanumeric character from each
    token, discard tokens that are empty or English stopwords, and lemmatize
    the survivors with the WordNet lemmatizer.

    Args:
        text: The raw message string.

    Returns:
        The lemmas joined by single spaces; an empty string when no token
        survives the filtering.
    """
    lemmas = []
    for token in nltk.word_tokenize(text.lower()):
        token = NON_ALNUM_RE.sub('', token)
        # Punctuation-only tokens collapse to '' and are skipped here.
        if token and token not in STOP_WORDS:
            lemmas.append(lmt.lemmatize(token))
    return " ".join(lemmas)

In [None]:
# Cleaned version of every message, produced by transform_text.
df['transformed_text'] = df['text'].map(transform_text)

In [None]:
# Preview the first rows including the cleaned text.
df.head()

## Total number of words in the corpus
## Total number of Unique words in the corpus

In [None]:
# Tally how often each token occurs across the cleaned corpus.
v = {}
for sent in df['transformed_text']:
    for word in sent.split():
        # A first sighting counts as 1; the original stored 0 here, which left
        # every frequency one too low.
        v[word] = v.get(word, 0) + 1

# The total token count is the sum of the per-word frequencies.
counter = sum(v.values())
print('Total number of words in Corpus --> ', counter)
print('Total number of unique words in Corpus --> ', len(v))
        

## 4. Model Building

In [None]:
# First rows of the frame that feeds the vectorizers below.
df.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Count-based vectorizer with default settings.
cv = CountVectorizer()
# TF-IDF vectorizer capped at 1000 features through max_features.
tfidf = TfidfVectorizer(max_features = 1000)

In [None]:
# Dense count matrix of the cleaned text.
# NOTE(review): X is not referenced by any later cell visible here — the split
# below uses X1 — so this dense conversion may be unnecessary.
X = cv.fit_transform(df['transformed_text']).toarray()

In [None]:
# Dense TF-IDF matrix of the cleaned text; this is the feature matrix that is
# split and used for training below.
X1 = tfidf.fit_transform(df['transformed_text']).toarray()

In [None]:
# Integer-encoded labels used as the prediction target.
y = df['target_encoded']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
# NOTE(review): tfidf was fitted on the full corpus before this split, so the
# test rows influenced the features — consider fitting on the training part only.
# NOTE(review): the notebook says the data is imbalanced, but the split is not
# stratified; consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X1,y, test_size = 0.2, random_state = 42)

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score

In [None]:
# The three Naive Bayes variants compared below, all with default settings.
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [None]:
# Gaussian NB: fit on the training split, predict the held-out split, then
# print accuracy, confusion matrix and precision in that order.
y_pred_gnb = gnb.fit(X_train, y_train).predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred_gnb))

In [None]:
# Multinomial NB: fit on the training split, predict the held-out split, then
# print accuracy, confusion matrix and precision in that order.
y_pred_mnb = mnb.fit(X_train, y_train).predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred_mnb))

In [None]:
# Bernoulli NB: fit on the training split, predict the held-out split, then
# print accuracy, confusion matrix and precision in that order.
y_pred_bnb = bnb.fit(X_train, y_train).predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred_bnb))

In [None]:
# Smoke test: run one hand-written message through the same preprocessing and
# the already-fitted TF-IDF vectorizer, then classify it with the fitted
# Multinomial NB model.
sample = "Money money money, win free moeny and enjoy ."
txt1 = transform_text(sample)
# transform (not fit_transform) reuses the fitted vectorizer.
txt2 = tfidf.transform([txt1])
mnb.predict(txt2)