In [None]:

!pip install pdfplumber

from google.colab import drive
import pdfplumber
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Mount Google Drive into the Colab filesystem so the dataset PDF can be read.
# The mount point is named once and reused for the dataset path, so the
# mounted location and the path below cannot drift apart.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

# Location of the dataset PDF inside the mounted Drive.
pdf_path = f'{drive_mount_point}/MyDrive/cipherbyte/spam/Spam Email Detection - spam.pdf'

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one newline-separated string.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``pdfplumber.open``.

    Returns:
        The text of all pages that yielded any text, in page order, with a
        single newline between consecutive pages. An empty string when no
        page yielded text.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            # Depending on the pdfplumber version, a page without extractable
            # text gives None or ''. Skip it rather than fail on `str + None`
            # or emit a spurious blank line.
            if page_text:
                page_texts.append(page_text)
    # Join with '\n' so the last line of one page is not fused with the first
    # line of the next page; callers split this text into lines.
    return '\n'.join(page_texts)

# Extract the raw text and show its beginning as a quick sanity check.
pdf_text = extract_text_from_pdf(pdf_path)

print(pdf_text[:1000])

lines = pdf_text.splitlines()

# Each line is read as "<label> <message text>". Split on the FIRST run of
# whitespace only, so the whole message stays in one cell. A plain split()
# would spread the message over one column per word, leaving just its first
# word in the text column and padding shorter lines with None.
rows = []
for line in lines:
    parts = line.split(maxsplit=1)
    # Blank lines and single-token lines carry no (label, message) pair.
    if len(parts) == 2:
        rows.append(parts)

# v1 = label, v2 = message text (the names the modelling code below expects).
# NOTE(review): a header line or a wrapped continuation line in the PDF would
# still show up as an ordinary row here - inspect the head() output to confirm.
df = pd.DataFrame(rows, columns=['v1', 'v2'])

print(df.head())
print(df.columns)

# Resources needed by preprocess_text: the stopword list and the tokenizer
# models used by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer models from 'punkt_tab' rather than
# 'punkt'; fetching both keeps word_tokenize working across versions.
nltk.download('punkt_tab')

# Built once: translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Filled on first use by _english_stopwords(); None means "not loaded yet".
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stopword set, reading the NLTK corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Loaded lazily so the corpus is only touched after it has been
        # downloaded; a frozenset gives O(1) membership tests.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Normalise a message for bag-of-words vectorisation.

    The text is lower-cased, stripped of ``string.punctuation`` characters,
    tokenised with ``word_tokenize`` and filtered of English stopwords.

    Args:
        text: The raw message. A non-string value (for example the None/NaN
            placeholder pandas uses for a missing cell) is treated as an
            empty message.

    Returns:
        The remaining tokens joined by single spaces; ``''`` when nothing
        remains or the input was not a string.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower().translate(_PUNCTUATION_TABLE)
    # Look the stopword set up once per call instead of re-reading the corpus
    # for every token.
    stop_words = _english_stopwords()
    words = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(words)

# Rows without a label (v1) or a message (v2) cannot be used for training, and
# a missing message would make the text preprocessing fail, so drop them first.
# .copy() makes the filtered frame independent before a column is reassigned.
df = df.dropna(subset=['v1', 'v2']).copy()

# Normalise every message (lower-case, no punctuation, no stopwords).
df['v2'] = df['v2'].apply(preprocess_text)


# Features are the cleaned messages, targets are their labels.
X = df['v2']
y = df['v1']

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Bag-of-words counts. The vocabulary is learned from the training split only
# and then reused unchanged for the test split.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


# Multinomial Naive Bayes classifier fitted on the training counts.
model = MultinomialNB()
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)

# Report performance on the held-out rows.
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')

def predict_spam(text):
    """Classify a single message with the module-level vectorizer and model.

    Args:
        text: Raw message text; it is cleaned with ``preprocess_text`` first.

    Returns:
        The label the fitted ``model`` predicts for the message.
    """
    cleaned = preprocess_text(text)
    # The vectorizer works on a collection of documents, hence the one-item list.
    features = vectorizer.transform([cleaned])
    predictions = model.predict(features)
    return predictions[0]

# Try the classifier on a hand-written example message and print its label.
new_email = (
    "Congratulations! You've won a $1000 gift card. "
    "Click here to claim your prize."
)
predicted_label = predict_spam(new_email)
print(predicted_label)