<a href="https://colab.research.google.com/github/pankaj9309/Panakj_Dhande_Meta_Scifor_Technologies-Jaison_Meta_Scifor_Technologies/blob/main/MiniProject2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Importing Libraries
import string
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Fetch the NLTK corpora used below for stop-word filtering and lemmatization
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Read the dataset from disk
df = pd.read_csv('spam_ham_dataset.csv')
# Discard rows that are exact duplicates of an earlier row
df.drop_duplicates(inplace=True)
# Replace CRLF line breaks in the email text with single spaces
df['text'] = df['text'].str.replace('\r\n', ' ', regex=True)

# Shared resources for text preprocessing
stop_words = set(stopwords.words('english'))  # set gives O(1) membership tests
lemmatizer = WordNetLemmatizer()

# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize raw text into a space-joined string of lemmatized tokens.

    The text is lowercased and split on whitespace. Each token is dropped if
    it is a stop word; otherwise all punctuation is removed from it and the
    result is lemmatized.

    Args:
        text: Raw text to normalize.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    kept = []
    for token in text.lower().split():
        # Check the stop-word list while inner apostrophes are still present:
        # entries such as "you've" or "don't" would never match once all
        # punctuation has been stripped ("youve", "dont").
        if token.strip(string.punctuation) in stop_words:
            continue
        word = token.translate(_PUNCTUATION_TABLE)
        # Tokens made only of punctuation collapse to '' and are skipped.
        if word and word not in stop_words:
            kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)

# Run the text preprocessing over every email
df['processed_text'] = df['text'].apply(preprocess_text)

y = df['label_num']  # Labels: 1 for spam, 0 for ham

# Train-Test Split
# Split the processed text *before* extracting features so the TF-IDF
# vocabulary and IDF weights are learned from the training emails only.
# Fitting the vectorizer on the whole dataset leaks test-set statistics into
# the features and makes the reported accuracy optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    df['processed_text'], y, test_size=0.2, random_state=42
)

# Feature Extraction using TF-IDF (fit on the training split only)
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train).toarray()
X_test = vectorizer.transform(text_test).toarray()

# Features for the full dataset, produced by the train-fitted vectorizer;
# retained so any code that refers to X keeps working.
X = vectorizer.transform(df['processed_text']).toarray()

# Build a 100-tree random forest and fit it on the training split
# (fit() returns the fitted estimator, so the call can be chained)
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Predict the held-out split and report accuracy and the confusion matrix
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Classifying a New Email
def classify_email(email):
    """Classify a single raw email string as "Spam" or "Ham".

    The text is run through preprocess_text and the fitted TF-IDF vectorizer,
    and the trained classifier then predicts its label.

    Args:
        email: Raw email text.

    Returns:
        "Spam" when the predicted label is 1, otherwise "Ham".
    """
    cleaned = preprocess_text(email)
    features = vectorizer.transform([cleaned]).toarray()
    # predict() yields one label per input row; take the single label
    # explicitly rather than relying on the truth value of a one-element
    # array comparison.
    label = clf.predict(features)[0]
    return "Spam" if label == 1 else "Ham"

# Demo: classify one sample message and print the verdict
new_email = "Congratulations! You've won a free prize. Click here to claim."
predicted_label = classify_email(new_email)
print("The email is:", predicted_label)