In [38]:
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [25]:
# Fetch the NLTK tokenizer, stopword and WordNet data used by the cells below
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# English stopwords as a set, so membership checks during filtering are cheap
stop_words = set(stopwords.words('english'))

# Kept as the cell's last expression so its return value is still displayed
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...


True

In [10]:
# Load the email dataset; the relative path is resolved against the working directory
df = pd.read_csv("emails.csv")

In [11]:
# Show the freshly loaded frame
df

Unnamed: 0,Text,Spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [12]:
# Count missing values in each column
df.isna().sum()

Text    0
Spam    0
dtype: int64

In [15]:
def preprocess_email(text):
    """Normalize one email for vectorization.

    Lowercases the text, removes every character listed in
    ``string.punctuation``, tokenizes the result with ``word_tokenize`` and
    drops tokens that appear in the module-level ``stop_words`` set.

    Args:
        text: Raw email text as a ``str``.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Requires `import string` in the notebook's import cell.
    # str.translate strips all punctuation in one pass instead of a
    # per-character Python loop; the removed character set is unchanged.
    punctuation_table = str.maketrans("", "", string.punctuation)
    text = text.lower().translate(punctuation_table)
    # Tokenization
    words = word_tokenize(text)
    # Remove stopwords
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)

In [16]:
# Run every raw email through preprocess_email and store the result in a new column
df['cleaned_Text'] = df['Text'].map(preprocess_email)

In [17]:
# Show the frame, which now also carries the cleaned_Text column
df

Unnamed: 0,Text,Spam,cleaned_Text
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible corporate ident...
1,Subject: the stock trading gunslinger fanny i...,1,subject stock trading gunslinger fanny merrill...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new homes made easy im wa...
3,Subject: 4 color printing special request add...,1,subject 4 color printing special request addit...
4,"Subject: do not have money , get software cds ...",1,subject money get software cds software compat...
...,...,...,...
5723,Subject: re : research and development charges...,0,subject research development charges gpg forwa...
5724,"Subject: re : receipts from visit jim , than...",0,subject receipts visit jim thanks invitation v...
5725,Subject: re : enron case study update wow ! a...,0,subject enron case study update wow day super ...
5726,"Subject: re : interest david , please , call...",0,subject interest david please call shirley cre...


In [27]:
# Single WordNet lemmatizer instance shared by every call below
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of ``text`` and re-join them with single spaces."""
    lemmas = map(lemmatizer.lemmatize, text.split())
    return " ".join(lemmas)

# Lemmatize the cleaned text row by row into its own column
df['lemmatized_Text'] = df['cleaned_Text'].map(lemmatize_words)


In [28]:
# Show the frame, which now also carries the lemmatized_Text column
df

Unnamed: 0,Text,Spam,cleaned_Text,lemmatized_Text
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible corporate ident...,subject naturally irresistible corporate ident...
1,Subject: the stock trading gunslinger fanny i...,1,subject stock trading gunslinger fanny merrill...,subject stock trading gunslinger fanny merrill...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new homes made easy im wa...,subject unbelievable new home made easy im wan...
3,Subject: 4 color printing special request add...,1,subject 4 color printing special request addit...,subject 4 color printing special request addit...
4,"Subject: do not have money , get software cds ...",1,subject money get software cds software compat...,subject money get software cd software compati...
...,...,...,...,...
5723,Subject: re : research and development charges...,0,subject research development charges gpg forwa...,subject research development charge gpg forwar...
5724,"Subject: re : receipts from visit jim , than...",0,subject receipts visit jim thanks invitation v...,subject receipt visit jim thanks invitation vi...
5725,Subject: re : enron case study update wow ! a...,0,subject enron case study update wow day super ...,subject enron case study update wow day super ...
5726,"Subject: re : interest david , please , call...",0,subject interest david please call shirley cre...,subject interest david please call shirley cre...


In [31]:
# TF-IDF vectorizer limited to 5000 vocabulary terms (adjust max_features as needed).
# TfidfVectorizer comes from sklearn.feature_extraction.text and must be imported in the import cell.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit on the lemmatized email text and convert the sparse result to a dense array
x = tfidf_vectorizer.fit_transform(df['lemmatized_Text']).toarray()

In [33]:
# Target vector: the Spam column of df
y = df['Spam']

In [36]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# The TF-IDF feature matrix is stored in `x`, so that is what gets split here.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [40]:
# Random forest with 100 trees and a fixed seed; n_estimators and the other
# hyperparameters can be tuned
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

# Fit the forest on the training split
rf_classifier.fit(X=x_train, y=y_train)

In [45]:
# Label the held-out emails with the trained forest
y_pred = rf_classifier.predict(x_test)

# Overall fraction of correct predictions
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

# Per-class precision, recall, F1 and support
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.9869109947643979
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       856
           1       0.99      0.96      0.97       290

    accuracy                           0.99      1146
   macro avg       0.99      0.98      0.98      1146
weighted avg       0.99      0.99      0.99      1146

