In [None]:
"""
This script uses the emails.csv dataset, found in the DATA folder of the DS4002_CS3 repository, 
to clean and preprocess textual email data. You will have to write the preprocessing code. Use the 
materials in the REFERENCE FILES folder of the repository if you need help filling in the code.
This script also performs some exploratory data analysis, providing informative graphs and tables that 
reflect relationships between key variables. It is imperative to run this file to preprocess the 
textual data before performing any further analyses/running ML algorithms on the data. 
"""

In [None]:
# Import packages 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import re

import nltk
# Fetch the NLTK resources this notebook relies on: the English stop-word list
# and the Punkt models used by word_tokenize / sent_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load Punkt from the 'punkt_tab' resource rather than the
# pickled 'punkt' package; download both so the tokenizers work on old and new
# versions alike.
nltk.download('punkt_tab')
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
#from sklearn.preprocessing import LabelEncoder

In [None]:
# Location of the raw dataset; change this path if the CSV lives elsewhere
EMAILS_CSV_PATH = "emails.csv"

df = pd.read_csv(EMAILS_CSV_PATH)
df.head()

In [None]:
# Make the label column readable: map the numeric values to text first
# (keyed on the original 'spam' column name), then rename both columns.
label_names = {1: 'Spam', 0: 'Not Spam'}
column_names = {'spam': 'Spam or Not Spam', 'text': 'Email Text'}
df = df.replace({'spam': label_names}).rename(columns=column_names)

# Basic overview of the dataset
print("Dataset Shape:", df.shape)
print("Column Names:", df.columns)
print("\nFirst 5 rows:")
print(df.head())

# Number of missing values in each column
print("\nMissing Values:")
print(df.isnull().sum())

# Character length of every email; values are stringified before measuring
df["Text Length"] = df["Email Text"].map(lambda text: len(str(text)))
print("\nText Length Statistics:")
print(df["Text Length"].describe())

In [None]:
# Preview the first rows of the dataframe (rich notebook display)
df.head()

In [None]:
# Count the emails in each class once and reuse the result. A bare expression
# in the middle of a cell is never displayed, so print the counts explicitly.
class_counts = df['Spam or Not Spam'].value_counts()
print(class_counts)

# Percentage of Emails that are Spam and Not Spam
plt.figure(figsize=(8, 6))
plt.pie(class_counts, labels=class_counts.index, autopct='%1.1f%%', startangle=140)
plt.title('Distribution of Spam and Not Spam Emails')
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

In [None]:
# Histogram (with a KDE overlay) of email length measured in characters
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df["Text Length"], bins=30, kde=True, ax=ax)
ax.set_title("Distribution of Email Lengths")
ax.set_xlabel("Number of Characters")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Average Character Count of Spam and Not Spam Emails
spam_rows = df['Spam or Not Spam'] == 'Spam'
ham_rows = df['Spam or Not Spam'] == 'Not Spam'
avg_length_spam = df.loc[spam_rows, 'Text Length'].mean()
avg_length_ham = df.loc[ham_rows, 'Text Length'].mean()

print("Average Character Count of Spam Emails:", avg_length_spam)
print("Average Character Count of Not Spam Emails:", avg_length_ham)

# Side-by-side bars for the two class means
plt.bar(['Spam', 'Not Spam'], [avg_length_spam, avg_length_ham], color=['red', 'Blue'])
plt.title('Average Character Count in Spam vs. Not Spam')
plt.xlabel('Email Type')
plt.ylabel('Average Character Count')
plt.show()

In [None]:
# Number of NLTK word tokens in each email
df['Word Count'] = df['Email Text'].map(lambda text: len(word_tokenize(text)))

# Average Word Count in Spam and Not Spam Emails
spam_rows = df['Spam or Not Spam'] == 'Spam'
ham_rows = df['Spam or Not Spam'] == 'Not Spam'
avg_words_spam = df.loc[spam_rows, 'Word Count'].mean()
avg_words_ham = df.loc[ham_rows, 'Word Count'].mean()

print("Average Words of Spam Emails:", avg_words_spam)
print("Average Words of Not Spam Emails:", avg_words_ham)

# Side-by-side bars for the two class means
plt.bar(['Spam', 'Not Spam'], [avg_words_spam, avg_words_ham], color=['red', 'Blue'])
plt.title('Average Word Count in Spam vs. Not Spam')
plt.xlabel('Email Type')
plt.ylabel('Average Word Count')
plt.show()

In [None]:
# Number of NLTK sentence tokens in each email
df['Sentence Count'] = df['Email Text'].map(lambda text: len(sent_tokenize(text)))

# Average Sentence Count in Spam and Not Spam Emails
spam_rows = df['Spam or Not Spam'] == 'Spam'
ham_rows = df['Spam or Not Spam'] == 'Not Spam'
avg_sentence_spam = df.loc[spam_rows, 'Sentence Count'].mean()
avg_sentence_ham = df.loc[ham_rows, 'Sentence Count'].mean()

print("Average Sentences of Spam Emails:", avg_sentence_spam)
print("Average Sentences of Not Spam Emails:", avg_sentence_ham)

# Side-by-side bars for the two class means
plt.bar(['Spam', 'Not Spam'], [avg_sentence_spam, avg_sentence_ham], color=['red', 'Blue'])
plt.title('Average Sentence Count in Spam vs. Not Spam')
plt.xlabel('Email Type')
plt.ylabel('Average Sentence Count')
plt.show()

In [None]:
# TEXT PREPROCESSING 

# This is the most important step in analyzing textual data! Without properly cleaned/preprocessed data, your 
# models may not run or they may be extremely ineffective. 

In [None]:
# In the spaces below, write the following code:

In [None]:
# Convert text to lowercase 




In [None]:
# Tokenize sentences 




In [None]:
# Remove stop words and special characters using the following list of stop words.
# 'subject' is added to NLTK's English list because all email text contains
# the word 'subject'.
stop_words = stopwords.words('english') + ['subject']




In [None]:
# Stemming - convert words to their root form for standardization




In [None]:
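# Convert the preprocessed text back to a string and store it in a new
# 'Preprocessed Text' column (the analysis cells below read df['Preprocessed Text'])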




In [None]:
# Preview the first rows of the dataframe after the preprocessing cells above
df.head()

In [None]:
# Spam word frequency analysis

# Preprocessed text of every email labelled 'Spam'
spam_emails = df.loc[df['Spam or Not Spam'] == 'Spam', 'Preprocessed Text']

# Merge all spam emails into one string, then split on whitespace to get words
all_spam_text = ' '.join(spam_emails)
spam_words = all_spam_text.split()

# Tally each word and keep the ten most frequent
word_counts = Counter(spam_words)
most_common_words = word_counts.most_common(10)
print("Top 10 Most Common Words in Spam Emails:", most_common_words)

In [None]:
# Not Spam word frequency analysis

not_spam_emails = df[df['Spam or Not Spam'] == 'Not Spam']['Preprocessed Text']
# Tokenize the text in not-spam emails: join them into one string and split on whitespace
not_spam_words = ' '.join(not_spam_emails).split()
# Count occurrences of each word
# NOTE: this reuses the name `word_counts`, replacing the spam counter built in the spam cell
word_counts = Counter(not_spam_words)
# Find the most common words
most_common_words_ham = word_counts.most_common(10)
# Label corrected: this list describes the not-spam emails, not the spam ones
print("Top 10 Most Common Words in Not Spam Emails:", most_common_words_ham)

In [None]:
# Word cloud visualization for spam emails
spam_text = " ".join(spam_words)
spam_cloud_builder = WordCloud(width=800, height=400, background_color="black")
wordcloud_spam = spam_cloud_builder.generate(spam_text)

# Render the cloud as an image with the axes hidden
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud_spam, interpolation="bilinear")
ax.axis("off")
ax.set_title("WordCloud for Spam Emails")
plt.show()

In [None]:
# Word cloud visualization for non-spam emails
not_spam_text = " ".join(not_spam_words)
not_spam_cloud_builder = WordCloud(width=800, height=400, background_color="white")
wordcloud_not_spam = not_spam_cloud_builder.generate(not_spam_text)

# Render the cloud as an image with the axes hidden
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud_not_spam, interpolation="bilinear")
ax.axis("off")
ax.set_title("WordCloud for Non-Spam Emails")
plt.show()

In [None]:
# Write the dataframe (including the preprocessed text) to CSV without the index column
PREPROCESSED_CSV_PATH = 'preprocessed_data.csv'
df.to_csv(PREPROCESSED_CSV_PATH, index=False)