# 02 - Exploratory Data Analysis (EDA)

### Description:
This notebook loads the preprocessed data and performs exploratory data analysis (EDA) to understand key patterns, including word frequency and email length distribution.

In [None]:
# Import necessary libraries
import ast

import matplotlib.pyplot as plt
import pandas as pd
from wordcloud import WordCloud

In [None]:
# Load preprocessed data.
# CSV has no list type: whatever the preprocessing step stored in 'clean_text'
# is read back as plain text (e.g. "['free', 'offer']"), and empty cells would
# otherwise become NaN. The cells below iterate over and measure each entry as
# a list of tokens, so rebuild real lists while loading.
def _parse_tokens(cell):
    """Turn one raw 'clean_text' CSV cell into a list of token strings.

    Args:
        cell: The cell's raw text as handed over by ``pd.read_csv`` converters.

    Returns:
        The tokens of a Python-list literal (``"['a', 'b']"``) when the cell
        holds one; otherwise the cell split on whitespace. An empty cell
        yields ``[]``.
    """
    cell = cell.strip()
    if cell.startswith('['):
        try:
            parsed = ast.literal_eval(cell)
        except (ValueError, TypeError, SyntaxError):
            # Not a valid literal after all: fall through to whitespace splitting.
            parsed = None
        if isinstance(parsed, list):
            return [str(token) for token in parsed]
    return cell.split()


cleaned_data = pd.read_csv(
    'data/processed/cleaned_data.csv',
    converters={'clean_text': _parse_tokens},
)

In [None]:
# Build the word cloud: join each entry of 'clean_text' with spaces, then join
# all of those pieces into a single string for WordCloud to consume.
per_email_text = (' '.join(tokens) for tokens in cleaned_data['clean_text'])
text = ' '.join(per_email_text)
cloud_builder = WordCloud(width=800, height=400, background_color="white")
wordcloud = cloud_builder.generate(text)

In [None]:
# Render the word cloud as an image on a 10x5 inch figure, without axes
cloud_fig, cloud_ax = plt.subplots(figsize=(10, 5))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis("off")
plt.show()

In [None]:
# Length of every 'clean_text' entry (its token count when entries are token
# lists); this feeds the histogram in the next cell.
email_lengths = cleaned_data['clean_text'].map(len)

In [None]:
# Histogram of the per-email lengths computed above, in 20 bins
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
hist_ax.hist(email_lengths, bins=20, color='blue', edgecolor='black')
hist_ax.set_title('Distribution of Email Lengths (in tokens)')
hist_ax.set_xlabel('Number of Tokens')
hist_ax.set_ylabel('Frequency')
plt.show()