1. imports & settings

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot style: seaborn "whitegrid" theme plus a wide default figure size.
# `sns.set_theme` is the preferred interface; `sns.set` is only an alias for it
# that seaborn documents as possibly being removed in the future.
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)

# Paths, all relative to the directory the notebook is run from.
RAW_DIR = Path("../data/raw")
PROC_DIR = Path("../data/processed")
VIZ_DIR = Path("../visualization")
# Create the output directory for saved figures; parents=True keeps this working
# if VIZ_DIR is later pointed at a nested path.
VIZ_DIR.mkdir(parents=True, exist_ok=True)

2. load saved data

In [2]:
# Read the sentiment-labeled messages from the processed-data directory.
labeled_path = PROC_DIR / "labeled_messages.csv"
df = pd.read_csv(labeled_path)

# Report the row count, then preview the first rows as the cell's output.
print("Rows:", df.shape[0])
df.head()

Rows: 2191


Unnamed: 0,Subject,body,date,from,text,Sentiment
0,EnronOptions Update!,EnronOptions Announcement\n\n\nWe have updated...,5/10/2010,sally.beck@enron.com,EnronOptions Announcement\n\n\nWe have updated...,Positive
1,(No Subject),"Marc,\n\nUnfortunately, today is not going to ...",7/29/2010,eric.bass@enron.com,"Marc,\n\nUnfortunately, today is not going to ...",Neutral
2,Phone Screen Interview - Shannon L. Burnham,"When: Wednesday, June 06, 2001 10:00 AM-11:00 ...",7/25/2011,sally.beck@enron.com,"When: Wednesday, June 06, 2001 10:00 AM-11:00 ...",Neutral
3,RE: My new work email,we were thinking papasitos (we can meet somewh...,3/25/2010,johnny.palmer@enron.com,we were thinking papasitos (we can meet somewh...,Negative
4,Bet,Since you never gave me the $20 for the last t...,5/21/2011,lydia.delgado@enron.com,Since you never gave me the $20 for the last t...,Neutral


date cleaning & parsing

In [4]:
def clean_date(x):
    """Parse a raw ``M/D/YYYY`` date value into a pandas ``Timestamp``.

    Parameters
    ----------
    x : object
        Raw cell value. Strings have surrounding whitespace removed before
        any check or parsing, so ``" 5/10/2010 "`` parses like ``"5/10/2010"``.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        For scalar input: the parsed timestamp, or ``NaT`` when the value is
        ``None``, an empty/blank string, a placeholder beginning with ``#``
        (e.g. ``'########'``), or does not match ``%m/%d/%Y``.
    """
    if isinstance(x, str):
        x = x.strip()
        # Blank cells and '#'-filled placeholders carry no date information.
        if not x or x.startswith("#"):
            return pd.NaT
    try:
        parsed = pd.to_datetime(x, format="%m/%d/%Y", errors="coerce")
    except Exception:
        # Best-effort: inputs that to_datetime rejects outright count as missing.
        return pd.NaT
    # pd.to_datetime(None) returns None rather than NaT; normalize it so the
    # function always reports a missing date the same way.
    return pd.NaT if parsed is None else parsed

# Add a parsed datetime column alongside the raw `date` strings.
df["date_parsed"] = df["date"].apply(clean_date)

# Share of rows whose date parsed successfully (1.0 means every row).
print("Date parsing success rate:", df["date_parsed"].notnull().mean())

# Fraction of missing values per column, shown as a one-column table.
missing_summary = df.isnull().mean().to_frame(name="missing_ratio")
display(missing_summary)

Date parsing success rate: 1.0


Unnamed: 0,missing_ratio
Subject,0.0
body,0.0
date,0.0
from,0.0
text,0.0
Sentiment,0.0
date_parsed,0.0


sentiment distribution

In [5]:
# Bar chart of message counts per sentiment label, in a fixed label order.
sentiment_order = ["Positive", "Neutral", "Negative"]
ax = sns.countplot(data=df, x="Sentiment", order=sentiment_order)
ax.set_title("Sentiment Distribution")

# Write the figure to disk, then close it so it is not kept open.
fig = ax.get_figure()
fig.tight_layout()
fig.savefig(VIZ_DIR / "sentiment_distribution.png")
plt.close(fig)

time series sentiment trend

In [6]:
# Monthly message counts per sentiment: one row per calendar month,
# one column per sentiment label, zero where a label has no messages.
dated = df[df["date_parsed"].notna()]
month_key = dated["date_parsed"].dt.to_period("M").rename("month")
df_time = (
    dated.groupby([month_key, "Sentiment"])
         .size()
         .unstack(fill_value=0)
)

# Grouped (non-stacked) bars, one group per month.
ax = df_time.plot(kind="bar", stacked=False)
ax.set_title("Monthly Message Count by Sentiment")
ax.set_xlabel("Month")
ax.set_ylabel("Count")

# Write the figure to disk, then close it so it is not kept open.
fig = ax.get_figure()
fig.tight_layout()
fig.savefig(VIZ_DIR / "monthly_sentiment_counts.png")
plt.close(fig)

top senders by message volume

In [7]:
# Number of non-null `text` entries per sender; keep the ten largest.
sender_counts = df.groupby("from")["text"].count()
top_senders = sender_counts.sort_values(ascending=False).head(10)

# Horizontal bars: sender on the y-axis, message count on the x-axis.
ax = sns.barplot(x=top_senders.values, y=top_senders.index, orient="h")
ax.set_title("Top 10 Employees by # Messages")
ax.set_xlabel("Message Count")

# Write the figure to disk, then close it so it is not kept open.
fig = ax.get_figure()
fig.tight_layout()
fig.savefig(VIZ_DIR / "top_senders.png")
plt.close(fig)

The dataset contains 2,191 rows and 7 columns (the six original columns plus the derived date_parsed column).
All columns are of type object, except for date_parsed, which is datetime64[ns].
Sentiment is split almost evenly between Neutral (about 48%) and Positive (about 44%), while only about 7% of messages are labeled Negative.
Monthly sentiment trends can now be explored in visualizations (e.g., bar plots) to detect engagement fluctuations over time. A few employees stood out as high-volume senders. These employees may warrant further review in scoring and flight risk analysis.

In [9]:
# Plain-text summary of the labeled dataset: structure, missingness,
# sentiment balance, monthly trend and sender activity.

# Shape and column names
print("Shape:", df.shape)
print("\nColumns:\n", df.columns)

# Data types
print("\nData types:\n", df.dtypes)

# Missing value ratio
missing_summary = df.isna().mean().rename("missing_ratio").to_frame()
print("\nMissing ratio per column:\n", missing_summary)

# Distribution of sentiment values (absolute counts, then proportions).
# The counts get a header like every other section so they do not run
# straight into the preceding table in the printed output.
print("\nSentiment distribution (counts):\n", df['Sentiment'].value_counts())
print("\nSentiment distribution (normalized):\n", df['Sentiment'].value_counts(normalize=True))

# Drop NA dates and convert to month
df_time = (
    df.dropna(subset=["date_parsed"])
      .assign(month=lambda d: d["date_parsed"].dt.to_period("M"))
      .groupby(["month", "Sentiment"])
      .size()
      .unstack(fill_value=0)
)

print("\nMonthly sentiment trend:\n", df_time.tail(12))  # last 12 months, for brevity

# Who sends the most messages?
top_senders = df['from'].value_counts().head(10)
print("\nTop 10 employees by message count:\n", top_senders)

# Rank senders by how many of their messages are labeled Negative.
# NOTE: this does NOT identify senders whose messages are exclusively negative;
# the variable name `only_negative` is kept so any other cell reading it still works.
only_negative = df[df["Sentiment"] == "Negative"]["from"].value_counts()
print("\nEmployees with most negative messages:\n", only_negative.head(10))



Shape: (2191, 7)

Columns:
 Index(['Subject', 'body', 'date', 'from', 'text', 'Sentiment', 'date_parsed'], dtype='object')

Data types:
 Subject                object
body                   object
date                   object
from                   object
text                   object
Sentiment              object
date_parsed    datetime64[ns]
dtype: object

Missing ratio per column:
              missing_ratio
Subject                0.0
body                   0.0
date                   0.0
from                   0.0
text                   0.0
Sentiment              0.0
date_parsed            0.0
Sentiment
Neutral     1053
Positive     974
Negative     164
Name: count, dtype: int64

Sentiment distribution (normalized):
 Sentiment
Neutral     0.480602
Positive    0.444546
Negative    0.074852
Name: proportion, dtype: float64

Monthly sentiment trend:
 Sentiment  Negative  Neutral  Positive
month                                 
2011-01           4       47        40
2011-02           8