## See revisions.docx for all changes

Imports

In [None]:
import pandas as pd
from textblob import TextBlob
from tqdm import tqdm
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import torch
from transformers import pipeline

## FAQ 1. Sentiment Thresholds Should Not Be Arbitrary

Fix Made:
added markdown:
This project uses TextBlob to assign sentiment labels based on polarity scores. A threshold of ±0.1 was used to label Neutral sentiment:
- Polarity > 0.1 → Positive  
- Polarity < –0.1 → Negative  
- Otherwise → Neutral
This threshold was chosen to avoid misclassifying slightly opinionated or ambiguous text as strongly positive or negative. A wider neutral range (−0.1 to 0.1) helps reduce false positives and makes the model more conservative, which is appropriate in a professional email context where extreme sentiment is relatively rare. While higher thresholds (e.g., ±0.2) were considered, they resulted in too many emails being labeled Neutral, losing useful signal for analysis.


Task 1: Sentiment Labeling

In [None]:

# Load the raw messages and report the share of missing values per column
df = pd.read_csv('./data/raw/test.csv')
print(df.isna().mean().sort_values(ascending=False))
# Replace missing values with empty strings so the .strip() calls below
# always receive a string
df = df.fillna('')

# Build the text column: use the body, falling back to the subject only when
# the body is blank (the two fields are NOT concatenated)
df['text'] = df.apply(lambda row: row['body'] if row['body'].strip() else row['Subject'], axis=1)
# Drop messages with no usable text. The explicit .copy() makes the result an
# independent frame, so the label columns assigned to it afterwards are not
# written into a filtered slice of the raw data.
df = df[df['text'].str.strip() != ''].copy()

# Label sentiment with TextBlob (original)
def classify_sentiment(text, threshold=0.1):
    """Label *text* as 'Positive', 'Negative' or 'Neutral' from its TextBlob polarity.

    Args:
        text: Message text to score.
        threshold: Half-width of the neutral band. Polarity strictly above
            ``threshold`` is 'Positive', strictly below ``-threshold`` is
            'Negative', and everything in between (bounds included) is
            'Neutral'. Defaults to 0.1, the cut-off used by this analysis.

    Returns:
        One of the strings 'Positive', 'Negative' or 'Neutral'.

    Raises:
        ValueError: If ``threshold`` is negative.
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")

    polarity = TextBlob(text).sentiment.polarity
    if polarity > threshold:
        return 'Positive'
    if polarity < -threshold:
        return 'Negative'
    return 'Neutral'

# Register tqdm's progress_apply on pandas objects so labeling shows a progress bar
tqdm.pandas()
# Label every message with the TextBlob-based classifier
df['Sentiment'] = df['text'].progress_apply(classify_sentiment)

# --------- ADDITION: Transformer-based Sentiment Model ---------
# Load HuggingFace sentiment pipeline (Roberta model).
# device=0 is passed when CUDA is available, otherwise -1
# (presumably first GPU vs. CPU — confirm against the transformers docs)
sentiment_pipe = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment", device=0 if torch.cuda.is_available() else -1)

# Classify using the transformer model
def classify_with_roberta(text):
    """Label *text* with the RoBERTa pipeline, using the TextBlob label names.

    Only the first 512 characters of *text* are sent to the model. The
    pipeline's raw labels are translated as LABEL_2 -> 'Positive' and
    LABEL_0 -> 'Negative'; every other label is reported as 'Neutral'.

    Returns:
        'Positive', 'Negative' or 'Neutral'. If the pipeline call fails, the
        message is labeled 'Neutral' (best effort) and a warning is issued so
        the failure is visible instead of silently hidden.
    """
    try:
        # NOTE(review): this slice limits characters, not tokens — confirm it
        # keeps inputs under the model's token limit for this dataset.
        result = sentiment_pipe(text[:512])[0]['label']
    except Exception as exc:
        # `except Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt and SystemExit propagate, so a long labeling run
        # can still be interrupted.
        import warnings

        warnings.warn(
            f"RoBERTa sentiment failed ({type(exc).__name__}); labeling as Neutral"
        )
        return 'Neutral'

    # Map labels to match the TextBlob format
    if result == 'LABEL_2':
        return 'Positive'
    if result == 'LABEL_0':
        return 'Negative'
    return 'Neutral'

# Label every message with the transformer model as well
df['Sentiment_Roberta'] = df['text'].progress_apply(classify_with_roberta)

# ---------------------------------------------------------------

# Save labeled data
os.makedirs('./data/processed', exist_ok=True)
df.to_csv('./data/processed/labeled_messages.csv', index=False)

# Print value counts of both models
print("TextBlob Sentiment Distribution:")
print(df['Sentiment'].value_counts())
print("\nRoberta Sentiment Distribution:")
print(df['Sentiment_Roberta'].value_counts())

# Filter where the two sentiment labels disagree
disagreements = df[df['Sentiment'] != df['Sentiment_Roberta']]
# Display up to 7 sample disagreements. Capping at the number available keeps
# DataFrame.sample from raising when the models disagree on fewer than 7 rows.
n_samples = min(7, len(disagreements))
sample_disagreements = (
    disagreements.sample(n_samples, random_state=42) if n_samples else disagreements
)
# Print text and both sentiment values
for idx, row in sample_disagreements.iterrows():
    print(f"--- Sample {idx} ---")
    print("Text:")
    print(row['text'])
    print("\nTextBlob Sentiment:", row['Sentiment'])
    print("Roberta Sentiment:", row['Sentiment_Roberta'])
    print("\n" + "="*80 + "\n")



## FAQ 2. Don’t Rely on One Sentiment Tool (Without Validation)

Fix Made:
Initially, I used only TextBlob for sentiment analysis. I added a second column of predictions using a transformer-based Roberta model (cardiffnlp/twitter-roberta-base-sentiment). I then compared the two outputs side by side.

The predictions of the models side by side:
TextBlob Sentiment Distribution:
Sentiment
Neutral     1053
Positive     974
Negative     164
Name: count, dtype: int64

Roberta Sentiment Distribution:
Sentiment_Roberta
Neutral     1485
Positive     558
Negative     148
Name: count, dtype: int64

This discrepancy suggests that TextBlob tends to assign more messages as positive, while Roberta is more conservative, possibly due to domain differences.

TextBlob was more sensitive to mildly opinionated wording and labeled more messages as non-neutral, especially on the positive side, while Roberta more frequently predicted Neutral. After manually reviewing the messages with differing predictions, I found that Roberta’s outputs were generally more aligned with the actual tone of the emails.

Conclusion:
I added this second model to validate the accuracy of the initial tool. Roberta’s predictions were found to be more appropriate for our formal business email dataset, and using a second model aligns with the FAQ guidance to avoid relying on a single model.


Task 2: Exploratory Data Analysis

In [None]:
# Plot style: seaborn "whitegrid" theme and a default 10x5 figure size
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)

# Project directories (raw input, processed output, saved figures)
RAW_DIR = Path("./data/raw")
PROC_DIR = Path("./data/processed")
VIZ_DIR = Path("./visualization")
# Create the figure directory if needed; exist_ok avoids an error on re-runs
VIZ_DIR.mkdir(exist_ok=True)

# Load the labeled messages file from the processed-data directory
df = pd.read_csv(PROC_DIR / "labeled_messages.csv")
print("Rows:", len(df))
# NOTE(review): a bare expression is only rendered when it is the last
# statement of a notebook cell — confirm this preview is actually shown
df.head()

#date cleaning & parsing
def clean_date(x):
    """Parse a raw date cell into a Timestamp, yielding NaT when unusable.

    Strings whose first non-blank character is '#' (placeholders such as
    '########') are treated as missing. Any other value is parsed as
    month/day/four-digit-year; values that cannot be parsed come back as NaT.
    """
    is_placeholder = isinstance(x, str) and x.lstrip()[:1] == "#"
    if is_placeholder:
        parsed = pd.NaT
    else:
        try:
            parsed = pd.to_datetime(x, format="%m/%d/%Y", errors="coerce")
        except Exception:
            parsed = pd.NaT
    return parsed

# Parse the raw "date" column into timestamps (NaT where parsing fails)
df["date_parsed"] = df["date"].apply(clean_date)
# Overwrite the labeled file so it also carries the date_parsed column
df.to_csv(PROC_DIR / "labeled_messages.csv", index=False)

# Fraction of rows whose date parsed successfully
print("Date parsing success rate:", df["date_parsed"].notna().mean())
# Share of missing values per column, as a one-column frame
missing_summary = df.isna().mean().rename("missing_ratio").to_frame()
# NOTE(review): display() is not imported or defined in this file — presumably
# the notebook (IPython) built-in; it would be undefined in a plain Python run
display(missing_summary)

# Sentiment distribution: message count per "Sentiment" label, in a fixed order
sns.countplot(x="Sentiment", data=df, order=["Positive", "Neutral", "Negative"])
plt.title("Sentiment Distribution")
plt.tight_layout()
plt.savefig(VIZ_DIR / "sentiment_distribution.png")
# Close the figure once it has been written to disk
plt.close()

# Time-series sentiment trend: rows = month, columns = sentiment label,
# values = message count (rows without a parsed date are excluded)
df_time = (
    df.dropna(subset=["date_parsed"])
      .assign(month=lambda d: d["date_parsed"].dt.to_period("M"))
      .groupby(["month", "Sentiment"])
      .size()
      .unstack(fill_value=0)
)

# Grouped (side-by-side, not stacked) bars for each month
df_time.plot(kind="bar", stacked=False)
plt.title("Monthly Message Count by Sentiment")
plt.xlabel("Month")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig(VIZ_DIR / "monthly_sentiment_counts.png")
plt.close()

# Top senders by message volume: the 10 "from" values with the most messages
top_senders = (
    df.groupby("from")["text"].count().sort_values(ascending=False).head(10)
)
# Horizontal bars: sender on the y-axis, message count on the x-axis
sns.barplot(y=top_senders.index, x=top_senders.values, orient="h")
plt.title("Top 10 Employees by # Messages")
plt.xlabel("Message Count")
plt.tight_layout()
plt.savefig(VIZ_DIR / "top_senders.png")
plt.close()

# Data summary: textual counterpart of the plots saved above
# Shape and column names
print("Shape:", df.shape)
print("\nColumns:\n", df.columns)

# Data types
print("\nData types:\n", df.dtypes)

# Missing value ratio (recomputed; same expression as the earlier missing_summary)
missing_summary = df.isna().mean().rename("missing_ratio").to_frame()
print("\nMissing ratio per column:\n", missing_summary)

# Distribution of sentiment values: absolute counts, then proportions
print(df['Sentiment'].value_counts())
print("\nSentiment distribution (normalized):\n", df['Sentiment'].value_counts(normalize=True))

# Drop NA dates and convert to month
# (rebuilds the same month x sentiment count table used for the monthly plot)
df_time = (
    df.dropna(subset=["date_parsed"])
      .assign(month=lambda d: d["date_parsed"].dt.to_period("M"))
      .groupby(["month", "Sentiment"])
      .size()
      .unstack(fill_value=0)
)

print("\nMonthly sentiment trend:\n", df_time.tail(12))  # last 12 months, for brevity

# Who sends the most messages?
top_senders = df['from'].value_counts().head(10)
print("\nTop 10 employees by message count:\n", top_senders)

# Count negative messages per sender, most negative first.
# NOTE(review): despite its name, only_negative does not identify senders who
# send exclusively negative messages; it only counts each sender's negatives.
only_negative = df[df["Sentiment"] == "Negative"]["from"].value_counts()
print("\nEmployees with most negative messages:\n", only_negative.head(10))




## FAQ 3. Charts Without Interpretation Are Not Insightful

Fix Made: 
each chart interpreted with markdown:
# Revision
Sentiment was distributed heavily toward Positive and Neutral, with approximately 1,000 emails each. In contrast, only about 150 emails were labeled Negative, indicating that negative sentiment is relatively rare in the dataset. This imbalance suggests that most employee communications maintain a neutral or constructive tone, which is typical in professional environments.
# Revision
Sentiment was distributed fairly evenly across most months, suggesting a consistent tone in employee communications throughout the year. However, there was a slight dip in sentiment during January 2010, with a noticeable increase in negative emails — approximately 14 messages labeled as Negative. This could indicate a temporary period of dissatisfaction or tension.
# Revision
The top 10 most active senders include two individuals with approximately 175 messages each, six with around 225 messages, one with 250, and one with nearly 275 messages. This distribution indicates that message volume is spread fairly evenly across the top senders: the two most active senders (about 250 and 275 messages) account for a slightly larger share of total communication, and the two least active (about 175 messages each) for a slightly smaller share. The two highest-volume senders may hold key roles in the organization—such as management, team leads, or coordinators—and could significantly influence the overall tone and sentiment in the dataset. Their communication patterns are critical to monitor, as changes in their sentiment or volume might reflect broader shifts within their teams or departments.


## FAQ 4. Avoid Inventing Metrics Without Rationale

No changes needed: 
Did not create custom metrics. The sentiment score is a simple sum of labeled values. 
The monthly sentiment score directly aggregates labeled values with no weighting or scaling.


Task 3: Employee Score Calculation

In [None]:
# Load the labeled messages file
df = pd.read_csv("./data/processed/labeled_messages.csv")
def clean_date(x):
    """Turn a raw date value into a Timestamp, or NaT if it cannot be used.

    A string that starts with '#' once surrounding whitespace is removed
    (e.g. '########') counts as missing. All other values are parsed with
    the month/day/year format; unparseable values become NaT.
    """
    if isinstance(x, str):
        if x.strip().startswith("#"):
            return pd.NaT

    try:
        result = pd.to_datetime(x, format="%m/%d/%Y", errors="coerce")
    except Exception:
        result = pd.NaT
    return result

# Re-parse the raw "date" column into timestamps (NaT where parsing fails)
df["date_parsed"] = df["date"].apply(clean_date)
# Save the parsed date column back into the labeled file
df.to_csv("./data/processed/labeled_messages.csv", index=False)
df = df.dropna(subset=["date_parsed"])  # Drop rows without date

# Map each label to a numeric score.
# NOTE(review): scores come from the "Sentiment" column, not
# "Sentiment_Roberta" — confirm this is the intended label source.
sentiment_map = {"Positive": 1, "Negative": -1, "Neutral": 0}
df["Sentiment_Score"] = df["Sentiment"].map(sentiment_map)

# Calendar month of each message, used as the aggregation period
df["YearMonth"] = df["date_parsed"].dt.to_period("M")

# Combine monthly scores: unweighted sum of message scores per sender per month
monthly_scores = (
    df.groupby(["from", "YearMonth"])["Sentiment_Score"]
    .sum()
    .reset_index()
    .rename(columns={"from": "Employee", "YearMonth": "Month", "Sentiment_Score": "Score"})
)

# Preview and save the per-employee monthly scores
print(monthly_scores.head())
monthly_scores.to_csv("./data/processed/monthly_sentiment_scores.csv", index=False)

Task 4: Employee ranking

In [None]:
# Load Monthly Scores
df = pd.read_csv("./data/processed/monthly_sentiment_scores.csv")
# Convert the Month column read from the CSV to monthly periods
df["Month"] = pd.PeriodIndex(df["Month"], freq="M")

# ------------------------------------------------------------
# 3. Define Ranking Logic
def get_rankings(group):
    """Return the three highest- and three lowest-scoring rows of *group*.

    Rows are ordered by Score, with Employee (ascending) breaking ties in both
    directions. The three highest scores, tagged 'Top Positive', are stacked
    above the three lowest, tagged 'Top Negative'.
    """
    selections = []
    for score_ascending, label in ((False, "Top Positive"), (True, "Top Negative")):
        ordered = group.sort_values(
            by=["Score", "Employee"], ascending=[score_ascending, True]
        )
        selections.append(ordered.head(3).assign(Rank_Type=label))
    return pd.concat(selections)

# ------------------------------------------------------------
# Apply Ranking Per Month
# Rank employees within each month.
# NOTE(review): a month with fewer than six employees lists the same employee
# under both "Top Positive" and "Top Negative" — confirm this is acceptable.
rankings = df.groupby("Month", group_keys=False).apply(get_rankings).reset_index(drop=True)

# Preview
print(rankings.head(10))

# ------------------------------------------------------------
# Save Rankings to File
rankings.to_csv("./data/processed/monthly_employee_rankings.csv", index=False)

# Get overall top positive and negative employees
# (reloads the file written just above)
rankings = pd.read_csv("./data/processed/monthly_employee_rankings.csv")

# Assign +1 for Top Positive, -1 for Top Negative
rankings["point"] = rankings["Rank_Type"].map({"Top Positive": 1, "Top Negative": -1})

# Aggregate points across all months
overall_scores = (
    rankings.groupby("Employee")["point"]
    .sum()
    .reset_index()
    .rename(columns={"point": "Overall_Score"})
)

# Sort for global Top Positive (highest) and Top Negative (lowest);
# ties on Overall_Score are broken by Employee in ascending order
top_global_positive = (
    overall_scores.sort_values(by=["Overall_Score", "Employee"], ascending=[False, True])
    .head(3)
    .assign(Global_Rank="Top Positive")
)

top_global_negative = (
    overall_scores.sort_values(by=["Overall_Score", "Employee"], ascending=[True, True])
    .head(3)
    .assign(Global_Rank="Top Negative")
)

# Stack both lists into one table: positive rows first, then negative rows
global_top3 = pd.concat([top_global_positive, top_global_negative])
print(global_top3)

# ------------------------------------------------------------
# Save Global Rankings
global_top3.to_csv("./data/processed/global_top3_employees.csv", index=False)

Task 5: Flight Risk

In [None]:
# Reload the labeled messages and convert the stored date_parsed column to
# datetimes (values that cannot be converted become NaT)
df = pd.read_csv("./data/processed/labeled_messages.csv")
df["date_parsed"] = pd.to_datetime(df["date_parsed"], errors="coerce")

# Negative messages that carry a valid date, ordered by sender and then by time
is_negative = df["Sentiment"] == "Negative"
has_date = df["date_parsed"].notna()
df_neg = df[is_negative & has_date].sort_values(["from", "date_parsed"])

# ------------------------------------------------------------
# Identify Rolling 30-day Negative Message Clusters
def flag_risk(group, window_days=30, min_messages=4):
    """Flag a group whose message dates cluster inside a short window.

    Args:
        group: DataFrame with a datetime "date_parsed" column (as called from
            the per-sender groupby, one employee's negative messages). Rows
            may be in any order; missing dates are ignored.
        window_days: Largest whole-day gap allowed between the first and last
            message of a cluster (inclusive). Defaults to 30.
        min_messages: Number of messages required inside the window.
            Defaults to 4.

    Returns:
        pd.Series with a single boolean entry, "At_Risk".

    Raises:
        ValueError: If ``min_messages`` is smaller than 1.
    """
    if min_messages < 1:
        raise ValueError("min_messages must be at least 1")

    # Sort here rather than trusting the caller: with unsorted dates the day
    # gaps can be negative and would wrongly count as "inside the window".
    dates = group["date_parsed"].dropna().sort_values().tolist()

    # With sorted dates, a cluster starting at dates[i] exists exactly when
    # the message min_messages - 1 positions later still falls inside the
    # window, so one pass over aligned pairs replaces the nested scan.
    at_risk = any(
        (last - first).days <= window_days
        for first, last in zip(dates, dates[min_messages - 1:])
    )
    return pd.Series({"At_Risk": at_risk})

# One row per sender in df_neg, with its At_Risk flag; the sender column is
# renamed to "Employee"
risk_flags = (
    df_neg.groupby("from")
    .apply(flag_risk)
    .reset_index()
    .rename(columns={"from": "Employee"})
)

# ------------------------------------------------------------
# Show only the flagged employees, but save the flag for every employee
flagged_employees = risk_flags[risk_flags["At_Risk"] == True]
print(flagged_employees)
risk_flags.to_csv("./data/processed/flight_risk_employees.csv", index=False)


## FAQ 6. Thoughtful Feature Selection in Modeling

No changes needed:
The features chosen were message count, average message length (in characters), and average word count, selected for their logical relevance to communication frequency and emotional tone.


Task 6: Linear Regression Model

In [None]:
#imports
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load the labeled messages; date_parsed is converted to datetimes and rows
# without a valid date are dropped
df = pd.read_csv("./data/processed/labeled_messages.csv")
df["date_parsed"] = pd.to_datetime(df["date_parsed"], errors="coerce")
df = df.dropna(subset=["date_parsed"])

# Sentiment to numeric (Positive = 1, Neutral = 0, Negative = -1)
sentiment_map = {"Positive": 1, "Negative": -1, "Neutral": 0}
df["Sentiment_Score"] = df["Sentiment"].map(sentiment_map)

# Message features: length in characters and in whitespace-separated words
df["char_count"] = df["text"].astype(str).apply(len)
df["word_count"] = df["text"].astype(str).apply(lambda x: len(x.split()))
df["Month"] = df["date_parsed"].dt.to_period("M")

# Group & Feature Engineering: one row per sender per month with the message
# count, mean lengths, and the summed sentiment score
monthly_df = df.groupby(["from", "Month"]).agg({
    "text": "count",
    "char_count": "mean",
    "word_count": "mean",
    "Sentiment_Score": "sum"
}).reset_index()

# Rename the aggregated columns to descriptive feature/target names
monthly_df = monthly_df.rename(columns={
    "from": "Employee",
    "text": "msg_count",
    "char_count": "avg_msg_length",
    "word_count": "avg_word_count",
    "Sentiment_Score": "sentiment_score"
})

# Train/Test Split: 80/20, with a fixed seed so the split is reproducible
features = ["msg_count", "avg_msg_length", "avg_word_count"]
X = monthly_df[features]
y = monthly_df["sentiment_score"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression fitted on the training rows only
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = model.predict(X_test)

# ------------------------------------------------------------
# Evaluation on the held-out rows (RMSE = square root of the mean squared error)
print("R^2 Score:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

# Coefficients: one fitted weight per feature, in the order of `features`
coef_df = pd.DataFrame({
    "Feature": features,
    "Coefficient": model.coef_
})
print(coef_df)

## FAQ 7. Don’t Just Print R² and MSE — Interpret Them

Fix Made: 
added markdown:
The R² score of 0.42 indicates that approximately 42% of the variance in the target variable (e.g., monthly sentiment score) is explained by the model's input features. While this reflects a moderate level of explanatory power, over half of the variability remains unexplained, suggesting that either the current features lack sufficient predictive strength or that sentiment patterns are influenced by external factors not captured in the dataset.
The RMSE of 1.99 means that the model's predictions typically deviate from the actual monthly sentiment score by about 2 units, a moderate amount considering that the score ranges from -1 to 12 throughout the dataset. This suggests that while the model can recognize some broad trends, it struggles to make precise predictions at the employee-month level, and needs further refinement through feature selection.
