In [1]:
# 📍 1. Imports
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from pathlib import Path

In [2]:
# 📍 2. Load & Preprocess
# Read the labeled messages and keep only rows whose timestamp can be parsed:
# errors="coerce" turns unparseable dates into NaT, which dropna then removes.
df = pd.read_csv("../data/processed/labeled_messages.csv")
df["date_parsed"] = pd.to_datetime(df["date_parsed"], errors="coerce")
df = df.dropna(subset=["date_parsed"])

# Sentiment to numeric.
# Any label that is not a key of sentiment_map is mapped to NaN.
sentiment_map = {"Positive": 1, "Negative": -1, "Neutral": 0}
df["Sentiment_Score"] = df["Sentiment"].map(sentiment_map)

# Message features.
# A missing message is treated as empty text (0 characters, 0 words).
# Calling astype(str) directly on a missing value yields the literal "nan",
# which would be scored as a 3-character, 1-word message.
message_text = df["text"].fillna("").astype(str)
df["char_count"] = message_text.str.len()
# split() with no separator splits on runs of whitespace, so blank text -> 0 words.
df["word_count"] = message_text.str.split().str.len()
# Calendar-month period of each message, used later as a grouping key.
df["Month"] = df["date_parsed"].dt.to_period("M")

In [3]:
# 📍 3. Group & Feature Engineering
# One row per (sender, month): number of non-null messages, mean message size
# in characters and in words, and the sum of the numeric sentiment scores.
monthly_df = (
    df.groupby(["from", "Month"])
    .agg(
        msg_count=("text", "count"),
        avg_msg_length=("char_count", "mean"),
        avg_word_count=("word_count", "mean"),
        sentiment_score=("Sentiment_Score", "sum"),
    )
    .reset_index()
    # The sender column is exposed downstream as "Employee".
    .rename(columns={"from": "Employee"})
)


In [10]:
# 📍 4. Train/Test Split
# Predictors are the monthly activity statistics; the target is the summed sentiment.
features = ["msg_count", "avg_msg_length", "avg_word_count"]
X, y = monthly_df[features], monthly_df["sentiment_score"]
# Quick look at the target's range (max first, then min).
print(y.max(), y.min())

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

13 -1


In [None]:
# 📍 5. Linear Regression
# Fit on the training rows, then predict the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# ------------------------------------------------------------
# 📍 6. Evaluation
# Score the held-out predictions: R^2 and root-mean-squared error.
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("R^2 Score:", r2)
print("RMSE:", rmse)

# Coefficients: one row per feature, in the same order as `features`.
coef_df = pd.DataFrame(
    list(zip(features, model.coef_)),
    columns=["Feature", "Coefficient"],
)
print(coef_df)

R^2 Score: -0.07987317420466811
RMSE: 2.739820867242146
          Feature  Coefficient
0  avg_msg_length     0.021008
1  avg_word_count    -0.101724
     Actual  Predicted
24        4   3.452596
6         7   3.488799
93        6   3.712600
109       3   3.118647
104       4   3.641014
172       7   3.349026
233       2   2.523649
86        6   3.736564
9         3   2.953552
143       5   3.898417
