# 04_Component3_Persona_Clustering.ipynb



## 1. Setup & Imports


In [1]:
# Data manipulation & IO
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# NLP for skill‐keywords (if used)
from joblib import load

# Dimensionality reduction & clustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Scaling & model persistence
from sklearn.preprocessing import StandardScaler
from joblib import dump

sns.set(style="whitegrid")


## 2. Load Inputs


In [6]:
# 2.1 Load gap metrics from Notebook 2
gap_df = pd.read_excel("gap_analysis_metrics.xlsx")
# columns: id, aspect, exp_mean_sent, exp2_mean_sent, semantic_gap, sentiment_gap, hybrid_gap, overall_satisfaction

# 2.2 Load ABSA sentiment data from Notebook 2
absa_df = pd.read_excel("absa_aspect_sentiment.xlsx")
# columns: id, term, category, polarity, text

# 2.3 Optionally load cleaned raw for quotes or skills
clean = pd.read_excel("cleaned_feedback_preprocessed.xlsx")


## 3. Feature Engineering


In [7]:
# --- 3. Feature Engineering ---

# 3.0 Reset indices to create a stable 'id' column in each DF
absa_df = absa_df.reset_index(drop=True).reset_index().rename(columns={'index':'id'})
gap_df  = gap_df.reset_index(drop=True).reset_index().rename(columns={'index':'id'})
clean   = clean.reset_index(drop=True).reset_index().rename(columns={'index':'id'})

# Verify uniqueness on the 'id' Series
assert absa_df['id'].is_unique, "absa_df id values are not unique!"
assert gap_df['id'].is_unique,  "gap_df id values are not unique!"
assert clean['id'].is_unique,   "clean id values are not unique!"

# 3.1 Pivot ABSA → average polarity per aspect per intern
absa_pivot = (
    absa_df
      .groupby(['id','category'])['polarity']
      .mean()
      .unstack(fill_value=0)
      .add_prefix('sent_')
)

# 3.2 Pivot gap_df → sentiment_gap, semantic_gap, hybrid_gap per aspect per intern
gap_sent = (
    gap_df
      .pivot(index='id', columns='aspect', values='sentiment_gap')
      .add_prefix('gap_sent_')
      .fillna(0)
)
gap_sem = (
    gap_df
      .pivot(index='id', columns='aspect', values='semantic_gap')
      .add_prefix('gap_sem_')
      .fillna(0)
)
gap_hyb = (
    gap_df
      .pivot(index='id', columns='aspect', values='hybrid_gap')
      .add_prefix('gap_hyb_')
      .fillna(0)
)

# 3.3 Aspect mention frequencies per intern
freq = (
    absa_df
      .groupby(['id','category'])
      .size()
      .unstack(fill_value=0)
      .add_prefix('freq_')
)

# 3.4 Skill‐keyword indicators
skills = ["python","git","agile","docker","aws","java","sql","javascript"]
skill_flags = clean[['id']].copy()
for skill in skills:
    skill_flags[f"skill_{skill}"] = clean['feedback_text_clean'].apply(
        lambda toks: int(isinstance(toks, list) and skill in toks)
    )
skill_flags = skill_flags.set_index('id')

# 3.5 Merge all feature sets
features = (
    absa_pivot
      .join([gap_sent, gap_sem, gap_hyb, freq, skill_flags], how='outer')
      .fillna(0)
)
features.index.name = 'id'

# 3.6 Add overall satisfaction if available
if 'overall_satisfaction' in clean.columns:
    overall = clean.set_index('id')['overall_satisfaction']
    features = features.join(overall, how='left')

print("Feature matrix shape:", features.shape)
features.head()


AttributeError: 'DataFrame' object has no attribute 'is_unique'

## 4. Scaling & Dimensionality Reduction


In [None]:
# 4.1 Separate features vs. labels
X = features.drop(columns=['overall_satisfaction'], errors='ignore')
ids = X.index

# 4.2 Standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save scaler for future use
dump(scaler, "models/persona_scaler.joblib")

# 4.3 PCA (for diagnostics)
pca = PCA(n_components=0.90, random_state=42)  # keep 90% variance
X_pca = pca.fit_transform(X_scaled)
print(f"PCA reduces to {X_pca.shape[1]} dimensions")

# 4.4 t-SNE for 2D visualization
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)


### 4.5 Visualize t-SNE embedding


In [None]:
plt.figure(figsize=(8,6))
plt.scatter(X_tsne[:,0], X_tsne[:,1], s=20, alpha=0.6)
plt.title("t-SNE Projection of Intern Feature Vectors")
plt.xlabel("t-SNE 1")
plt.ylabel("t-SNE 2")
plt.show()


## 5. Clustering & Validation


In [None]:
# 5.1 Determine optimal k with elbow and silhouette
wcss, sil, ks = [], [], list(range(2,8))
for k in ks:
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    wcss.append(km.inertia_)
    sil.append(silhouette_score(X_scaled, km.labels_))

# 5.2 Plot
fig, ax1 = plt.subplots(figsize=(8,4))
ax1.plot(ks, wcss, '-o', label='Inertia (WCSS)')
ax1.set_xlabel('k')
ax1.set_ylabel('WCSS')
ax2 = ax1.twinx()
ax2.plot(ks, sil, '-o', color='C1', label='Silhouette')
ax2.set_ylabel('Silhouette Score')
fig.legend(loc='upper right')
plt.title("Elbow & Silhouette for k-means")
plt.show()

# 5.3 Choose k (e.g., k=4)
k_opt = 4
km = KMeans(n_clusters=k_opt, random_state=42, n_init=10).fit(X_scaled)
labels = km.labels_
features['persona'] = labels

# Save model
dump(km, "models/persona_kmeans.joblib")


## 6. Persona Visualization


In [None]:
# 6.1 t-SNE colored by persona
plt.figure(figsize=(8,6))
sns.scatterplot(x=X_tsne[:,0], y=X_tsne[:,1], hue=labels, palette='tab10', s=30, alpha=0.7)
plt.title("t-SNE of Interns Colored by Persona Cluster")
plt.legend(title="Persona")
plt.show()

# 6.2 Silhouette for chosen k
print("Silhouette Score (k=4):", silhouette_score(X_scaled, labels))
print("Davies–Bouldin Index (k=4):", davies_bouldin_score(X_scaled, labels))


## 7. Persona Profiling & Export


In [None]:
# 7.1 Compute cluster centroids in original feature space
centroids = pd.DataFrame(km.cluster_centers_, columns=X.columns)
centroids = pd.DataFrame(scaler.inverse_transform(centroids), columns=X.columns)
centroids['persona'] = range(k_opt)

# 7.2 Representative quotes per persona
reps = []
for p in range(k_opt):
    ids_p = features[features['persona']==p].index
    # sample up to 3 quotes from raw feedback
    quotes = clean.loc[ids_p, 'feedback_text']\
                 .dropna().sample(3, random_state=42).tolist()
    reps.append({'persona':p, 'quotes':quotes})

reps_df = pd.DataFrame(reps).set_index('persona')

# 7.3 Combine into persona profiles
persona_profiles = centroids.join(reps_df, on='persona')
persona_profiles.to_excel("persona_profiles.xlsx")

# 7.4 Show summary
persona_profiles.head()


## 8. Recommendations Placeholder
