# 03_Component2_Gap_Analysis.ipynb


## 1. Environment Setup & Imports


In [None]:
# Data
import pandas as pd
import numpy as np

# NLP & embeddings
import spacy
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Utilities
import json
from joblib import load, dump

# Apply seaborn's "whitegrid" style globally so the plots drawn later in this
# notebook share it.
sns.set(style="whitegrid")


## 2. Load Data & Models


In [None]:
# 2.1 Preprocessed feedback
df = pd.read_excel("cleaned_feedback_preprocessed.xlsx")

# 2.2 Aspect ontology: maps each aspect category to its lexicon terms.
# The encoding is pinned so decoding does not depend on the platform locale.
with open("config/aspect_ontology.json", encoding="utf-8") as f:
    ontology = json.load(f)
aspect_categories = list(ontology)

# 2.3 Sentence classifier (expectation vs. experience vs. other)
# The deprecated `return_all_scores=False` argument is omitted: returning only
# the top label is already the pipeline's default behaviour.
sent_clf = pipeline(
    "text-classification",
    model="path/to/fine-tuned-sentence-classifier",   # e.g., RoBERTa fine-tuned
    tokenizer="path/to/fine-tuned-sentence-classifier",
)

# 2.4 Sentiment classifier (reuse from Component 1)
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="path/to/fine-tuned-absa-sentiment",
    tokenizer="path/to/fine-tuned-absa-sentiment",
)

# 2.5 Embedding model
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# 2.6 spaCy for sentence segmentation
nlp = spacy.load("en_core_web_sm")

# 2.7 Text preprocessing function (from Notebook 1)
# `load` is already imported from joblib in the imports cell.
# SECURITY NOTE: joblib.load unpickles the file, which can execute arbitrary
# code. Only load artifacts produced by a trusted run of Notebook 1.
preprocess_text = load("models/preprocess_text.fn")


## 3. Helper Functions


In [None]:
def segment_sentences(text):
    """Split raw text into a list of stripped, non-empty sentences via spaCy.

    Args:
        text: Free-text feedback. Anything that is not a non-blank string
            (e.g. ``None`` or the float NaN that pandas typically yields for a
            blank spreadsheet cell) is treated as "no text".

    Returns:
        A list of sentence strings; empty when there is no usable text.
    """
    # Guard before calling spaCy: the pipeline only accepts strings, so a
    # missing value would otherwise abort the whole processing loop.
    if not isinstance(text, str) or not text.strip():
        return []
    doc = nlp(text)
    stripped = (sent.text.strip() for sent in doc.sents)
    return [sentence for sentence in stripped if sentence]

def classify_sentence_type(sent, classifier=None):
    """Return the sentence type: 'expect', 'experience', or 'other'.

    Args:
        sent: A single sentence string.
        classifier: Optional callable with the text-classification pipeline
            call shape (``classifier(sent) -> [{'label': ...}, ...]``).
            Defaults to the notebook-level ``sent_clf`` pipeline.

    Returns:
        The lower-cased top label, with 'expectation' shortened to 'expect'
        so it matches the ``type == 'expect'`` filters used when computing the
        per-aspect metrics. Any other label is passed through lower-cased.
    """
    # Long model label -> short name used by the rest of the notebook.
    aliases = {'expectation': 'expect'}

    clf = sent_clf if classifier is None else classifier
    # model returns label e.g. 'EXPECTATION', 'EXPERIENCE', 'OTHER'
    label = clf(sent)[0]['label'].lower()
    return aliases.get(label, label)

def map_sentence_to_aspects(sent):
    """
    Return the aspect categories whose lexicon has at least one term present
    in the preprocessed form of `sent`.

    The sentence is passed through `preprocess_text`, and every entry of each
    category's lexicon in `ontology` is tested with `in` against that result;
    a category is kept as soon as one of its terms matches.

    NOTE(review): the matching semantics depend on what `preprocess_text`
    returns -- substring matching if it is a string, exact token matching if
    it is a list of tokens (multi-word terms could then never match). Confirm
    against Notebook 1.

    The result is built from a set, so the order of categories is unspecified.
    """
    processed = preprocess_text(sent)
    matched = {
        category
        for category, lexicon in ontology.items()
        if any(term in processed for term in lexicon)
    }
    return list(matched)


## 4. Build Expectation & Experience Sets


In [None]:
# Build one record per (feedback row, sentence) that mentions at least one
# aspect: sentence type, matched aspects and a signed sentiment score.

# Sentiment label -> signed score; lookup is case-insensitive (see below).
SENTIMENT_SCORES = {'NEGATIVE': -1, 'NEUTRAL': 0, 'POSITIVE': 1}
# Fixed column set so gap_df keeps its schema even when no sentence qualifies
# (the metrics cell groups on 'id').
RECORD_COLUMNS = ['id', 'sentence', 'type', 'aspects', 'sentiment']

# prepare storage
records = []

for idx, row in df.iterrows():
    text = row['feedback_text_clean']  # your cleaned free-text field
    # Blank spreadsheet cells come through as non-string values; skip them
    # rather than let sentence segmentation fail on them.
    if not isinstance(text, str):
        continue

    for sent in segment_sentences(text):
        # Lexicon lookup first: sentences without any aspect are dropped, so
        # there is no point running the model-based classifiers on them.
        aspects = map_sentence_to_aspects(sent)
        if not aspects:
            continue

        typ = classify_sentence_type(sent)

        # sentiment of the sentence (-1, 0, +1)
        label = sentiment_pipe(sent)[0]['label']
        try:
            score = SENTIMENT_SCORES[label.upper()]
        except KeyError:
            # Fail with an actionable message (e.g. a model emitting LABEL_0).
            raise KeyError(
                f"Unexpected sentiment label {label!r}; "
                f"expected one of {sorted(SENTIMENT_SCORES)}"
            ) from None

        records.append({
            'id': idx,              # index label of the source row in df
            'sentence': sent,
            'type': typ,            # as returned by classify_sentence_type
            'aspects': aspects,     # list of mapped aspect categories
            'sentiment': score,
        })

gap_df = pd.DataFrame(records, columns=RECORD_COLUMNS)
gap_df.head()


## 5. Compute Per-Aspect Metrics per Intern


In [None]:
# For every feedback row ('id') and every aspect category, compare what the
# intern expected with what they experienced:
#   semantic_gap  = 1 - cosine similarity of the two sentence-embedding centroids
#   sentiment_gap = mean experience sentiment - mean expectation sentiment
#   hybrid_gap    = alpha * semantic_gap + (1 - alpha) * |sentiment_gap|
# When an aspect has no expectation or no experience sentences for a row, the
# affected means and all three gaps are NaN ("no data"), not placeholder values.

# initialize result storage
res = []

alpha = 0.6  # balance parameter, tuned earlier

# Accept both the short and the long spelling of the expectation label so this
# cell works whichever form the sentence-type column carries.
_EXPECT_TYPES = ('expect', 'expectation')
_METRIC_COLUMNS = [
    'id', 'aspect', 'exp_mean_sent', 'exp2_mean_sent',
    'semantic_gap', 'sentiment_gap', 'hybrid_gap', 'overall_satisfaction',
]


def _summarize_side(rows):
    """Return (mean sentiment, embedding centroid) for one side of the gap.

    Returns (nan, None) when `rows` is empty, so no fixed-width zero vector is
    needed as a placeholder (the previous 768-wide placeholder could not be
    compared with the embedder's real output whenever only one side was empty).
    """
    if rows.empty:
        return np.nan, None
    vectors = np.asarray(embedder.encode(rows['sentence'].tolist()))
    return float(rows['sentiment'].mean()), vectors.mean(axis=0)


has_overall = 'overall_satisfaction' in df.columns
# A gap_df built from zero records may have no columns at all; skip grouping.
groups = gap_df.groupby('id') if 'id' in gap_df.columns else []

for idx, group in groups:
    # overall satisfaction if available
    overall = df.loc[idx, 'overall_satisfaction'] if has_overall else np.nan

    # for each aspect:
    for aspect in aspect_categories:
        mentions = [aspect in cats for cats in group['aspects']]
        sub = group.loc[mentions]

        exp_mean_sent, exp_centroid = _summarize_side(
            sub[sub['type'].isin(_EXPECT_TYPES)]
        )
        exp2_mean_sent, exp2_centroid = _summarize_side(
            sub[sub['type'] == 'experience']
        )

        # semantic gap (undefined unless both sides have sentences)
        if exp_centroid is None or exp2_centroid is None:
            sem_gap = np.nan
        else:
            sim = cosine_similarity(
                exp_centroid.reshape(1, -1),
                exp2_centroid.reshape(1, -1),
            )[0, 0]
            sem_gap = float(1 - sim)

        # sentiment gap (NaN propagates when either mean is missing)
        sent_gap = exp2_mean_sent - exp_mean_sent

        # hybrid gap
        hyb_gap = alpha * sem_gap + (1 - alpha) * abs(sent_gap)

        res.append({
            'id': idx,
            'aspect': aspect,
            'exp_mean_sent': exp_mean_sent,
            'exp2_mean_sent': exp2_mean_sent,
            'semantic_gap': sem_gap,
            'sentiment_gap': sent_gap,
            'hybrid_gap': hyb_gap,
            'overall_satisfaction': overall,
        })

metrics_df = pd.DataFrame(res, columns=_METRIC_COLUMNS)
metrics_df.head()


## 6. Visualizations


In [None]:
# 6.1 How many aspect-bearing sentences fell into each sentence type
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=gap_df, x='type', ax=ax)
ax.set_title("Sentence Type Distribution")
ax.set_xlabel("Type")
ax.set_ylabel("Count")
plt.show()

# 6.2 Spread of the hybrid gap within each aspect
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=metrics_df, x='aspect', y='hybrid_gap', ax=ax)
ax.set_title("Hybrid Gap per Aspect")
plt.xticks(rotation=45)
plt.show()

# 6.3 Relationship between overall satisfaction and the hybrid gap:
# print the correlation coefficient, then scatter every (row, aspect) point.
pair = metrics_df[['overall_satisfaction', 'hybrid_gap']]
corr = pair.corr().loc['overall_satisfaction', 'hybrid_gap']
print(f"Pearson r = {corr:.2f}")
ax = sns.scatterplot(
    data=metrics_df,
    x='hybrid_gap',
    y='overall_satisfaction',
    alpha=0.3,
)
ax.set_title("Overall Satisfaction vs. Hybrid Gap")
plt.show()


## 7. Save Metrics & Models


In [None]:
# Persist the per-aspect metrics table as a spreadsheet (row index omitted).
metrics_path = "gap_analysis_metrics.xlsx"
metrics_df.to_excel(metrics_path, index=False)

# (optional) keep the balance parameter alongside the other model artifacts
alpha_path = "models/hybrid_gap_alpha.joblib"
dump(alpha, alpha_path)
