# 02_Component1_ABSA.ipynb


In [2]:
# standard library
import os
import re

# data + viz
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# NLP
import spacy
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification
from transformers import pipeline
from joblib import load, dump

# evaluation
from sklearn.metrics import precision_recall_fscore_support, classification_report

# Styling: apply seaborn's "whitegrid" style globally so every figure drawn
# later in the notebook shares the same look.
sns.set(style="whitegrid")


ModuleNotFoundError: No module named 'spacy'

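Load the spaCy model (used for part-of-speech tagging and lemmatization in the rule-based extractor) and the Transformers tokenizers/models for aspect extraction and sentiment classification.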



In [None]:
# spaCy pipeline used by the rule-based extractor (POS tags and lemmas).
nlp = spacy.load("en_core_web_sm")

# Transformers for aspect extraction (token classification) and sentiment.
# NOTE(review): the aspect tokenizer and the aspect model are loaded from two
# different checkpoints. A tokenizer normally has to match the checkpoint the
# model was fine-tuned from — confirm, and point both at the same path once the
# fine-tuned extractor is available.
asp_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")  # placeholder; replace with the tokenizer of your fine-tuned model
asp_model     = AutoModelForTokenClassification.from_pretrained("path/to/fine-tuned-aspect-extractor")

# Sentiment classifier: tokenizer comes from the base checkpoint, weights from
# a local fine-tuned path (placeholder path — replace before running).
# NOTE(review): the sentiment step wraps aspects in "[ASP]" markers; presumably
# the fine-tuned model was trained with the same marker format — confirm.
sent_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
sent_model     = AutoModelForSequenceClassification.from_pretrained("path/to/fine-tuned-absa-sentiment")


## 2. Load Preprocessed Data


In [None]:
# Read the preprocessed feedback table produced by the earlier cleaning step.
df = pd.read_excel("cleaned_feedback_preprocessed.xlsx")
# Identify the cleaned text column
text_col = "feedback_text_clean"  # adjust if different
# Fail fast with a readable message instead of a bare KeyError deep inside the
# pipeline loop when the column name does not match the spreadsheet.
if text_col not in df.columns:
    raise KeyError(
        f"Column {text_col!r} not found in the loaded data; "
        f"available columns: {list(df.columns)}"
    )
print("Records:", len(df))


## 3. Load Aspect Ontology


In [None]:
import json

# Read the aspect ontology. The encoding is pinned to UTF-8 so the file is
# decoded the same way on every platform instead of using the locale default.
with open("config/aspect_ontology.json", encoding="utf-8") as f:
    ontology = json.load(f)
# ontology = {"mentorship": ["mentor", "supervisor", ...], ...}
# Category names are the top-level keys of the ontology mapping.
aspect_categories = list(ontology.keys())


## 4. Aspect Term Extraction
### 4.1 Rule-Based Extraction


In [None]:
def rule_extract(text):
    """Find ontology terms in ``text`` using spaCy POS tags and lemmas.

    A token is kept when it is tagged NOUN or PROPN and its lemma appears in
    any of the ontology's term lists.

    Parameters
    ----------
    text : str
        Feedback text to analyse.

    Returns
    -------
    list[tuple[str, int]]
        ``(lemma, token_index)`` pairs in document order, where the index is
        the token's position in the parsed spaCy document.
    """
    # Flatten the ontology once per call into a set; the previous version
    # rebuilt a concatenated list for every token in the document.
    vocabulary = {word for words in ontology.values() for word in words}
    doc = nlp(text)
    found = []
    for token in doc:
        if token.pos_ in ("NOUN", "PROPN") and token.lemma_ in vocabulary:
            found.append((token.lemma_, token.i))
    return found


### 4.2 Transformer-Based Extraction


In [None]:
# Token-classification pipeline for aspect terms. With the "simple"
# aggregation strategy the pipeline returns grouped entities rather than
# individual sub-word tokens.
asp_pipe = pipeline(
    task="ner",
    tokenizer=asp_tokenizer,
    model=asp_model,
    aggregation_strategy="simple",
)

def ner_extract(text):
    """Run the aspect NER pipeline on ``text``.

    Returns a list of ``(word, start, end)`` tuples, one per entity returned by
    ``asp_pipe``, in the order the pipeline reports them.
    """
    # Each entity is expected to look like
    # {"entity_group": "ASPECT", "word": "mentor", "start": .., "end": ..}
    spans = []
    for entity in asp_pipe(text):
        spans.append((entity["word"], entity["start"], entity["end"]))
    return spans


### 4.3 Hybrid Extraction


In [None]:
def extract_aspects(text):
    """Combine rule-based and NER-based aspect terms for one text.

    Terms from both extractors are stripped and lower-cased before being
    merged, so the same aspect found by both (or found with different casing)
    is reported once. Blank terms are dropped.

    Parameters
    ----------
    text : str
        Feedback text to analyse.

    Returns
    -------
    list[str]
        Unique aspect terms in alphabetical order, so repeated runs on the
        same text give the same list.
    """
    rules = rule_extract(text)
    ner   = ner_extract(text)
    # unify by term: the first element of each tuple is the term string
    candidates = [r[0] for r in rules] + [n[0] for n in ner]
    terms = {c.strip().lower() for c in candidates if c and c.strip()}
    return sorted(terms)

# example
print(extract_aspects(df[text_col].iloc[0]))


## 5. Category Detection & Disambiguation


In [None]:
from difflib import SequenceMatcher, get_close_matches

def map_to_category(term, lexicons=None, cutoff=0.8):
    """Map an aspect term to an ontology category.

    An exact (case-insensitive) lexicon hit wins. Otherwise the category whose
    lexicon holds the single most similar entry is returned, provided that
    entry reaches ``cutoff``. The earlier behaviour returned the first category
    with *any* entry above the cutoff, even when a later category matched
    better.

    Parameters
    ----------
    term : str
        Aspect term to classify.
    lexicons : dict[str, list[str]] | None
        Mapping of category name to its terms. Defaults to the notebook-level
        ``ontology``.
    cutoff : float
        Minimum similarity (0-1) accepted for a fuzzy match.

    Returns
    -------
    str | None
        The matching category, or ``None`` when the term is blank or nothing
        matches.
    """
    if lexicons is None:
        lexicons = ontology
    if not term:
        return None
    needle = term.strip().lower()
    if not needle:
        return None

    # Compare in lower case so the casing of lexicon entries does not matter.
    normalized = {
        cat: [str(word).lower() for word in words]
        for cat, words in lexicons.items()
    }

    # direct lexicon match
    for cat, words in normalized.items():
        if needle in words:
            return cat

    # fuzzy match: keep the best-scoring category; ties keep the earliest one
    best_cat, best_score = None, -1.0
    for cat, words in normalized.items():
        hits = get_close_matches(needle, words, n=1, cutoff=cutoff)
        if not hits:
            continue
        # Same argument order as get_close_matches uses internally.
        score = SequenceMatcher(None, hits[0], needle).ratio()
        if score > best_score:
            best_cat, best_score = cat, score
    return best_cat


## 6. Sentiment Polarity Classification


In [None]:
# Sentiment pipeline built on the fine-tuned sequence classifier.
# function_to_apply="none" leaves the scores untransformed, i.e. raw logits
# rather than probabilities.
sent_pipe = pipeline(
    task="sentiment-analysis",
    tokenizer=sent_tokenizer,
    model=sent_model,
    function_to_apply="none",
)

def classify_sentiment(text, aspect_term):
    """Classify the sentiment expressed towards ``aspect_term`` in ``text``.

    Every occurrence of the term is wrapped as ``[ASP]term[ASP]`` before the
    text is passed to ``sent_pipe``. Matching ignores case and keeps the
    original casing of the text, so a lower-cased term still marks
    "Mentor" in the source sentence. A blank term leaves the text unmarked.

    Parameters
    ----------
    text : str
        Full feedback text.
    aspect_term : str
        Aspect term to highlight.

    Returns
    -------
    int
        ``1`` for POSITIVE, ``0`` for NEUTRAL, ``-1`` for NEGATIVE.

    Raises
    ------
    KeyError
        If the pipeline returns a label other than those three.
    """
    marked = text
    # An empty term would otherwise match between every character.
    if aspect_term:
        marked = re.sub(
            re.escape(aspect_term),
            lambda match: f"[ASP]{match.group(0)}[ASP]",
            text,
            flags=re.IGNORECASE,
        )
    pred = sent_pipe(marked)[0]
    # pred: {"label":"POSITIVE","score":0.98}
    label = str(pred["label"]).upper()
    polarity_by_label = {"POSITIVE": 1, "NEUTRAL": 0, "NEGATIVE": -1}
    if label not in polarity_by_label:
        raise KeyError(
            f"Unexpected sentiment label {pred['label']!r}; "
            f"expected one of {sorted(polarity_by_label)}"
        )
    return polarity_by_label[label]


## 7. Run ABSA Pipeline


In [None]:
# Run extraction -> category mapping -> sentiment for every feedback row and
# collect one record per (row, aspect term) that maps to a known category.
records = []
for idx, row in df.iterrows():
    text = row[text_col]
    # Empty spreadsheet cells load as NaN (a float); they carry no aspects and
    # are not valid input for the text extractors, so skip them.
    if not isinstance(text, str) or not text.strip():
        continue
    terms = extract_aspects(text)
    for term in terms:
        cat = map_to_category(term)
        # Terms outside the ontology are ignored.
        if not cat:
            continue
        polarity = classify_sentiment(text, term)
        records.append({
            "id": idx,
            "term": term,
            "category": cat,
            "polarity": polarity,
            "text": text
        })
# Naming the columns keeps the expected schema even when no record was
# produced, so later cells can still select "category" / "polarity".
absa_df = pd.DataFrame(records, columns=["id", "term", "category", "polarity", "text"])
len(absa_df)


## 8. Evaluation on Annotated Test Set


In [None]:
# assuming you have true labels in absa_test.csv with columns: id, term, category, polarity
test = pd.read_csv("data/absa_test.csv")
# merge predictions; the default inner join keeps only gold rows whose
# (id, term) pair was also predicted
eval_df = test.merge(absa_df, on=["id","term"], suffixes=("_true","_pred"))
# Report how much of the gold set the comparison covers: gold rows that the
# pipeline never produced are absent from eval_df and therefore from the
# reports below.
print(f"Matched {len(eval_df)} of {len(test)} gold annotations on (id, term)")
if eval_df.empty:
    print("No overlap between gold annotations and predictions; skipping reports.")
else:
    print(classification_report(eval_df["category_true"], eval_df["category_pred"]))
    print(classification_report(eval_df["polarity_true"], eval_df["polarity_pred"]))


## 9. Visualizations


In [None]:
# 9.1 Aspect mention counts
plt.figure(figsize=(8,4))
absa_df["category"].value_counts().plot(kind="bar")
plt.title("Aspect Mention Counts")
plt.ylabel("Mentions")
plt.show()

# 9.2 Sentiment distribution per aspect
# Translate the numeric polarity into its display name before plotting so each
# legend entry is tied to the value it represents. Passing a fixed list of
# legend labels instead would mislabel the bars whenever a polarity class is
# missing from the data.
plt.figure(figsize=(10,6))
sns.countplot(
    data=absa_df.assign(
        polarity_label=absa_df["polarity"].map({-1: "Neg", 0: "Neu", 1: "Pos"})
    ),
    x="category",
    hue="polarity_label",
    hue_order=["Neg", "Neu", "Pos"],
)
plt.title("Sentiment by Aspect")
plt.xlabel("Aspect")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.legend(title="Polarity")
plt.show()


## 10. Save Outputs & Models


In [None]:
# Persist the aspect/sentiment table next to the notebook.
absa_df.to_excel("absa_aspect_sentiment.xlsx", index=False)
# The dumps below write into "models/"; create it first so a fresh checkout
# does not fail on a missing directory.
os.makedirs("models", exist_ok=True)
dump(ontology, "models/aspect_ontology.joblib")
# NOTE(review): these are Transformers models serialised with joblib.
# Transformers also offers `save_pretrained` for saving models; consider
# whether downstream consumers should load from that format instead.
dump(asp_model, "models/aspect_extractor_model.joblib")
dump(sent_model, "models/sentiment_classifier_model.joblib")
