In [40]:
#DATA_PATH = "C:\\Users\\ahmed\\Downloads\\aspect_extraction_and_classification\\Laptop_Train_v2.xml"
DATA_PATH = "C:\\Users\\ahmed\\Downloads\\aspect_extraction_and_classification\\final_dataset.jsonl"  
RANDOM_STATE = 42
TEST_SIZE = 0.2
WINDOW_SIZE = 5 
DROP_CONFLICT = True

In [19]:
from lxml import etree

def load_semeval_xml(path: str):
    """
    Parse SemEval ABSA XML file into a list of dicts:
    {
      "id": str,
      "text": str,
      "aspects": [
         {"term": str, "polarity": str, "from": int, "to": int},
         ...
      ]
    }
    """
    tree = etree.parse(path)
    root = tree.getroot()

    data = []

    for sentence in root.findall("sentence"):
        sent_id = sentence.get("id")
        text_elem = sentence.find("text")
        if text_elem is None:
            continue
        text = text_elem.text.strip()

        aspects = []
        aspect_terms = sentence.find("aspectTerms")

        if aspect_terms is not None:
            for term in aspect_terms.findall("aspectTerm"):
                aspects.append({
                    "term": term.get("term"),
                    "polarity": term.get("polarity"),
                    "from": int(term.get("from")),
                    "to": int(term.get("to")),
                })

        data.append({
            "id": sent_id,
            "text": text,
            "aspects": aspects,
        })

    return data

print(load_semeval_xml(DATA_PATH)[0])

{'id': '2339', 'text': 'I charge it at night and skip taking the cord with me because of the good battery life.', 'aspects': [{'term': 'cord', 'polarity': 'neutral', 'from': 41, 'to': 45}, {'term': 'battery life', 'polarity': 'positive', 'from': 74, 'to': 86}]}


In [None]:
import json

def load_jsonl_aspects(path: str):
    """
    Each line in the JSONL file is a dict:
    {
      "id": int,
      "sentence": str,
      "aspect_terms": [
          {"term": "...", "polarity": "...", "from": int, "to": int}
      ]
    }
    Returns the same structure as load_semeval_xml().
    """
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            obj = json.loads(line)

            sentence = obj["sentence"]
            aspects = []

            for asp in obj.get("aspect_terms", []):
                aspects.append({
                    "term": asp["term"],
                    "polarity": asp["polarity"],
                    "from": asp["from"],
                    "to": asp["to"],
                })

            data.append({
                "id": obj["id"],
                "text": sentence,
                "aspects": aspects
            })

    return data


{'id': 1, 'text': 'This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^', 'aspects': [{'term': 'sound track', 'polarity': 'positive', 'from': 5, 'to': 16}, {'term': 'music', 'polarity': 'positive', 'from': 128, 'to': 133}, {'term': 'guitars', 'polarity': 'positive', 'from': 314, 'to': 321}, {'term': 'orchestras', 'polarity': 'positive', 'from': 334, 'to': 344}, {'term': 'keyboarding', 'polarity': 'negative', 'from': 266, 'to': 277}]}


In [None]:
import re

def clean_text(text: str) -> str:
    """
    Simple English text cleaning:
    - lowercasing
    - remove HTML-like tags
    - normalize whitespace
    (You can add more if you want.)
    """
    text = text.strip()
    text = text.lower()
    text = re.sub(r"<[^>]+>", " ", text)         
    text = re.sub(r"\s+", " ", text)            
    return text


In [22]:
# absa/aspect_windows.py

import re
import pandas as pd


def char_to_token_window(text: str, start_char: int, end_char: int, window_size: int = 5) -> str:
    """
    Convert char offsets into a token-level window of ±window_size words
    around the aspect, and insert <ASP> ... </ASP> markers.
    """
    # tokenization by whitespace
    tokens = []
    starts = []
    for m in re.finditer(r'\S+', text):
        tokens.append(m.group())
        starts.append(m.start())

    aspect_start_tok = None
    aspect_end_tok = None

    for i, (tok, s) in enumerate(zip(tokens, starts)):
        e = s + len(tok)
        # aspect start char falls into this token
        if s <= start_char < e and aspect_start_tok is None:
            aspect_start_tok = i
        # aspect end char falls into this (or previous) token
        if s < end_char <= e:
            aspect_end_tok = i
            break

    if aspect_start_tok is None:
        # fallback: full sentence if mapping fails
        return text

    if aspect_end_tok is None:
        aspect_end_tok = aspect_start_tok

    left = max(0, aspect_start_tok - window_size)
    right = min(len(tokens), aspect_end_tok + 1 + window_size)

    window_tokens = tokens[left:right]

    rel_start = aspect_start_tok - left
    rel_end = aspect_end_tok - left

    # Insert tags around aspect span
    window_tokens.insert(rel_start, "<ASP>")
    # +2 because inserting opening tag shifts indices
    window_tokens.insert(rel_end + 2, "</ASP>")

    return " ".join(window_tokens)


def build_apc_dataset_with_windows(parsed_xml, window_size: int = 5) -> pd.DataFrame:
    """
    Build a DataFrame with columns:
      - sentence: original sentence text (cleaned)
      - aspect: aspect term (cleaned)
      - polarity: label
      - window: aspect-centered window with <ASP> tags
      - input_full: aspect + [SEP] + full sentence (for comparison)
    """
    rows = []

    for item in parsed_xml:
        raw_text = item["text"]
        text = clean_text(raw_text)

        for asp in item["aspects"]:
            term = asp["term"]
            pol = asp["polarity"]
            start = asp["from"]
            end = asp["to"]

            # Build window using original text (for offsets)
            window_raw = char_to_token_window(raw_text, start, end, window_size=window_size)
            window = clean_text(window_raw)
            aspect_clean = clean_text(term)

            input_full = f"{aspect_clean} [SEP] {text}"

            rows.append({
                "sentence": text,
                "sentence_raw":raw_text,
                "aspect": aspect_clean,
                "polarity": pol,
                "window": window,
                "input_full": input_full,
            })

    df = pd.DataFrame(rows)
    return df

In [36]:

import os
import numpy as np
import fasttext
import fasttext.util
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score


FASTTEXT_BIN = "cc.en.300.bin"

def load_fasttext_model():
    """
    Load (or download) pretrained FastText English model.
    """
    if not os.path.exists(FASTTEXT_BIN):
        print("Downloading FastText English model (cc.en.300.bin)...")
        fasttext.util.download_model('en', if_exists='ignore')  # creates cc.en.300.bin
    model = fasttext.load_model(FASTTEXT_BIN)
    return model


def build_fasttext_matrix(texts, ft_model):
    """
    texts: Iterable[str] of input strings (we'll use the 'window' column).
    Returns: np.array [n_samples, dim]
    """
    vectors = []
    for t in texts:
        v = ft_model.get_sentence_vector(t)
        vectors.append(v)
    return np.vstack(vectors)


def train_fasttext_svm(X_train, y_train,X_test=None,y_test=None):
    clf = LinearSVC(C=1.0, random_state=RANDOM_STATE)
    clf.fit(X_train, y_train)
    acc = clf.score(X_train,y_train)
    
    print("\n===== FastText + SVM =====")
    print(f"Training Accuracy: {acc:.4f}")
    
    if X_test is not None and y_test is not None:
        y_pred = clf.predict(X_test)
        test_acc = accuracy_score(y_test, y_pred)
        print(f"Test Accuracy: {test_acc:.4f}")
        print(classification_report(y_test, y_pred, digits=4))

    return clf, acc

    

In [38]:
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

def evaluate_model(name, model, X_train, y_train, X_test=None, y_test=None):
    print(f"\n===== {name} =====")
    model.fit(X_train, y_train)

    acc = model.score(X_train,y_train)
    print(f"Training Accuracy: {acc:.4f}")
    
    if X_test is not None and y_test is not None:
        y_pred = model.predict(X_test)
        test_acc = accuracy_score(y_test, y_pred)
        print(f"Test Accuracy: {test_acc:.4f}")
        print(classification_report(y_test, y_pred, digits=4))
        
    return {
        "model": name,
        "accuracy": acc,
    }


def summarize_results(results_list):
    df = pd.DataFrame(results_list)
    print("\n=== SUMMARY (classical models) ===")
    print(df.sort_values("accuracy", ascending=False))
    return df

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion

def build_vectorizer():
    """
    Build a FeatureUnion of:
      - word-level TF-IDF (1–2 grams)
      - char-level TF-IDF (3–5 char grams)
    Input: one string per sample (we'll feed "window" column).
    """
    word_tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=10000,
        sublinear_tf=True,
        analyzer="word"
    )

    char_tfidf = TfidfVectorizer(
        ngram_range=(3, 5),
        max_features=20000,
        sublinear_tf=True,
        analyzer="char"
    )

    vectorizer = FeatureUnion([
        ("word", word_tfidf),
        ("char", char_tfidf),
    ])

    return vectorizer


In [31]:

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier



def get_base_models():
    """
    Return individual base models.
    """
    logreg = LogisticRegression(
        max_iter=1000,
        C=1.0,
        class_weight=None,
        random_state=RANDOM_STATE
    )

    svm = LinearSVC(
        C=1.0
    )

    knn = KNeighborsClassifier(
        n_neighbors=5
    )

    dt = DecisionTreeClassifier(
        max_depth=15,
        random_state=RANDOM_STATE
    )

    models = {
        "logreg": logreg,
        "svm": svm,
        "knn": knn,
        "dt": dt,
    }
    return models


def get_ensemble(models_dict):
    """
    Build a soft voting ensemble (where possible).
    For LinearSVC (no predict_proba), we use hard voting.
    """
    estimators = [(name, m) for name, m in models_dict.items()]
    ensemble = VotingClassifier(
        estimators=estimators,
        voting="hard"   # soft requires predict_proba; LinearSVC doesn't have it
    )
    return ensemble


In [42]:
import os
import sys
import numpy as np
import pandas as pd
from jsonl_loader import load_jsonl_aspects
from sklearn.model_selection import train_test_split

print(f"Loading XML data from: {DATA_PATH}")
#parsed_xml = load_semeval_xml(DATA_PATH)
parsed_jsonl = load_jsonl_aspects(DATA_PATH)
parsed = parsed_jsonl 

df = build_apc_dataset_with_windows(parsed, window_size=WINDOW_SIZE)
df = df[df["polarity"]!="conflict"].reset_index(drop=True)
# Optionally: drop 'conflict' if it’s too rare and hurting training
# df = df[df["polarity"] != "conflict"].reset_index(drop=True)

print(f"Total aspect instances: {len(df)}")
print(df.head())

# Use aspect-centered window with <ASP> tags as main input
texts = df["window"].values
texts_raw = df["sentence_raw"].values
labels = df["polarity"].values

# Stratified split for fair evaluation
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=labels
)
X_train_raw, X_test_raw, _, _ = train_test_split(
    texts_raw, labels,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=labels
)
# ====================================
# 2. Classical ML: TF-IDF (word+char)
# ====================================
print("\nBuilding TF-IDF (word + char) vectorizer...")
vectorizer = build_vectorizer()

print("Fitting TF-IDF on training data...")
X_train_vec = vectorizer.fit_transform(X_train_texts)
X_test_vec = vectorizer.transform(X_test_texts)

# 2.1 Train individual models
base_models = get_base_models()
results = []

for name, model in base_models.items():
    res = evaluate_model(name, model, X_train_vec, y_train, X_test_vec, y_test)
    results.append(res)

# 2.2 Train ensemble
ensemble = get_ensemble(base_models)
res_ens = evaluate_model("ensemble", ensemble, X_train_vec, y_train, X_test_vec, y_test)
results.append(res_ens)

summary_df = summarize_results(results)

# ====================================
# 3. FastText + SVM comparison
# ====================================
print("\nLoading FastText model...")
ft_model = load_fasttext_model()

print("Building FastText sentence vectors for train and test...")
X_train_ft = build_fasttext_matrix(X_train_raw, ft_model)
X_test_ft = build_fasttext_matrix(X_test_raw, ft_model)

ft_clf, ft_acc = train_fasttext_svm(X_train_ft, y_train, X_test_ft, y_test)

print("\n=== FINAL COMPARISON ===")
print(summary_df.sort_values("accuracy", ascending=False))
print(f"\nFastText + SVM accuracy: {ft_acc:.4f}")


Loading XML data from: C:\Users\ahmed\Downloads\aspect_extraction_and_classification\final_dataset.jsonl
Total aspect instances: 20678
                                            sentence  \
0  this sound track was beautiful! it paints the ...   
1  this sound track was beautiful! it paints the ...   
2  this sound track was beautiful! it paints the ...   
3  this sound track was beautiful! it paints the ...   
4  this sound track was beautiful! it paints the ...   

                                        sentence_raw       aspect  polarity  \
0  This sound track was beautiful! It paints the ...  sound track  positive   
1  This sound track was beautiful! It paints the ...        music  positive   
2  This sound track was beautiful! It paints the ...      guitars  positive   
3  This sound track was beautiful! It paints the ...   orchestras  positive   
4  This sound track was beautiful! It paints the ...  keyboarding  negative   

                                              window 

In [52]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
def evaluate_kfold(model, X, y, folds=5):
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=RANDOM_STATE)
    scores = cross_val_score(model, X, y, cv=skf, scoring="accuracy")
    return scores.mean(), scores.std()


# --------------------------------------------------------
# 2. RUN K-FOLD EVALUATION FOR ALL CLASSICAL MODELS
# --------------------------------------------------------
def evaluate_all_models_kfold(X_vec, y, folds=5):
    base_models = get_base_models()
    results = {}

    for name, model in base_models.items():
        mean_acc, std_acc = evaluate_kfold(model, X_vec, y, folds=folds)
        print(f"{name}: {mean_acc:.4f} (+/- {std_acc:.4f})")
        results[name] = (mean_acc, std_acc)

    return results


# --------------------------------------------------------
# 3. FASTTEXT + SVM K-FOLD EVALUATION
# --------------------------------------------------------
def evaluate_fasttext_kfold(texts_raw, labels, ft_model, folds=5):
    X_ft = build_fasttext_matrix(texts_raw, ft_model)
    svm = LinearSVC(C=1.0, random_state=RANDOM_STATE)

    mean_acc, std_acc = evaluate_kfold(svm, X_ft, labels, folds=folds)
    print(f"FastText + SVM: {mean_acc:.4f} (+/- {std_acc:.4f})")

    return mean_acc, std_acc, X_ft


# ---------------------------------------------------------------
# 4. HYPERPARAMETER SEARCH FOR THE SELECTED MODEL (SVM EXAMPLE)
# ---------------------------------------------------------------
def svm_hyperparameter_search(X, y, C_values=[0.01, 0.1, 1, 2, 5], folds=5):
    best_C = None
    best_acc = -1

    print("\n=== SVM Hyperparameter Search ===")
    for C in C_values:
        model = LinearSVC(C=C, random_state=RANDOM_STATE)
        mean_acc, std_acc = evaluate_kfold(model, X, y, folds=folds)
        print(f"C={C}: {mean_acc:.4f} (+/- {std_acc:.4f})")

        if mean_acc > best_acc:
            best_acc = mean_acc
            best_C = C

    print(f"\nBest C = {best_C} (accuracy = {best_acc:.4f})")
    return best_C


# ---------------------------------------------------------------
# 5. TRAIN FINAL MODEL ON FULL TRAINING SET
# ---------------------------------------------------------------
def train_final_svm(X, y, C):
    model = LinearSVC(C=C, random_state=RANDOM_STATE)
    model.fit(X, y)
    return model


# ---------------------------------------------------------------
# 6. EVALUATE FINAL MODEL ON SEPARATE TEST SET
# ---------------------------------------------------------------
from sklearn.metrics import classification_report, accuracy_score

def evaluate_final_model(model, X_test, y_test):
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(f"\n=== FINAL TEST ACCURACY ===\nAccuracy: {acc:.4f}")
    print(classification_report(y_test, preds, digits=4))
    return acc


In [53]:
# =============================================================
# CELL 10 — Stratified K-Fold Model Selection + Final Evaluation
# =============================================================

from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import LinearSVC
import numpy as np


# =============================================================
# PART 1 — Generic K-Fold evaluator (works with sparse or dense)
# =============================================================

def evaluate_kfold_estimator(name, estimator, X, y, folds=5):
    print(f"\nRunning K-Fold for: {name}")
    skf = StratifiedKFold(
        n_splits=folds,
        shuffle=True,
        random_state=RANDOM_STATE
    )

    y = np.array(y)
    scores = []

    for train_idx, val_idx in skf.split(X, y):

        # Sparse matrix slicing works directly:
        X_tr = X[train_idx]
        X_val = X[val_idx]

        y_tr = y[train_idx]
        y_val = y[val_idx]

        est = clone(estimator)
        est.fit(X_tr, y_tr)
        preds = est.predict(X_val)

        fold_acc = accuracy_score(y_val, preds)
        scores.append(fold_acc)

    mean_acc = float(np.mean(scores))
    std_acc  = float(np.std(scores))

    print(f"{name}: mean={mean_acc:.4f}  std={std_acc:.4f}")
    return mean_acc, std_acc


# =============================================================
# PART 2 — K-Fold for TF-IDF models
# =============================================================

def evaluate_all_models_kfold(X_vec, y, folds=5):
    base_models = get_base_models()   # your function
    results = {}

    for name, model in base_models.items():
        mean_acc, std_acc = evaluate_kfold_estimator(
            name, model, X_vec, y, folds=folds
        )
        results[name] = (mean_acc, std_acc)

    return results


# =============================================================
# PART 3 — FastText per-ASPECT VECTORS (critical fix)
# =============================================================

def evaluate_fasttext_kfold(text_windows_train, y_train, ft_model, folds=5):
    """
    IMPORTANT:
    We embed *WINDOWS* (or sentence_raw), NOT unique sentences.
    Each row corresponds to ONE ASPECT INSTANCE.
    """
    print("\nBuilding FastText vectors for K-Fold…")
    X_ft = build_fasttext_matrix(text_windows_train, ft_model)
    print("FastText matrix shape:", X_ft.shape)

    svm = LinearSVC(C=1.0, random_state=RANDOM_STATE)

    mean_acc, std_acc = evaluate_kfold_estimator(
        "fasttext_svm", svm, X_ft, y_train, folds=folds
    )

    return mean_acc, std_acc, X_ft


# =============================================================
# PART 4 — RUN K-FOLD ON TRAIN SET ONLY
# =============================================================

print("\n======================")
print("=== STRATIFIED K-FOLD (TRAIN SET ONLY)")
print("======================")

# TF-IDF models
cv_results = evaluate_all_models_kfold(X_train_vec, y_train, folds=5)

# FastText ASPECT-LEVEL vectors
ft_cv_mean, ft_cv_std, X_train_ft = evaluate_fasttext_kfold(
    X_train_texts,  # THE ASPECT WINDOW TEXTS (matches df rows)
    y_train,
    ft_model,
    folds=5
)

cv_results["fasttext_svm"] = (ft_cv_mean, ft_cv_std)

# Determine best model by mean accuracy
best_model_name = max(cv_results, key=lambda k: cv_results[k][0])
best_model_cv   = cv_results[best_model_name][0]

print(f"\n=== BEST MODEL SELECTED: {best_model_name} ({best_model_cv:.4f}) ===")


# =============================================================
# PART 5 — FINAL RETRAIN ON FULL TRAINING SET + TEST EVAL
# =============================================================

print("\n======================")
print("=== FINAL TEST EVALUATION ===")
print("======================")

if best_model_name == "fasttext_svm":
    # Build final test vectors
    X_test_ft = build_fasttext_matrix(X_test_texts, ft_model)

    clf = LinearSVC(C=1.0, random_state=RANDOM_STATE)
    clf.fit(X_train_ft, y_train)
    preds = clf.predict(X_test_ft)

    print("FastText+SVM Test Accuracy:", accuracy_score(y_test, preds))
    print(classification_report(y_test, preds))

else:
    # Classical models (TF-IDF)
    base_models = get_base_models()
    clf = base_models[best_model_name]
    clf.fit(X_train_vec, y_train)
    preds = clf.predict(X_test_vec)

    print(f"{best_model_name} Test Accuracy:", accuracy_score(y_test, preds))
    print(classification_report(y_test, preds))


# Show summary table
print("\nCV RESULTS:")
for k, v in cv_results.items():
    print(f"{k}: mean={v[0]:.4f}, std={v[1]:.4f}")



=== STRATIFIED K-FOLD (TRAIN SET ONLY)

Running K-Fold for: logreg
logreg: mean=0.7020  std=0.0075

Running K-Fold for: svm
svm: mean=0.6885  std=0.0045

Running K-Fold for: knn
knn: mean=0.6234  std=0.0034

Running K-Fold for: dt
dt: mean=0.5617  std=0.0053

Building FastText vectors for K-Fold…
FastText matrix shape: (16542, 300)

Running K-Fold for: fasttext_svm
fasttext_svm: mean=0.6820  std=0.0034

=== BEST MODEL SELECTED: logreg (0.7020) ===

=== FINAL TEST EVALUATION ===
logreg Test Accuracy: 0.7142166344294004
              precision    recall  f1-score   support

    negative       0.70      0.80      0.75      1841
     neutral       0.57      0.14      0.22       482
    positive       0.74      0.78      0.76      1813

    accuracy                           0.71      4136
   macro avg       0.67      0.57      0.58      4136
weighted avg       0.70      0.71      0.69      4136


CV RESULTS:
logreg: mean=0.7020, std=0.0075
svm: mean=0.6885, std=0.0045
knn: mean=0.6234, st