# Methodology

## Data Preparation

In [1]:
import numpy as np
import pandas as pd
import gensim.downloader as api

from datasets import load_dataset
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.feature_extraction.text import CountVectorizer
# Custom modules
from src.preprocessing import get_preprocessed_data
from src.evaluate import run_full_evaluation

# Single seed reused below for the cross-validation splitter and the classifier.
SEED = 42
# Also seed NumPy's global random number generator.
np.random.seed(SEED)

### Filter other attributes
For this first simple approach we use only the interview answer text as input and the two labels (clarity and evasion) as targets.

In [2]:
# Load the data; the helper returns three frames and two label mappings
# (presumably id -> label and label -> id; verify in src.preprocessing).
train_df, val_df, test_df, id2label, label2id = get_preprocessed_data()

# Input text and the two target columns, all taken from the training frame.
X_train, y_clarity, y_evasion = (
    train_df[column]
    for column in ("interview_answer", "clarity_label", "evasion_label")
)

# Shared 5-fold stratified splitter, shuffled with the fixed seed.
skf = StratifiedKFold(shuffle=True, n_splits=5, random_state=SEED)

# Vectorizers

## Sparse

In [3]:
def run_baseline_experiment(X, y_clarity, y_evasion, vectorizer=None, dense_matrix=None):
    """
    Cross-validate a logistic-regression baseline on both label sets.

    One feature source is used per call. If `vectorizer` is given, it is
    combined with the classifier in a Pipeline and evaluated on the raw texts
    `X`. Otherwise the pre-computed `dense_matrix` is passed to the classifier
    directly. When both are given, `vectorizer` takes precedence.

    Parameters
    ----------
    X : raw input texts; only read when `vectorizer` is given (may be None otherwise).
    y_clarity : targets for the "Clarity" task.
    y_evasion : targets for the "Evasion" task.
    vectorizer : optional transformer used as the first Pipeline step.
    dense_matrix : optional pre-computed feature matrix, one row per sample.

    Returns
    -------
    dict with keys "Clarity_F1" and "Evasion_F1", each the mean macro-F1 over
    the folds of the module-level `skf` splitter.

    Raises
    ------
    ValueError
        If neither `vectorizer` nor `dense_matrix` is provided.
    """
    # Fail early with a clear message instead of passing X=None to cross_validate.
    if vectorizer is None and dense_matrix is None:
        raise ValueError("Provide either `vectorizer` (sparse) or `dense_matrix` (dense).")

    # NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
    # confirm against the installed version before removing it.
    clf = LogisticRegression(max_iter=1000, random_state=SEED, multi_class="multinomial")

    # Compare against None explicitly: truthiness of an estimator is not a
    # reliable "was it passed" check (e.g. a Pipeline defines __len__).
    if vectorizer is not None:
        # Sparse route: the vectorizer is part of the cross-validated estimator.
        estimator, features = Pipeline([("vec", vectorizer), ("clf", clf)]), X
    else:
        # Dense route: features are already numeric.
        estimator, features = clf, dense_matrix

    results = {}
    for label_name, y in [("Clarity", y_clarity), ("Evasion", y_evasion)]:
        cv_res = cross_validate(estimator, features, y, cv=skf, scoring="f1_macro")
        results[f"{label_name}_F1"] = cv_res["test_score"].mean()

    return results

In [5]:
# Cross-validation uses the `skf` splitter defined in the data-loading cell;
# it is not redefined here.

# Fresh accumulator for the comparison table; the dense cell appends to it too.
all_results = []

# Both sparse vectorizers share the same vocabulary cap and stop-word list.
sparse_vectorizer_kwargs = {"max_features": 5000, "stop_words": "english"}

# 1. CountVectorizer
res_count = run_baseline_experiment(X_train, y_clarity, y_evasion,
                                    vectorizer=CountVectorizer(**sparse_vectorizer_kwargs))
res_count['Embedding'] = 'CountVectorizer'
all_results.append(res_count)

# 2. TF-IDF
res_tfidf = run_baseline_experiment(X_train, y_clarity, y_evasion,
                                    vectorizer=TfidfVectorizer(**sparse_vectorizer_kwargs))
res_tfidf['Embedding'] = 'TF-IDF'
all_results.append(res_tfidf)



## Dense

In [4]:
def avg_embed(text, model, dim):
    """
    Average the embedding vectors of the known words in `text`.

    The text is lower-cased and split on whitespace. Each token that `model`
    contains (checked with `in`) is looked up with `model[token]`.

    Returns the element-wise mean of the found vectors, or a zero vector of
    length `dim` when no token is found.
    """
    found = []
    for token in text.lower().split():
        if token in model:
            found.append(model[token])

    if found:
        return np.mean(found, axis=0)

    # No known token (or empty text): fall back to an all-zero vector.
    return np.zeros(dim)

In [6]:
def get_dense_matrix(texts, model, dim):
    """
    Build a feature matrix with one averaged embedding per text.

    Each element of `texts` is embedded with `avg_embed(text, model, dim)` and
    the resulting vectors are stacked row-wise.

    Returns an array with one row per text. For an empty `texts` it returns an
    array of shape (0, dim) instead of raising from `np.vstack`.
    """
    rows = [avg_embed(t, model, dim) for t in texts]
    if not rows:
        # np.vstack raises on an empty sequence; return an empty matrix instead.
        return np.zeros((0, dim))
    return np.vstack(rows)

# Load the pretrained word-vector models via gensim's downloader API.
glove = api.load("glove-wiki-gigaword-100")
w2v = api.load("word2vec-google-news-300")
fast = api.load("fasttext-wiki-news-subwords-300")

# The embedding width is read from each model's `vector_size` rather than
# hard-coded, so the zero-vector fallback always matches the model.
# NOTE: this cell appends to `all_results`; re-running it without first
# re-running the sparse cell (which resets the list) duplicates rows.

# 3. GloVe
X_glove = get_dense_matrix(X_train, glove, glove.vector_size)
res_glove = run_baseline_experiment(None, y_clarity, y_evasion, dense_matrix=X_glove)
res_glove['Embedding'] = 'GloVe'
all_results.append(res_glove)

# 4. Word2Vec
X_w2v = get_dense_matrix(X_train, w2v, w2v.vector_size)
res_w2v = run_baseline_experiment(None, y_clarity, y_evasion, dense_matrix=X_w2v)
res_w2v['Embedding'] = 'Word2Vec'
all_results.append(res_w2v)

# 5. FastText
X_fast = get_dense_matrix(X_train, fast, fast.vector_size)
res_fast = run_baseline_experiment(None, y_clarity, y_evasion, dense_matrix=X_fast)
res_fast['Embedding'] = 'FastText'
all_results.append(res_fast)



In [7]:
# Collect the per-embedding scores into one frame, label column first.
df_results = pd.DataFrame(all_results).loc[:, ['Embedding', 'Clarity_F1', 'Evasion_F1']]
# Print the heading and the markdown-formatted table on consecutive lines.
print("\nBaseline Comparison Table:", df_results.to_markdown(index=False), sep="\n")


Baseline Comparison Table:
| Embedding       |   Clarity_F1 |   Evasion_F1 |
|:----------------|-------------:|-------------:|
| CountVectorizer |     0.494116 |    0.276242  |
| TF-IDF          |     0.408525 |    0.186013  |
| GloVe           |     0.379218 |    0.130786  |
| Word2Vec        |     0.388076 |    0.129144  |
| FastText        |     0.317322 |    0.0691809 |
