# Methodology

## Data Preparation

In [2]:
import numpy as np
import pandas as pd
import gensim.downloader as api

from datasets import load_dataset
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.feature_extraction.text import CountVectorizer

# Notebook-wide seed; passed as `random_state` to the CV splitter and classifiers below.
SEED = 42
# Also seed NumPy's legacy global random generator.
np.random.seed(SEED)

In [3]:
# Load the QEvasion dataset through `datasets.load_dataset`.
ds = load_dataset("ailsntua/QEvasion")

# Encode the target as integer class ids on the train split, then cast the
# test split to the very same ClassLabel feature so both splits share one
# id <-> name mapping.
ds["train"] = ds["train"].class_encode_column("clarity_label")
ds["test"] = ds["test"].cast_column(
    "clarity_label",
    ds["train"].features["clarity_label"]
)

# Lookup tables between integer ids and label names.
label_feature = ds["train"].features["clarity_label"]
id2label = dict(enumerate(label_feature.names))
label2id = {name: idx for idx, name in id2label.items()}

print(id2label)

# Stratified 80/20 train/validation split of the original train split.
# Uses the notebook-wide SEED (instead of a second hard-coded 42) so a single
# constant controls every source of randomness in the notebook.
sub_split = ds["train"].train_test_split(
    test_size=0.2,
    stratify_by_column="clarity_label",
    seed=SEED
)

train_set = sub_split["train"]
val_set = sub_split["test"]
test_set = ds["test"]

# pandas copies of each split for the scikit-learn code below.
train_df = train_set.to_pandas()
val_df = val_set.to_pandas()
test_df = test_set.to_pandas()

print("Final Setup:")
print(f"Train size: {len(train_df)}")
print(f"Val size:   {len(val_df)}")
print(f"Test size:  {len(test_df)}")

train_df.head()

{0: 'Ambivalent', 1: 'Clear Non-Reply', 2: 'Clear Reply'}
Final Setup:
Train size: 2758
Val size:   690
Test size:  308


Unnamed: 0,title,date,president,url,question_order,interview_question,interview_answer,gpt3.5_summary,gpt3.5_prediction,question,annotator_id,annotator1,annotator2,annotator3,inaudible,multiple_questions,affirmative_questions,index,clarity_label,evasion_label
0,The President's News Conference With President...,"April 30, 2018",Donald J. Trump,https://www.presidency.ucsb.edu/documents/the-...,7,Q. The fight against corruption is one of the ...,Yes. We have actually discussed all of those t...,The question consists of one part:\n\n1. To wh...,Question part: 1. To what extent did you discu...,To what extent did you discuss the need to rep...,85,,,,False,False,False,1607,0,General
1,The President's News Conference in New York City,"September 26, 2018",Donald J. Trump,https://www.presidency.ucsb.edu/documents/the-...,11,Q. So you don't think anyone in your administr...,"I don't think so. Well, yes—enemies, sure. You...",The question consists of 1 part:\n\n1. Discuss...,Question part: 1. Discussing the usage of the ...,Discussing the usage of the 25th Amendment aga...,89,,,,False,False,True,1394,2,Explicit
2,The President's News Conference,"May 11, 2020",Donald J. Trump,https://www.presidency.ucsb.edu/documents/the-...,1,"Q. And what do you say, Mr. President, to othe...","Well, I think, you know, we have a lot of peop...",The question consists of 2 parts:\n\n1. What d...,Question part: 1. What does the President say ...,What does the President say to other companies...,85,,,,False,False,False,791,0,Dodging
3,The President's News Conference With Prime Min...,"November 16, 2011",Barack Obama,https://www.presidency.ucsb.edu/documents/the-...,3,"Q. Thanks, Ma'am. This is Mark Riley from 7New...","Good. Well, first of all, with respect to Indi...",The question consists of 2 parts:\n\nPart 1: S...,Question part: Part 1 - Significance of Austra...,Has this become too politically hard for you?,89,,,,False,False,False,2380,0,Dodging
4,The President's News Conference,"September 07, 2020",Donald J. Trump,https://www.presidency.ucsb.edu/documents/the-...,14,Q. ——why do you object to that being taught in...,"Yes, so—no, I want everybody to know everythin...","The question consists of 2 parts: \n1. ""Why do...","Question part: 1. ""Why do you object to that b...",Do you object to slavery itself being taught i...,85,,,,False,False,False,479,2,Explicit


### Filter other attributes
For this first, simple approach we use only the answer text (`interview_answer`) as input and the clarity label (`clarity_label`) as target; all other columns are ignored.

In [4]:
# For every split: the raw answer text is the model input and the encoded
# clarity label is the target.
X_train_raw, y_train = train_df["interview_answer"], train_df["clarity_label"]
X_val_raw, y_val = val_df["interview_answer"], val_df["clarity_label"]
X_test_raw, y_test = test_df["interview_answer"], test_df["clarity_label"]

# Show which class ids occur in the training target.
print("Classes:", y_train.unique())

Classes: [0 2 1]


## CV + Metric

In [5]:
# Shared evaluation protocol: the cells below pass this metric and this
# shuffled, stratified 5-fold splitter to `cross_validate`.
scoring = "f1_macro"
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

## Feature representation

### Sparse Vector

#### TF-IDF

In [6]:
# TF-IDF features (uni- and bigrams, English stop words removed) feeding a
# logistic-regression classifier.
tfidf_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 2),
        min_df=5,
        max_df=0.9,
        stop_words="english"
    )),
    # `multi_class` is deprecated since scikit-learn 1.5. With the lbfgs solver
    # and more than two classes the default is already multinomial, so the
    # argument is dropped to stay warning-free and forward-compatible.
    ("clf", LogisticRegression(
        max_iter=1000,
        random_state=SEED,
        solver="lbfgs"
    ))
])

In [7]:
# Cross-validate the TF-IDF pipeline on the raw training answers; the
# vectorizer is part of the pipeline, so it is refit inside every fold.
cv_tfidf = cross_validate(
    estimator=tfidf_pipeline,
    X=X_train_raw,
    y=y_train,
    scoring=scoring,
    cv=skf,
    return_train_score=False,
)

# Per-fold scores first, then their average.
print("TF-IDF CV F1-Macro per fold:", cv_tfidf["test_score"])
print("TF-IDF Mean F1-Macro:", cv_tfidf["test_score"].mean())



TF-IDF CV F1-Macro per fold: [0.36599401 0.39914357 0.43041538 0.4459225  0.44837533]
TF-IDF Mean F1-Macro: 0.4179701583709125


#### Count Vectorizer

In [8]:
# Raw term counts (same vocabulary settings as the TF-IDF pipeline) feeding
# a logistic-regression classifier.
count_pipeline = Pipeline([
    ("count", CountVectorizer(
        max_features=5000,
        ngram_range=(1, 2),
        min_df=5,
        max_df=0.9,
        stop_words="english"
    )),
    # `multi_class` is deprecated since scikit-learn 1.5. With the lbfgs solver
    # and more than two classes the default is already multinomial, so the
    # argument is dropped to stay warning-free and forward-compatible.
    ("clf", LogisticRegression(
        max_iter=1000,
        random_state=SEED,
        solver="lbfgs"
    ))
])

In [9]:
# Cross-validate the count-vectorizer pipeline with the same folds and metric
# as the TF-IDF run.
cv_count = cross_validate(
    estimator=count_pipeline,
    X=X_train_raw,
    y=y_train,
    scoring=scoring,
    cv=skf,
    return_train_score=False,
)

# Per-fold scores first, then their average.
print("Count CV F1-Macro per fold:", cv_count["test_score"])
print("Count Mean F1-Macro:", cv_count["test_score"].mean())



Count CV F1-Macro per fold: [0.48486604 0.4761468  0.50499297 0.51087814 0.49581335]
Count Mean F1-Macro: 0.4945394584990129


### Dense

In [10]:
def avg_embed(text, model, dim, lowercase=True):
    """Return the mean word vector of ``text``.

    The text is split on whitespace and every token present in ``model`` is
    looked up; the vectors found are averaged element-wise.

    Parameters
    ----------
    text : str or None
        Document to embed. Any non-string value (e.g. ``None`` or the ``NaN``
        pandas uses for missing text) is treated as an empty document.
    model : mapping-like
        Object supporting ``token in model`` and ``model[token]``.
    dim : int
        Length of the zero vector returned when no token is found.
    lowercase : bool, default True
        Lower-case the text before lookup. Pass ``False`` for models whose
        vocabulary is case-sensitive.

    Returns
    -------
    numpy.ndarray
        The averaged vector, or ``np.zeros(dim)`` when ``text`` is missing,
        empty, or contains no in-vocabulary token.
    """
    # Missing answers would otherwise crash on `.lower()`.
    if not isinstance(text, str):
        return np.zeros(dim)

    tokens = (text.lower() if lowercase else text).split()
    vecs = [model[tok] for tok in tokens if tok in model]

    # No known token: fall back to a zero vector so rows can still be stacked.
    if not vecs:
        return np.zeros(dim)

    return np.mean(vecs, axis=0)

In [11]:
# Classifier shared by the dense-embedding experiments that pass `dense_clf`
# to `cross_validate`.
# - `random_state` uses the notebook-wide SEED instead of a second literal 42.
# - `multi_class` is deprecated since scikit-learn 1.5; lbfgs already fits a
#   multinomial model for more than two classes, so the argument is dropped.
dense_clf = LogisticRegression(
    max_iter=1000,
    random_state=SEED,
    solver="lbfgs"
)

#### GloVe

In [12]:
# Load GloVe embeddings through the gensim downloader (imported as `api`).
# NOTE(review): presumably downloads the model on first use — confirm before
# running offline.
glove = api.load("glove-wiki-gigaword-100")

In [13]:
# Average-pool the GloVe vectors of each answer into one fixed-size row.
# The dimension is read from the loaded model instead of repeating the
# literal 100, so swapping the GloVe variant cannot desynchronise the two.
glove_dim = glove.vector_size

X_train_glove = np.vstack([
    avg_embed(t, glove, glove_dim) for t in X_train_raw
])

X_val_glove = np.vstack([
    avg_embed(t, glove, glove_dim) for t in X_val_raw
])

print(X_train_glove.shape)

(2758, 100)


In [14]:
# Logistic regression on the averaged GloVe features.
# `multi_class` is deprecated since scikit-learn 1.5; lbfgs already fits a
# multinomial model for more than two classes, so the argument is dropped.
glove_clf = LogisticRegression(
    max_iter=1000,
    random_state=SEED,
    solver="lbfgs"
)

# Same folds and metric as the sparse-feature experiments.
cv_glove = cross_validate(
    glove_clf,
    X_train_glove,
    y_train,
    cv=skf,
    scoring=scoring
)

print("glove CV F1-Macro per fold:", cv_glove["test_score"])
print("glove Mean F1-Macro:", cv_glove["test_score"].mean())



glove CV F1-Macro per fold: [0.34883095 0.3989301  0.36871941 0.40278687 0.37682343]
glove Mean F1-Macro: 0.3792181535731519




#### Word2Vec

In [15]:
# Load the "word2vec-google-news-300" vectors through the gensim downloader.
# NOTE(review): presumably a large download on first use — confirm.
w2v = api.load("word2vec-google-news-300")

In [16]:
# Average-pool the Word2Vec vectors of each training answer. The dimension is
# taken from the loaded model rather than a hard-coded 300.
# NOTE(review): `avg_embed` lower-cases text before lookup, while this model's
# vocabulary is presumably case-sensitive — confirm that is intended.
X_train_w2v = np.vstack([
    avg_embed(t, w2v, w2v.vector_size) for t in X_train_raw
])

In [17]:
# Cross-validate the shared dense classifier on the Word2Vec features.
# Uses the notebook-wide `scoring` variable (not a repeated "f1_macro"
# literal) so every experiment is guaranteed to report the same metric.
cv_w2v = cross_validate(
    dense_clf,
    X_train_w2v,
    y_train,
    cv=skf,
    scoring=scoring
)

print("Word2Vec CV per fold:", cv_w2v["test_score"])
print("Word2Vec mean:", cv_w2v["test_score"].mean())



Word2Vec CV per fold: [0.37214286 0.41996482 0.36398328 0.4101205  0.37416784]
Word2Vec mean: 0.388075859819823


#### FastText

In [18]:
# Load the "fasttext-wiki-news-subwords-300" vectors through the gensim downloader.
# NOTE(review): presumably a large download on first use — confirm.
fast = api.load("fasttext-wiki-news-subwords-300")

In [19]:
# Average-pool the FastText vectors of each training answer. The dimension is
# taken from the loaded model rather than a hard-coded 300.
X_train_fast = np.vstack([
    avg_embed(t, fast, fast.vector_size) for t in X_train_raw
])

In [20]:
# Cross-validate the shared dense classifier on the FastText features.
# Uses the notebook-wide `scoring` variable (not a repeated "f1_macro"
# literal) so every experiment is guaranteed to report the same metric.
cv_fast = cross_validate(
    dense_clf,
    X_train_fast,
    y_train,
    cv=skf,
    scoring=scoring
)

print("FastText CV per fold:", cv_fast["test_score"])
print("FastText mean:", cv_fast["test_score"].mean())



FastText CV per fold: [0.28793747 0.36654203 0.2808489  0.32594169 0.32534093]
FastText mean: 0.3173222035906951


## Comparison

In [21]:
# Per-fold GloVe scores, printed on their own (same values as reported in the GloVe section).
print(cv_glove["test_score"])

[0.34883095 0.3989301  0.36871941 0.40278687 0.37682343]


In [22]:
# Summary: mean cross-validated score of every feature representation, in
# the order the experiments were run.
print("\n===== CV RESULTS =====")
for label, cv_result in [
    ("TF-IDF  Mean F1:", cv_tfidf),
    ("CountVectorizer  Mean F1:", cv_count),
    ("glove   Mean F1:", cv_glove),
    ("W2V  :", cv_w2v),
    ("FAST :", cv_fast),
]:
    print(label, cv_result["test_score"].mean())


===== CV RESULTS =====
TF-IDF  Mean F1: 0.4179701583709125
CountVectorizer  Mean F1: 0.4945394584990129
glove   Mean F1: 0.3792181535731519
W2V  : 0.388075859819823
FAST : 0.3173222035906951
