# Approach
Our final system follows a three-step pipeline:
1) Sociodemographic selection:

    If the corpus provides sociodemographic information (i.e. in the explicit scenario), reduce the candidate pool to only those arguments that fit the sociodemographic target group.
2) Semantic selection: 

    Given a sentence representation (in our case from the multilingual SBERT model `paraphrase-multilingual-mpnet-base-v2`) of a query, select the 200 arguments with the most similar representations to the query.
    
3) Stylistic selection:

    Given a one-hot encoding of the sociodemographic target group and the stylistic features of a candidate argument, classify whether the argument's style is relevant to/fitting for the group, and keep only the arguments classified as fitting.


For reproduction, download the respective dataset per cycle from [https://github.com/Blubberli/argmin2024-perspective/](https://github.com/Blubberli/argmin2024-perspective/)

In [1]:
import os
import sys

import pandas as pd
import pickle
import readability
import spacy
from tqdm.notebook import tqdm

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from data_input.load_data import load_corpus, load_queries
from models.rankers import SentenceTransformerRanker
from models.utils import process_predictions

In [2]:
# Universal part-of-speech (UPOS) tags whose relative frequencies are used as
# stylistic features; the order of this list fixes the order of the POS columns.
upos_tags = [
    'ADJ',    # adjective
    'ADP',    # adposition
    'ADV',    # adverb
    'AUX',    # auxiliary
    'CCONJ',  # coordinating conjunction
    'DET',    # determiner
    'INTJ',   # interjection
    'NOUN',   # noun
    'NUM',    # numeral
    'PART',   # particle
    'PRON',   # pronoun
    'PROPN',  # proper noun
    'PUNCT',  # punctuation
    'SCONJ',  # subordinating conjunction
    'SYM',    # symbol
    'VERB',   # verb
    'X',      # other
]

# German spaCy pipeline shared by the feature-extraction functions in this
# notebook (requires the `de_core_news_sm` model package to be installed).
nlp = spacy.load("de_core_news_sm")


def pos_features(text):
    """Return the share of tokens carrying each universal POS tag.

    Args:
        text: Raw text; it is parsed with the module-level spaCy pipeline ``nlp``.

    Returns:
        A dict mapping every tag in ``upos_tags`` to ``count / number of tokens``.
        Tokens whose tag is not listed in ``upos_tags`` still count towards the
        token total. For a text without tokens every proportion is ``0.0``
        (previously this raised ``ZeroDivisionError``).
    """
    doc = nlp(text)
    doc_length = len(doc)
    if doc_length == 0:
        return {tag: 0.0 for tag in upos_tags}

    # Count all tags in a single pass instead of scanning the token list once per tag.
    tag_counts = dict.fromkeys(upos_tags, 0)
    for token in doc:
        if token.pos_ in tag_counts:
            tag_counts[token.pos_] += 1
    return {tag: count / doc_length for tag, count in tag_counts.items()}  # proportion of pos tags in text


def entity_feature(text):
    """Return the share of tokens that are part of a named entity.

    Args:
        text: Raw text; it is parsed with the module-level spaCy pipeline ``nlp``.

    Returns:
        Number of tokens tagged ``B`` (entity start) or ``I`` (inside an entity)
        divided by the number of tokens. A text without tokens yields ``0.0``
        (previously this raised ``ZeroDivisionError``).
    """
    doc = nlp(text)
    doc_length = len(doc)
    if doc_length == 0:
        return 0.0
    entity_tokens = sum(1 for token in doc if token.ent_iob_ in ('B', 'I'))
    return entity_tokens / doc_length  # proportion of entity tokens in text


def morph_features(text):
    """Return the share of tokens showing selected morphological features.

    Args:
        text: Raw text; it is parsed with the module-level spaCy pipeline ``nlp``.

    Returns:
        A dict with three token proportions:
        ``"past_tense"`` (tokens with ``Tense=Pres``), ``"imperative"`` (tokens
        with ``Mood=Imp``) and ``"first_person"`` (tokens with ``Person=1``).
        A text without tokens yields ``0.0`` for every key (previously this
        raised ``ZeroDivisionError``).
    """
    doc = nlp(text)
    doc_length = len(doc)
    if doc_length == 0:
        return {"past_tense": 0.0, "imperative": 0.0, "first_person": 0.0}

    doc_morph = [token.morph for token in doc]

    def share(feature):
        # Proportion of tokens whose morphological analysis contains `feature`.
        return sum(1 for morph in doc_morph if feature in morph) / doc_length

    # NOTE(review): the "past_tense" key actually holds the share of PRESENT-tense
    # tokens (Tense=Pres). Key and computation are kept as they are because the
    # pickled classifier was presumably trained on exactly this feature — confirm
    # before renaming the key or switching to Tense=Past.
    return {
        "past_tense": share("Tense=Pres"),
        "imperative": share("Mood=Imp"),
        "first_person": share("Person=1"),
    }


In [3]:
# Retriever used for the semantic selection step (project class from
# models.rankers; presumably wraps the SBERT model named in the introduction —
# confirm in models/rankers.py).
retrieval_ranker = SentenceTransformerRanker()

In [4]:
# Pre-trained artefacts of the stylistic selection step:
#   enc_dict: indexed below as enc_dict[attribute][value] to obtain the encoding
#             of a sociodemographic target group
#   clf:      classifier whose predict() labels (group encoding + style features)
# NOTE(security): unpickling executes arbitrary code — only load these files
# from a trusted source.
# Context managers make sure the file handles are closed again.
with open("enc_dict.pickle", "rb") as enc_file:
    enc_dict = pickle.load(enc_file)
with open("rf_classifier.pickle", "rb") as clf_file:
    clf = pickle.load(clf_file)

# First Evaluation Cycle

In [5]:
# Test queries in both formats plus the argument corpus of the first cycle.
queries_baseline, queries_perspective = (
    load_queries("../../data", scenario, "test")
    for scenario in ("baseline", "perspective")
)
corpus = load_corpus("../../data")

In [7]:
# Stylistic features of every argument (input of the stylistic classifier).

# Run the readability analysis once per argument and read all scores from that
# single result instead of repeating the analysis for each of the seven columns.
measures = corpus['argument'].apply(lambda text: readability.getmeasures(text, lang='de'))
for grade in ('FleschReadingEase', 'GunningFogIndex'):
    corpus[grade] = measures.apply(lambda m, key=grade: m['readability grades'][key])
for info in ('characters_per_word', 'words_per_sentence', 'type_token_ratio',
             'long_words', 'complex_words'):
    corpus[info] = measures.apply(lambda m, key=info: m['sentence info'][key])

# proportion of each POS tag, one column per tag
pos_table = pd.DataFrame(corpus['argument'].apply(pos_features).tolist(), index=corpus.index)
corpus = pd.concat([corpus, pos_table], axis=1)

# add column with entity ratio
corpus['Entities'] = corpus['argument'].apply(entity_feature)

# morphology features, one column per feature
morph_table = pd.DataFrame(corpus['argument'].apply(morph_features).tolist(), index=corpus.index)
corpus = pd.concat([corpus, morph_table], axis=1)

# numeric stance: 1 for "FAVOR", 0 for everything else
corpus['stance_num'] = corpus.stance.apply(lambda x: 1 if x == "FAVOR" else 0)

In [8]:
# Hand all corpus arguments to the ranker (presumably this embeds them for the
# later similarity search — confirm in models/rankers.py).
# NOTE(review): `fit_trainsform` is spelled the way the project class defines it
# (looks like a typo of "fit_transform"); rename only together with the class.
retrieval_ranker.fit_trainsform(corpus["argument"].values)

Batches:   0%|          | 0/1013 [00:00<?, ?it/s]

## Scenario 1: Baseline

In [9]:
# Baseline: semantic retrieval only (top_k=200 per query), written as JSON lines.
candidates = retrieval_ranker.rank(
    queries_baseline["text"].values,
    top_k=200,
)
predictions_baseline = process_predictions(
    corpus,
    queries_baseline,
    candidates,
)
pd.DataFrame(predictions_baseline).to_json(
    "../../data/submission/final_submissions/predictions_testset1_baseline.jsonl",
    orient="records",
    lines=True,
)

## Scenario 2: Explicit Perspectivism

In [10]:
# Sociodemographic selection index:
#   preselected_per_attribute[attribute][value] -> corpus index labels of the
#   arguments whose author has that value for the attribute.
preselected_per_attribute = {}
for attribute in ['gender', 'age',
       'residence', 'civil_status', 'denomination', 'education',
       'political_spectrum']:
    column = corpus[attribute]
    preselected_per_attribute[attribute] = {
        key: list(corpus.index[(column == key).to_numpy()])
        for key in column.unique()
    }

# "important_political_issues" holds several issues per argument, so every
# argument is registered under each of its issues. The index is built in one
# pass over the corpus instead of re-iterating all rows once per issue.
issue_index = {}
for i, issues in corpus["important_political_issues"].items():
    for val in dict.fromkeys(issues):  # each argument at most once per issue
        issue_index.setdefault(val, []).append(i)
preselected_per_attribute["important_political_issue"] = issue_index

In [11]:
# Explicit scenario: rank each query only against the arguments pre-selected
# for its sociodemographic target group.
pred = []
for _, query in queries_perspective.iterrows():
    # Only the first (attribute, value) pair of the demographic property is used.
    attribute, key = list(query["demographic_property"].items())[0]
    # Queries call the attribute "age_bin", the corpus column is named "age".
    attribute = "age" if attribute == "age_bin" else attribute
    allowed_ids = preselected_per_attribute[attribute][key]
    pred.extend(retrieval_ranker.rank([query["text"]], preselected_ids=allowed_ids, top_k=200))

In [12]:
# Convert the raw ranking output into prediction records; the records are read
# below via their "query_id" and "relevant_candidates" entries.
preselected = process_predictions(corpus, queries_perspective, pred)

In [17]:
# Stylistic selection: keep only those pre-selected arguments that the
# classifier labels as fitting (class 1) for the query's target group.
style_columns = [
    'FleschReadingEase', 'GunningFogIndex', 'characters_per_word', 'words_per_sentence',
    'type_token_ratio', 'long_words', 'complex_words',
    'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON',
    'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X',
    'Entities', 'past_tense', 'imperative', 'first_person', 'stance_num',
]
# Feature rows keyed by argument id (first row wins for duplicated ids), so a
# candidate is fetched by index instead of scanning the whole corpus each time.
style_features = corpus.drop_duplicates("argument_id").set_index("argument_id")[style_columns]

# Demographic property per query id (first occurrence wins), looked up once
# instead of filtering the query frame twice per query.
demographics = {}
for query_id, prop in zip(queries_perspective["query_id"], queries_perspective["demographic_property"]):
    demographics.setdefault(query_id, prop)

final_preds = []
for prediction in tqdm(preselected):
    # Only the first (attribute, value) pair of the demographic property is used.
    attribute, val = list(demographics[prediction["query_id"]].items())[0]
    encoding = enc_dict[attribute][val]
    candidate_ids = list(prediction["relevant_candidates"])
    relevant_candidates = []
    if candidate_ids:  # nothing to classify for an empty candidate list
        rows = style_features.loc[candidate_ids].values.tolist()
        # One batched classifier call per query instead of one call per candidate
        # (assumes predict() labels rows independently, as scikit-learn estimators do).
        labels = clf.predict([encoding + row for row in rows])
        relevant_candidates = [
            argument_id for argument_id, label in zip(candidate_ids, labels) if label == 1
        ]
    final_preds.append({"query_id": prediction["query_id"], "relevant_candidates": relevant_candidates})

  0%|          | 0/2358 [00:00<?, ?it/s]

In [19]:
# Write the explicit-scenario predictions of test set 1 as JSON lines (one record per query).
pd.DataFrame(final_preds).to_json("../../data/submission/final_submissions/predictions_testset1_explicit.jsonl", orient="records", lines=True)

## Scenario 3: Implicit Perspectivism

In [20]:
# Implicit scenario: no sociodemographic pre-filtering — every perspective query
# is ranked against the whole corpus (top_k=200).
candidates = retrieval_ranker.rank(queries_perspective["text"].values, top_k=200)
# Prediction records, read below via "query_id" and "relevant_candidates".
preselected = process_predictions(corpus, queries_perspective, candidates)

In [21]:
# Stylistic selection: keep only those retrieved arguments that the classifier
# labels as fitting (class 1) for the query's target group.
style_columns = [
    'FleschReadingEase', 'GunningFogIndex', 'characters_per_word', 'words_per_sentence',
    'type_token_ratio', 'long_words', 'complex_words',
    'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON',
    'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X',
    'Entities', 'past_tense', 'imperative', 'first_person', 'stance_num',
]
# Feature rows keyed by argument id (first row wins for duplicated ids), so a
# candidate is fetched by index instead of scanning the whole corpus each time.
style_features = corpus.drop_duplicates("argument_id").set_index("argument_id")[style_columns]

# Demographic property per query id (first occurrence wins), looked up once
# instead of filtering the query frame twice per query.
demographics = {}
for query_id, prop in zip(queries_perspective["query_id"], queries_perspective["demographic_property"]):
    demographics.setdefault(query_id, prop)

final_preds = []
for prediction in tqdm(preselected):
    # Only the first (attribute, value) pair of the demographic property is used.
    attribute, val = list(demographics[prediction["query_id"]].items())[0]
    encoding = enc_dict[attribute][val]
    candidate_ids = list(prediction["relevant_candidates"])
    relevant_candidates = []
    if candidate_ids:  # nothing to classify for an empty candidate list
        rows = style_features.loc[candidate_ids].values.tolist()
        # One batched classifier call per query instead of one call per candidate
        # (assumes predict() labels rows independently, as scikit-learn estimators do).
        labels = clf.predict([encoding + row for row in rows])
        relevant_candidates = [
            argument_id for argument_id, label in zip(candidate_ids, labels) if label == 1
        ]
    final_preds.append({"query_id": prediction["query_id"], "relevant_candidates": relevant_candidates})

  0%|          | 0/2358 [00:00<?, ?it/s]

In [22]:
# Write the implicit-scenario predictions of test set 1 as JSON lines (one record per query).
pd.DataFrame(final_preds).to_json("../../data/submission/final_submissions/predictions_testset1_implicit.jsonl", orient="records", lines=True)

# Second Evaluation Cycle

In [23]:
# Test queries in both formats plus the argument corpus of the second cycle.
queries_baseline, queries_perspective = (
    load_queries("../../data/test_cycle_2", scenario, "test")
    for scenario in ("baseline", "perspective")
)
corpus = load_corpus("../../data/test_cycle_2")

In [24]:
# Stylistic features of every argument (input of the stylistic classifier).

# Run the readability analysis once per argument and read all scores from that
# single result instead of repeating the analysis for each of the seven columns.
measures = corpus['argument'].apply(lambda text: readability.getmeasures(text, lang='de'))
for grade in ('FleschReadingEase', 'GunningFogIndex'):
    corpus[grade] = measures.apply(lambda m, key=grade: m['readability grades'][key])
for info in ('characters_per_word', 'words_per_sentence', 'type_token_ratio',
             'long_words', 'complex_words'):
    corpus[info] = measures.apply(lambda m, key=info: m['sentence info'][key])

# proportion of each POS tag, one column per tag
pos_table = pd.DataFrame(corpus['argument'].apply(pos_features).tolist(), index=corpus.index)
corpus = pd.concat([corpus, pos_table], axis=1)

# add column with entity ratio
corpus['Entities'] = corpus['argument'].apply(entity_feature)

# morphology features, one column per feature
morph_table = pd.DataFrame(corpus['argument'].apply(morph_features).tolist(), index=corpus.index)
corpus = pd.concat([corpus, morph_table], axis=1)

# numeric stance: 1 for "FAVOR", 0 for everything else
corpus['stance_num'] = corpus.stance.apply(lambda x: 1 if x == "FAVOR" else 0)

In [25]:
# Hand the second-cycle corpus arguments to the ranker (presumably this embeds
# them for the later similarity search — confirm in models/rankers.py).
# NOTE(review): `fit_trainsform` is spelled the way the project class defines it
# (looks like a typo of "fit_transform"); rename only together with the class.
retrieval_ranker.fit_trainsform(corpus["argument"].values)

Batches:   0%|          | 0/1222 [00:00<?, ?it/s]

## Scenario 1: Baseline

In [26]:
# Baseline: semantic retrieval only (top_k=200 per query), written as JSON lines.
candidates = retrieval_ranker.rank(
    queries_baseline["text"].values,
    top_k=200,
)
predictions_baseline = process_predictions(
    corpus,
    queries_baseline,
    candidates,
)
pd.DataFrame(predictions_baseline).to_json(
    "../../data/submission/final_submissions/predictions_testset2_baseline.jsonl",
    orient="records",
    lines=True,
)

## Scenario 2: Explicit Perspectivism

In [27]:
# --- Step 1: sociodemographic selection index --------------------------------
#   preselected_per_attribute[attribute][value] -> corpus index labels of the
#   arguments whose author has that value for the attribute.
preselected_per_attribute = {}
for attribute in ['gender', 'age',
       'residence', 'civil_status', 'denomination', 'education',
       'political_spectrum']:
    column = corpus[attribute]
    preselected_per_attribute[attribute] = {
        key: list(corpus.index[(column == key).to_numpy()])
        for key in column.unique()
    }

# "important_political_issues" holds several issues per argument, so every
# argument is registered under each of its issues. The index is built in one
# pass over the corpus instead of re-iterating all rows once per issue.
issue_index = {}
for i, issues in corpus["important_political_issues"].items():
    for val in dict.fromkeys(issues):  # each argument at most once per issue
        issue_index.setdefault(val, []).append(i)
preselected_per_attribute["important_political_issue"] = issue_index

# --- Step 2: semantic selection within the target group ----------------------
pred = []
for i, query in queries_perspective.iterrows():
    # Only the first (attribute, value) pair of the demographic property is used.
    attribute, key = list(query["demographic_property"].items())[0]
    if attribute == "age_bin":  # queries say "age_bin", the corpus column is "age"
        attribute = "age"
    pred += retrieval_ranker.rank([query["text"]], preselected_ids=preselected_per_attribute[attribute][key], top_k=200)

preselected = process_predictions(corpus, queries_perspective, pred)

# --- Step 3: stylistic selection ----------------------------------------------
style_columns = [
    'FleschReadingEase', 'GunningFogIndex', 'characters_per_word', 'words_per_sentence',
    'type_token_ratio', 'long_words', 'complex_words',
    'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON',
    'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X',
    'Entities', 'past_tense', 'imperative', 'first_person', 'stance_num',
]
# Feature rows keyed by argument id (first row wins for duplicated ids), so a
# candidate is fetched by index instead of scanning the whole corpus each time.
style_features = corpus.drop_duplicates("argument_id").set_index("argument_id")[style_columns]

# Demographic property per query id (first occurrence wins), looked up once
# instead of filtering the query frame twice per query.
demographics = {}
for query_id, prop in zip(queries_perspective["query_id"], queries_perspective["demographic_property"]):
    demographics.setdefault(query_id, prop)

final_preds = []
for prediction in tqdm(preselected):
    attribute, val = list(demographics[prediction["query_id"]].items())[0]
    encoding = enc_dict[attribute][val]
    candidate_ids = list(prediction["relevant_candidates"])
    relevant_candidates = []
    if candidate_ids:  # nothing to classify for an empty candidate list
        rows = style_features.loc[candidate_ids].values.tolist()
        # One batched classifier call per query instead of one call per candidate
        # (assumes predict() labels rows independently, as scikit-learn estimators do).
        labels = clf.predict([encoding + row for row in rows])
        relevant_candidates = [
            argument_id for argument_id, label in zip(candidate_ids, labels) if label == 1
        ]
    final_preds.append({"query_id": prediction["query_id"], "relevant_candidates": relevant_candidates})

pd.DataFrame(final_preds).to_json("../../data/submission/final_submissions/predictions_testset2_explicit.jsonl", orient="records", lines=True)

  0%|          | 0/1782 [00:00<?, ?it/s]

## Scenario 3: Implicit Perspectivism

In [28]:
# Implicit scenario: no sociodemographic pre-filtering — every perspective query
# is ranked against the whole corpus (top_k=200).
candidates = retrieval_ranker.rank(queries_perspective["text"].values, top_k=200)
preselected = process_predictions(corpus, queries_perspective, candidates)

# Stylistic selection: keep only those retrieved arguments that the classifier
# labels as fitting (class 1) for the query's target group.
style_columns = [
    'FleschReadingEase', 'GunningFogIndex', 'characters_per_word', 'words_per_sentence',
    'type_token_ratio', 'long_words', 'complex_words',
    'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON',
    'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X',
    'Entities', 'past_tense', 'imperative', 'first_person', 'stance_num',
]
# Feature rows keyed by argument id (first row wins for duplicated ids), so a
# candidate is fetched by index instead of scanning the whole corpus each time.
style_features = corpus.drop_duplicates("argument_id").set_index("argument_id")[style_columns]

# Demographic property per query id (first occurrence wins), looked up once
# instead of filtering the query frame twice per query.
demographics = {}
for query_id, prop in zip(queries_perspective["query_id"], queries_perspective["demographic_property"]):
    demographics.setdefault(query_id, prop)

final_preds = []
for prediction in tqdm(preselected):
    # Only the first (attribute, value) pair of the demographic property is used.
    attribute, val = list(demographics[prediction["query_id"]].items())[0]
    encoding = enc_dict[attribute][val]
    candidate_ids = list(prediction["relevant_candidates"])
    relevant_candidates = []
    if candidate_ids:  # nothing to classify for an empty candidate list
        rows = style_features.loc[candidate_ids].values.tolist()
        # One batched classifier call per query instead of one call per candidate
        # (assumes predict() labels rows independently, as scikit-learn estimators do).
        labels = clf.predict([encoding + row for row in rows])
        relevant_candidates = [
            argument_id for argument_id, label in zip(candidate_ids, labels) if label == 1
        ]
    final_preds.append({"query_id": prediction["query_id"], "relevant_candidates": relevant_candidates})

pd.DataFrame(final_preds).to_json("../../data/submission/final_submissions/predictions_testset2_implicit.jsonl", orient="records", lines=True)

  0%|          | 0/1782 [00:00<?, ?it/s]

# Final Evaluation Cycle

In [29]:
# Test queries in both formats plus the argument corpus of the final cycle.
queries_baseline, queries_perspective = (
    load_queries("../../data/2023-surprise-data", scenario, "test")
    for scenario in ("baseline", "perspective")
)
corpus = load_corpus("../../data/2023-surprise-data")

In [30]:
# Stylistic features of every argument (input of the stylistic classifier).

# Run the readability analysis once per argument and read all scores from that
# single result instead of repeating the analysis for each of the seven columns.
measures = corpus['argument'].apply(lambda text: readability.getmeasures(text, lang='de'))
for grade in ('FleschReadingEase', 'GunningFogIndex'):
    corpus[grade] = measures.apply(lambda m, key=grade: m['readability grades'][key])
for info in ('characters_per_word', 'words_per_sentence', 'type_token_ratio',
             'long_words', 'complex_words'):
    corpus[info] = measures.apply(lambda m, key=info: m['sentence info'][key])

# proportion of each POS tag, one column per tag
pos_table = pd.DataFrame(corpus['argument'].apply(pos_features).tolist(), index=corpus.index)
corpus = pd.concat([corpus, pos_table], axis=1)

# add column with entity ratio
corpus['Entities'] = corpus['argument'].apply(entity_feature)

# morphology features, one column per feature
morph_table = pd.DataFrame(corpus['argument'].apply(morph_features).tolist(), index=corpus.index)
corpus = pd.concat([corpus, morph_table], axis=1)

# numeric stance: 1 for "FAVOR", 0 for everything else
corpus['stance_num'] = corpus.stance.apply(lambda x: 1 if x == "FAVOR" else 0)

In [31]:
# Hand the final-cycle corpus arguments to the ranker (presumably this embeds
# them for the later similarity search — confirm in models/rankers.py).
# NOTE(review): `fit_trainsform` is spelled the way the project class defines it
# (looks like a typo of "fit_transform"); rename only together with the class.
retrieval_ranker.fit_trainsform(corpus["argument"].values)

Batches:   0%|          | 0/897 [00:00<?, ?it/s]

## Scenario 1: Baseline

In [32]:
# Baseline: semantic retrieval only (top_k=200 per query), written as JSON lines.
candidates = retrieval_ranker.rank(
    queries_baseline["text"].values,
    top_k=200,
)
predictions_baseline = process_predictions(
    corpus,
    queries_baseline,
    candidates,
)
pd.DataFrame(predictions_baseline).to_json(
    "../../data/submission/final_submissions/predictions_testset3_baseline.jsonl",
    orient="records",
    lines=True,
)

## Scenario 2: Explicit Perspectivism

In [33]:
# --- Step 1: sociodemographic selection index --------------------------------
#   preselected_per_attribute[attribute][value] -> corpus index labels of the
#   arguments whose author has that value for the attribute.
preselected_per_attribute = {}
for attribute in ['gender', 'age',
       'residence', 'civil_status', 'denomination', 'education',
       'political_spectrum']:
    column = corpus[attribute]
    preselected_per_attribute[attribute] = {
        key: list(corpus.index[(column == key).to_numpy()])
        for key in column.unique()
    }

# "important_political_issues" holds several issues per argument, so every
# argument is registered under each of its issues. The index is built in one
# pass over the corpus instead of re-iterating all rows once per issue.
issue_index = {}
for i, issues in corpus["important_political_issues"].items():
    for val in dict.fromkeys(issues):  # each argument at most once per issue
        issue_index.setdefault(val, []).append(i)
preselected_per_attribute["important_political_issue"] = issue_index

# --- Step 2: semantic selection within the target group ----------------------
pred = []
for i, query in queries_perspective.iterrows():
    # Only the first (attribute, value) pair of the demographic property is used.
    attribute, key = list(query["demographic_property"].items())[0]
    if attribute == "age_bin":  # queries say "age_bin", the corpus column is "age"
        attribute = "age"
    pred += retrieval_ranker.rank([query["text"]], preselected_ids=preselected_per_attribute[attribute][key], top_k=200)

preselected = process_predictions(corpus, queries_perspective, pred)

# --- Step 3: stylistic selection ----------------------------------------------
style_columns = [
    'FleschReadingEase', 'GunningFogIndex', 'characters_per_word', 'words_per_sentence',
    'type_token_ratio', 'long_words', 'complex_words',
    'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON',
    'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X',
    'Entities', 'past_tense', 'imperative', 'first_person', 'stance_num',
]
# Feature rows keyed by argument id (first row wins for duplicated ids), so a
# candidate is fetched by index instead of scanning the whole corpus each time.
style_features = corpus.drop_duplicates("argument_id").set_index("argument_id")[style_columns]

# Demographic property per query id (first occurrence wins), looked up once
# instead of filtering the query frame twice per query.
demographics = {}
for query_id, prop in zip(queries_perspective["query_id"], queries_perspective["demographic_property"]):
    demographics.setdefault(query_id, prop)

final_preds = []
for prediction in tqdm(preselected):
    attribute, val = list(demographics[prediction["query_id"]].items())[0]
    encoding = enc_dict[attribute][val]
    candidate_ids = list(prediction["relevant_candidates"])
    relevant_candidates = []
    if candidate_ids:  # nothing to classify for an empty candidate list
        rows = style_features.loc[candidate_ids].values.tolist()
        # One batched classifier call per query instead of one call per candidate
        # (assumes predict() labels rows independently, as scikit-learn estimators do).
        labels = clf.predict([encoding + row for row in rows])
        relevant_candidates = [
            argument_id for argument_id, label in zip(candidate_ids, labels) if label == 1
        ]
    final_preds.append({"query_id": prediction["query_id"], "relevant_candidates": relevant_candidates})

pd.DataFrame(final_preds).to_json("../../data/submission/final_submissions/predictions_testset3_explicit.jsonl", orient="records", lines=True)

  0%|          | 0/729 [00:00<?, ?it/s]

## Scenario 3: Implicit Perspectivism

In [34]:
# Implicit scenario: no sociodemographic pre-filtering — every perspective query
# is ranked against the whole corpus (top_k=200).
candidates = retrieval_ranker.rank(queries_perspective["text"].values, top_k=200)
preselected = process_predictions(corpus, queries_perspective, candidates)

# Stylistic selection: keep only those retrieved arguments that the classifier
# labels as fitting (class 1) for the query's target group.
style_columns = [
    'FleschReadingEase', 'GunningFogIndex', 'characters_per_word', 'words_per_sentence',
    'type_token_ratio', 'long_words', 'complex_words',
    'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON',
    'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X',
    'Entities', 'past_tense', 'imperative', 'first_person', 'stance_num',
]
# Feature rows keyed by argument id (first row wins for duplicated ids), so a
# candidate is fetched by index instead of scanning the whole corpus each time.
style_features = corpus.drop_duplicates("argument_id").set_index("argument_id")[style_columns]

# Demographic property per query id (first occurrence wins), looked up once
# instead of filtering the query frame twice per query.
demographics = {}
for query_id, prop in zip(queries_perspective["query_id"], queries_perspective["demographic_property"]):
    demographics.setdefault(query_id, prop)

final_preds = []
for prediction in tqdm(preselected):
    # Only the first (attribute, value) pair of the demographic property is used.
    attribute, val = list(demographics[prediction["query_id"]].items())[0]
    encoding = enc_dict[attribute][val]
    candidate_ids = list(prediction["relevant_candidates"])
    relevant_candidates = []
    if candidate_ids:  # nothing to classify for an empty candidate list
        rows = style_features.loc[candidate_ids].values.tolist()
        # One batched classifier call per query instead of one call per candidate
        # (assumes predict() labels rows independently, as scikit-learn estimators do).
        labels = clf.predict([encoding + row for row in rows])
        relevant_candidates = [
            argument_id for argument_id, label in zip(candidate_ids, labels) if label == 1
        ]
    final_preds.append({"query_id": prediction["query_id"], "relevant_candidates": relevant_candidates})

pd.DataFrame(final_preds).to_json("../../data/submission/final_submissions/predictions_testset3_implicit.jsonl", orient="records", lines=True)

  0%|          | 0/729 [00:00<?, ?it/s]