In [1]:
import os
import sys

import pandas as pd
import pickle
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics


module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from data_input.load_data import load_corpus, load_queries

For reproducing this, download the respective data set splits from https://github.com/Blubberli/argmin2024-perspective/

In [2]:
corpus1 = load_corpus("../../data_release_1_with_labels")
corpus2 = load_corpus("../../data_release_2_with_labels")
corpus3 = load_corpus("../../data_release_3_with_labels")
corpus = pd.concat([corpus1, corpus2, corpus3])

In [50]:
queries_train_1 = load_queries("../../data_release_1_with_labels/", "perspective", "train")
queries_train_2 = load_queries("../../data_release_2_with_labels", "perspective", "train")
queries_train_3 = load_queries("../../data_release_3_with_labels", "perspective", "train")
queries_train = pd.concat([queries_train_1, queries_train_2, queries_train_3]).drop_duplicates(subset="query_id").reset_index()
queries_dev_1 = load_queries("../../data_release_1_with_labels", "perspective", "dev")
queries_dev_2 = load_queries("../../data_release_2_with_labels", "perspective", "dev")
queries_dev_3 = load_queries("../../data_release_3_with_labels", "perspective", "dev")
queries_dev = pd.concat([queries_dev_1, queries_dev_2, queries_dev_3]).drop_duplicates(subset="query_id").reset_index()

queries_train_bl1 = load_queries("../../data_release_1_with_labels/", "baseline", "train")
queries_train_bl2 = load_queries("../../data_release_2_with_labels", "baseline", "train")
queries_train_bl3 = load_queries("../../data_release_3_with_labels", "baseline", "train")
queries_train_bl = pd.concat([queries_train_bl1, queries_train_bl2, queries_train_bl3]).drop_duplicates(subset="query_id").reset_index()
queries_dev_bl1 = load_queries("../../data_release_1_with_labels", "baseline", "dev")
queries_dev_bl2 = load_queries("../../data_release_2_with_labels", "baseline", "dev")
queries_dev_bl3 = load_queries("../../data_release_3_with_labels", "baseline", "dev")
queries_dev_bl = pd.concat([queries_dev_bl1, queries_dev_bl2, queries_dev_bl3]).drop_duplicates(subset="query_id").reset_index()

In [3]:
corpus = corpus.drop_duplicates(subset="argument_id")

In [4]:
attr_val = {}
all_vals = []
for attribute in ['gender', 'age',
       'residence', 'civil_status', 'denomination', 'education',
       'political_spectrum']:
    if attribute == 'age':
        attr_val["age_bin"] = list(corpus[f"{attribute}"].unique())
        all_vals += attr_val["age_bin"]
    else:
        attr_val[attribute] = list(corpus[f"{attribute}"].unique())
        all_vals += attr_val[attribute]
vals = []
for i, row in corpus.iterrows():
    vals += row["important_political_issues"]
vals = list(set(vals))
all_vals += vals
attr_val["important_political_issue"] = list(vals)

Building one-hot encoding for the sociodemographic groups

In [5]:
enc_dict = {}
i = 0
for attribute in attr_val.keys():
    enc_dict[attribute] = {}
    for value in attr_val[attribute]:
        enc = [0 for _ in range(len(all_vals))]
        enc[i] = 1
        i+=1
        enc_dict[attribute][value] = enc

In [7]:
pickle.dump(enc_dict, open("enc_dict.pickle", "wb"))

Load data from disk, for details how the data set was created, consult `gather_dataset.py` 

In [2]:
df = pd.read_csv("train.tsv", sep="\t")

In [2]:
df = pd.read_csv("val.tsv", sep="\t")

In [3]:
df = df.sample(frac=1) # shuffle dataset
# df = df.sample(frac=0.2)

Preprocessing features

In [4]:
df["encoding"] = df["encoding"].apply(lambda x: [int(elem) for elem in x.replace("[", "").replace("]", "").split(", ")])

In [5]:
df["additional_feats"] = df["additional_feats"].apply(lambda x: [float(elem) for elem in x.replace("[", "").replace("]", "").split(", ")])

In [6]:
df["input"] = df["encoding"] + df["additional_feats"]

In [7]:
train, validate, test = np.split(df.sample(frac=1, random_state=42), [int(.6*len(df)), int(.8*len(df))])

In [9]:
clf = RandomForestClassifier()

In [10]:
clf.fit(list(train["input"].values), list(train["label"].values))

In [17]:
pickle.dump(clf, open("rf_classifier.pickle", "wb"))

In [7]:
clf = pickle.load(open("rf_classifier.pickle", "rb"))

In [14]:
pred = clf.predict(list(test["input"].values))

In [8]:
pred = clf.predict(list(df["input"].values))

In [21]:
metrics.classification_report(pred, list(test["label"].values)) # result using only 1/5 of the data

'              precision    recall  f1-score   support\n\n           0       0.60      0.60      0.60    219702\n           1       0.64      0.64      0.64    243636\n\n    accuracy                           0.62    463338\n   macro avg       0.62      0.62      0.62    463338\nweighted avg       0.62      0.62      0.62    463338\n'

In [15]:
metrics.classification_report(pred, list(test["label"].values)) # result using the full data

'              precision    recall  f1-score   support\n\n           0       0.63      0.57      0.60   1191113\n           1       0.59      0.64      0.61   1125574\n\n    accuracy                           0.60   2316687\n   macro avg       0.61      0.61      0.60   2316687\nweighted avg       0.61      0.60      0.60   2316687\n'

In [10]:
metrics.classification_report(pred, list(df["label"].values)) # result using the val dataset

'              precision    recall  f1-score   support\n\n           0       0.01      0.47      0.01     20823\n           1       0.99      0.53      0.69   2949123\n\n    accuracy                           0.53   2969946\n   macro avg       0.50      0.50      0.35   2969946\nweighted avg       0.99      0.53      0.69   2969946\n'