# Data Processing

## Import packages

In [42]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from argsum import get_quality_scores
from collections import defaultdict
from os import walk

[nltk_data] Downloading package punkt to /Users/moritz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## ArgKP21

In [None]:
# Load dataset
ArgKP21 = pd.read_csv('data/ArgKP-2021/dataset.csv')

##############################################
### Add train / dev / test set information ###
##############################################

# Add train/dev/test split information based on the KPA 2021 Shared Task data:
# a row is assigned to the split whose arguments file lists the row's topic.
topics_train = pd.read_csv('data/KPA_2021_shared_task/kpm_data/arguments_train.csv').topic.unique()
topics_dev = pd.read_csv('data/KPA_2021_shared_task/kpm_data/arguments_dev.csv').topic.unique()
topics_test = pd.read_csv('data/KPA_2021_shared_task/test_data/arguments_test.csv').topic.unique()
conditions = [
    ArgKP21.topic.isin(topics_train),
    ArgKP21.topic.isin(topics_dev),
    ArgKP21.topic.isin(topics_test)
    ]
values = ['train', 'dev', 'test']
# np.select falls back to the integer 0 when no default is given. Mixing an
# integer default with string choices is rejected by recent NumPy releases, so
# the fallback is passed as the string '0' -- the value older NumPy versions
# wrote for rows whose topic is in none of the three splits.
ArgKP21['set'] = np.select(conditions, values, default = '0')

##############################################
### Add IBM Project Debater's Quality Scores #
##############################################

# All arguments are sent together with their topics; n is the number of arguments.
arguments = ArgKP21['argument'].to_list()
quality_scores = get_quality_scores(model = 'debater_api',
                                    arguments = arguments,
                                    topic = ArgKP21['topic'].to_list(),
                                    sleep_time = 0,
                                    n = len(arguments))

# Append the scores as the last column
ArgKP21.insert(len(ArgKP21.columns), 'ibm_pro_deb_qs', quality_scores)

# Save the processed dataset
ArgKP21.to_csv('data/ArgKP-2021/dataset_splits_scores.csv', index = False)

In [None]:
# Load dataset
ArgKP21 = pd.read_csv('data/ArgKP-2021/dataset_splits_scores.csv')

##############################################
### Remove arguments that do not have ########
### exactly one matching key point or that ###
### consist of more than one sentence ########
##############################################

punkt_sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Walk over the rows of each distinct argument once. sort = False keeps the
# arguments in order of first appearance, so the output rows come out in the
# same order as a loop over ArgKP21['argument'].unique() would produce.
# Rows with a missing argument are skipped by groupby (they cannot be tokenized).
kept_rows = []
for arg, data in ArgKP21.groupby('argument', sort = False):
    # Keep single-sentence arguments only
    if len(punkt_sentence_tokenizer.tokenize(arg)) != 1:
        continue
    # Keep the argument only if its labels sum to exactly one, and then only
    # the row(s) carrying label 1
    if data['label'].sum() == 1:
        kept_rows.append(data[data['label'] == 1])

# Concatenate once at the end instead of growing a DataFrame inside the loop
if kept_rows:
    ArgKP21_splits_processed = pd.concat(kept_rows, ignore_index = True)
else:
    # Nothing survived the filter: still write a file with the expected header
    ArgKP21_splits_processed = pd.DataFrame(columns = ArgKP21.columns)
ArgKP21_splits_processed.to_csv('data/ArgKP-2021/dataset_splits_scores_processed.csv', index = False)

## ArgKP23

In [None]:
# Read the raw ArgKP-2023 data
ArgKP23 = pd.read_csv('data/ArgKP-2023/dataset.csv')

# ------------------------------------------------
# Add IBM Project Debater's quality scores
# ------------------------------------------------

# Every argument is sent together with the topic of its row;
# n is set to the total number of arguments.
argkp23_arguments = ArgKP23['argument'].to_list()
argkp23_topics = ArgKP23['topic'].to_list()
quality_scores = get_quality_scores(
    model='debater_api',
    arguments=argkp23_arguments,
    topic=argkp23_topics,
    sleep_time=0,
    n=len(argkp23_arguments),
)

# Append the scores as the right-most column
ArgKP23.insert(ArgKP23.shape[1], 'ibm_pro_deb_qs', quality_scores)

# Write the scored dataset (row index is not stored)
ArgKP23.to_csv('data/ArgKP-2023/dataset_scores.csv', index=False)

ArgumentQualityClient:   0%|          | 0/9281 [00:00<?, ?it/s]

ArgumentQualityClient: 100%|██████████| 9281/9281 [01:49<00:00, 84.62it/s] 


## ArgQ

In [None]:
# Read the raw IBM-ArgQ-Rank-30kArgs data
ArgQ = pd.read_csv('data/IBM-ArgQ-Rank-30kArgs/dataset.csv')

# ------------------------------------------------
# Add IBM Project Debater's quality scores
# ------------------------------------------------

# Every argument is sent together with the topic of its row;
# n is set to the total number of arguments.
argq_arguments = ArgQ['argument'].to_list()
argq_topics = ArgQ['topic'].to_list()
quality_scores = get_quality_scores(
    model='debater_api',
    arguments=argq_arguments,
    topic=argq_topics,
    sleep_time=0,
    n=len(argq_arguments),
)

# Append the scores as the right-most column
ArgQ.insert(ArgQ.shape[1], 'ibm_pro_deb_qs', quality_scores)

# Write the scored dataset (row index is not stored)
ArgQ.to_csv('data/IBM-ArgQ-Rank-30kArgs/dataset_scores.csv', index=False)

ArgumentQualityClient: 100%|██████████| 30497/30497 [06:07<00:00, 83.04it/s]


## Nuclear Energy (Summetix)

In [None]:
# Read the tab-separated Summetix nuclear energy data
nuclear_energy = pd.read_csv('data/Summetix/NuclearEnergy_clusteringEval.tsv', sep='\t')
# A single topic string is passed for all arguments of this dataset
topic = 'Nuclear Energy'

# ------------------------------------------------
# Add IBM Project Debater's quality scores
# ------------------------------------------------

# n is set to the total number of arguments
nuclear_arguments = nuclear_energy['argument'].to_list()
quality_scores = get_quality_scores(
    model='debater_api',
    arguments=nuclear_arguments,
    topic=topic,
    sleep_time=0,
    n=len(nuclear_arguments),
)

# Append the scores as the right-most column
nuclear_energy.insert(nuclear_energy.shape[1], 'ibm_pro_deb_qs', quality_scores)

# Write the scored dataset as comma-separated values (row index is not stored)
nuclear_energy.to_csv('data/Summetix/NuclearEnergy_clusteringEval_scores.csv', index=False)

ArgumentQualityClient: 100%|██████████| 339/339 [00:02<00:00, 126.16it/s]


## Debatepedia (Summetix)

In [None]:
# Read the tab-separated Summetix Debatepedia data
debatepedia = pd.read_csv('data/Summetix/debatepedia_processed_9Topics.tsv', sep='\t')

# ------------------------------------------------
# Add IBM Project Debater's quality scores
# (each argument paired with its 'topic' value)
# ------------------------------------------------

# n is set to the total number of arguments
debatepedia_arguments = debatepedia['argument'].to_list()
debatepedia_topics = debatepedia['topic'].to_list()
quality_scores = get_quality_scores(
    model='debater_api',
    arguments=debatepedia_arguments,
    topic=debatepedia_topics,
    sleep_time=0,
    n=len(debatepedia_arguments),
)

# Append the scores as the right-most column
debatepedia.insert(debatepedia.shape[1], 'ibm_pro_deb_qs', quality_scores)

# Write the scored dataset as comma-separated values (row index is not stored)
debatepedia.to_csv('data/Summetix/debatepedia_processed_9Topics_scores_topic.csv', index=False)

ArgumentQualityClient: 100%|██████████| 1804/1804 [00:20<00:00, 87.43it/s]


In [None]:
# Read the tab-separated Summetix Debatepedia data
debatepedia = pd.read_csv('data/Summetix/debatepedia_processed_9Topics.tsv', sep='\t')

# ------------------------------------------------
# Add IBM Project Debater's quality scores
# (each argument paired with its 'sub_topic' value)
# ------------------------------------------------

# n is set to the total number of arguments
debatepedia_arguments = debatepedia['argument'].to_list()
debatepedia_sub_topics = debatepedia['sub_topic'].to_list()
quality_scores = get_quality_scores(
    model='debater_api',
    arguments=debatepedia_arguments,
    topic=debatepedia_sub_topics,
    sleep_time=0,
    n=len(debatepedia_arguments),
)

# Append the scores as the right-most column
debatepedia.insert(debatepedia.shape[1], 'ibm_pro_deb_qs', quality_scores)

# Write the scored dataset as comma-separated values (row index is not stored)
debatepedia.to_csv('data/Summetix/debatepedia_processed_9Topics_scores_sub_topic.csv', index=False)

ArgumentQualityClient: 100%|██████████| 1804/1804 [00:20<00:00, 87.38it/s]


In [None]:
# Load dataset
debatepedia = pd.read_csv('data/Summetix/debatepedia_processed_9Topics.tsv', sep = '\t')
# Per-row topic string: '<topic> <sub_topic>'. Iterating both columns in
# lockstep avoids rebuilding the two full lists for every single row.
topic = [main_topic + ' ' + sub_topic
         for main_topic, sub_topic in zip(debatepedia['topic'], debatepedia['sub_topic'])]

##############################################
### Add IBM Project Debater's Quality Scores #
##############################################

# All arguments are sent together with their concatenated topic strings;
# n is the number of arguments.
arguments = debatepedia['argument'].to_list()
quality_scores = get_quality_scores(model = 'debater_api',
                                    arguments = arguments,
                                    topic = topic,
                                    sleep_time = 0,
                                    n = len(arguments))

# Append the scores as the last column
debatepedia.insert(len(debatepedia.columns), 'ibm_pro_deb_qs', quality_scores)

# Save the processed dataset
debatepedia.to_csv('data/Summetix/debatepedia_processed_9Topics_scores_concatenation.csv', index = False)

ArgumentQualityClient: 100%|██████████| 1804/1804 [00:20<00:00, 86.86it/s]


## Debate dataset
https://aclanthology.org/D14-1083/

In [None]:
# Define functions to create a pandas dataframe
def removeprefix(str, prefix):
    """Return `str` without a leading `prefix`.

    If `str` does not start with `prefix`, it is returned unchanged.
    Only a prefix at the very beginning is removed; occurrences further
    inside the string are left alone.
    """
    return str[len(prefix):] if str.startswith(prefix) else str

def get_args_kp_by_topic_debate():
    """Collect the Debate corpus arguments, grouped by topic and by reason.

    Step 1 reads every file directly inside 'data/Debate/reason/<topic>/'.
    A line containing 'Label##' sets the current label code; a line containing
    'Line##' is stored as an argument text under the current label code.

    Step 2 reads 'data/Debate/reason/labels/<topic>.txt'. Its non-blank lines
    are taken in pairs (label code, description) and each label code is renamed
    to its description. The codes 'p-other' / 'p-Other' and 'c-other' /
    'c-Other' are renamed to 'Pro Other' and 'Con Other' instead.

    Step 3 discards the 'Pro Other' and 'Con Other' buckets.

    Returns:
        A defaultdict mapping each topic folder name to a mapping from reason
        description to the list of argument texts collected for it.

    Raises:
        FileNotFoundError: if a labels file cannot be opened.
        KeyError: if a label code listed in a labels file has no entry among
            the codes collected for that topic.
    """
    path = 'data/Debate/reason/'
    folders = ['abortion', 'gayRights', 'marijuana', 'obama']
    # Label codes of the catch-all buckets and the names they are renamed to
    other_buckets = {
        'p-other': 'Pro Other',
        'p-Other': 'Pro Other',
        'c-other': 'Con Other',
        'c-Other': 'Con Other',
    }

    arguments_by_topic = defaultdict(dict)

    for folder in folders:
        # Files directly inside the topic folder; an empty list if walk yields nothing
        filenames = next(walk(path+folder), (None, None, []))[2]
        args_by_label = defaultdict(list)
        for filename in filenames:
            with open(path+folder+'/'+filename, encoding='utf-8', errors='ignore') as f:
                # Arguments seen before the first label line of a file are stored under None
                label = None
                for line in f:
                    if 'Label##' in line:
                        label = removeprefix(line, 'Label##').strip()
                    if 'Line##' in line:
                        args_by_label[label].append(removeprefix(line, 'Line##').strip())
        arguments_by_topic[folder] = args_by_label

    path = 'data/Debate/reason/labels/'
    for folder in folders:
        with open(path+folder+'.txt', encoding='utf-8', errors='ignore') as f:
            file_list = [line.strip() for line in f if line.strip()]
        args_by_label = arguments_by_topic[folder]
        # Even positions hold label codes, odd positions the matching descriptions;
        # a trailing unpaired line is ignored.
        for code, description in zip(file_list[0::2], file_list[1::2]):
            args_by_label[other_buckets.get(code, description)] = args_by_label.pop(code)

    for folder in folders:
        # The catch-all buckets are not wanted in the result; a topic that has
        # none of them is left as it is instead of raising KeyError.
        arguments_by_topic[folder].pop('Pro Other', None)
        arguments_by_topic[folder].pop('Con Other', None)

    return arguments_by_topic

In [None]:
# Build a lookup from summary text to stance (1 = pro, -1 = con) from the label files
path = 'data/Debate/reason/labels/'
folders = ['abortion', 'gayRights', 'marijuana', 'obama']
summaries = []
stances = []
stance_dict = {'p':1,'c':-1}
for folder in folders:
    with open(path+folder+'.txt', encoding='utf-8', errors='ignore') as f:
        for line in f:
            if line.startswith(('p-', 'c-')):
                # Label-code line: its first character gives the stance
                stances.append(stance_dict[line[0]])
            elif line != '\n':
                # Any other line that is not a bare newline is taken as a summary
                summaries.append(line.strip())
# Summaries and stances are paired purely by position.
# NOTE(review): this assumes every label-code line has exactly one summary line -- confirm against the label files.
summaries_with_stances = dict(zip(summaries, stances))
# Drop the catch-all 'Others' entry (the popped stance is shown as the cell output)
summaries_with_stances.pop('Others')

-1

In [None]:
# Load dataset
debate = get_args_kp_by_topic_debate()

# Build one small frame per (topic, summary) pair and concatenate them once at
# the end; growing a DataFrame with pd.concat inside the loop copies all rows
# collected so far on every iteration.
frames = []
for topic, args_by_summary in debate.items():
    for summary, arguments in args_by_summary.items():
        n_arguments = len(arguments)
        frames.append(pd.DataFrame({'topic': [topic] * n_arguments,
                                    'argument': arguments,
                                    'summary': [summary] * n_arguments,
                                    # Raises KeyError for a summary without a known stance
                                    'stance': [summaries_with_stances[summary]] * n_arguments}))
if frames:
    debate_df = pd.concat(frames, ignore_index = True)
else:
    # No topics / summaries at all: keep the expected columns
    debate_df = pd.DataFrame(columns = ['topic', 'argument', 'summary', 'stance'])

##############################################
### Add IBM Project Debater's Quality Scores #
##############################################

# All arguments are sent together with their topics; n is the number of arguments.
debate_arguments = debate_df['argument'].to_list()
quality_scores = get_quality_scores(model = 'debater_api',
                                    arguments = debate_arguments,
                                    topic = debate_df['topic'].to_list(),
                                    sleep_time = 0,
                                    n = len(debate_arguments))

# Append the scores as the last column and store the stance as an integer
debate_df.insert(len(debate_df.columns), 'ibm_pro_deb_qs', quality_scores)
debate_df = debate_df.astype({'stance':int})

# Save the processed dataset
debate_df.to_csv('data/Debate/dataset_scores.csv', index = False)

ArgumentQualityClient: 100%|██████████| 3228/3228 [00:37<00:00, 87.24it/s] 


In [None]:
# Load dataset
debate_scores = pd.read_csv('data/Debate/dataset_scores.csv')

##############################################
### Remove arguments that do not have ########
### exactly one matching key point or that ###
### consist of more than one sentence ########
##############################################

punkt_sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# An argument has exactly one matching key point here when its text occurs in
# exactly one row. A single duplicated() pass marks all repeated texts, which
# replaces one full scan of the frame per distinct argument.
appears_once = ~debate_scores['argument'].duplicated(keep = False)
candidates = debate_scores[appears_once]

# Of those, keep the arguments the Punkt tokenizer splits into exactly one sentence
is_single_sentence = pd.Series(
    [len(punkt_sentence_tokenizer.tokenize(arg)) == 1 for arg in candidates['argument']],
    index = candidates.index,
    dtype = bool)

# Rows stay in their original order; the index is renumbered from 0
debate_scores_processed = candidates[is_single_sentence].reset_index(drop = True)
debate_scores_processed.to_csv('data/Debate/dataset_scores_processed.csv', index = False)

In [None]:
# Load dataset
Debate_processed = pd.read_csv('data/Debate/dataset_scores_processed.csv')

# Draw half of the rows of every summary with a fixed seed, so the test split
# is reproducible. The comprehension variable stays local, so the builtin
# `sum` is no longer rebound at notebook scope.
summaries = Debate_processed['summary'].unique()
sampled_halves = [
    Debate_processed[Debate_processed['summary'] == summary].sample(frac = 0.5, random_state = 3845)
    for summary in summaries
]

# Concatenate once instead of growing the frame inside a loop
if sampled_halves:
    Debate_processed_test = pd.concat(sampled_halves)
else:
    Debate_processed_test = pd.DataFrame()

# The row index is written as well (no index = False here), so each test row
# keeps its position in the processed dataset.
Debate_processed_test.to_csv('data/Debate/dataset_scores_processed_test.csv')