In [9]:
from collections import defaultdict
import xmltodict, csv, os, rstparse,json 
import pandas as pd
from sklearn.model_selection import KFold


In [3]:
data = open('data/comarg/UGIP.xml', 'r').read()
data = xmltodict.parse(data)

FileNotFoundError: [Errno 2] No such file or directory: 'data/comarg/UGIP.xml'

In [20]:
with open('UGIP_structured.csv', 'w', newline='') as csvfile:
    fieldnames = ['id', 'comment_text', 'argument_text', 'label']
    
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    
    writer.writeheader()

    for comarg in data['document']['unit']:
        com_id = comarg['@id']  # The full ID from the 'unit' element
        comment_text = comarg['comment']['text']  # Comment text inside <comment><text>
        argument_text = comarg['argument']['text']  # Argument text inside <argument><text>
        label = comarg['label']  # Label inside <label>

        writer.writerow({
            'id': com_id,
            'comment_text': comment_text,
            'argument_text': argument_text,
            'label': label
        })

In [43]:
folder = '../data/yru/reason/marijuana'
output_file = 'yru_marijuana.csv'

comment_counter = 1
unique_comments = {}

with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    
    csvwriter.writerow(['id', 'text', 'label', 'line'])
    
    for file_name in os.listdir(folder):
        if file_name.endswith('.rsn'):
            file_path = os.path.join(folder, file_name)
            rst = rstparse.Parser()
            
            text = ''
            labels = []
            lines = []

            with open(file_path, 'r', encoding='latin-1') as file:
                rst.read(file)
            rst.parse()

            for line in rst.lines:
                if not line.startswith('Label##') and not line.startswith('Line##'):
                    text += line + ' ' 

                elif line.startswith('Label##'):
                    labels.append(line.replace('Label##', '').strip())

                elif line.startswith('Line##'):
                    lines.append(line.replace('Line##', '').strip())

            cleaned_text = text.strip()

            if cleaned_text not in unique_comments:
                unique_comments[cleaned_text] = f'ma{comment_counter}'
                comment_counter += 1
            
            comment_id = unique_comments[cleaned_text]

            for label, line in zip(labels, lines):
                csvwriter.writerow([comment_id, cleaned_text, label, line])

## Add missing labels to GM and UGPI

## Get 0s in YRU corpora

In [4]:
topic_labels = {
    "abortion": [
        "p-right", "p-rape", "p-not_human", "p-mother_danger", "p-baby_ill_treatment", 
        "p-birth_ctrl", "p-not_murder", "p-sick_mom", "p-other",
        "c-adopt", "c-kill", "c-baby_right", "c-sex", "c-bad_4_mom", "c-other"
    ],
    "gayRights": [
        "p-normal", "p-right_denied", "p-no_threat_for_child", "p-born", "p-religion", 
        "p-Other", "c-religion", "c-abnormal", "c-threat_to_child", "c-gay_problems", "c-Other"
    ],
    "obama": [
        "p-economy", "p-War", "p-republicans", "p-decision_policies", "p-quality", 
        "p-health", "p-foreign_policies", "p-job", "p-Other",
        "c-economy", "c-War", "c-job", "c-health", "c-decision_policies", 
        "c-republicans", "c-quality", "c-foreign_policies", "c-Other"
    ],
    "marijuana": [
        "p-not_addictive", "p-medicine", "p-legal", "p-right", "p-no_damage", 
        "p-Other", "c-health", "c-mind", "c-illegal", "c-crime", "c-addiction", "c-Other"
    ]
} 

In [43]:
def find_missing_labels(input_file, topic):

    df = pd.read_csv(input_file)
    
    new_rows = []
    
    if topic not in topic_labels:
        raise ValueError(f"Topic '{topic}' not found in topic_labels.")
    
    for _, group in df.groupby(['id', 'text']):
        comment_id = group['id'].iloc[0]
        comment_text = group['text'].iloc[0]

        # Get existing labels for the current comment
        existing_labels = set(group['label'])

        # Iterate over all labels for the given topic
        for label in topic_labels[topic]:
            if label in existing_labels:
                # If the label is already present, mark it as 1
                new_rows.append({
                    'id': comment_id,
                    'text': comment_text,
                    'label': label,
                    'present': 1
                })
            else:
                # If the label is missing, add it with present = 0
                new_rows.append({
                    'id': comment_id,
                    'text': comment_text,
                    'label': label,
                    'present': 0
                })

    
    missing_labels_df = pd.DataFrame(new_rows)
    print(missing_labels_df)


    output_file = f'yru_{topic}_with_negatives.csv' 
    missing_labels_df.to_csv(output_file, index=False)


In [47]:
input_file = '/Users/guida/llm_argument_tasks/clean_data/yru_obama.csv'  
topic = 'obama' 

result_df = find_missing_labels(input_file, topic)

         id                                               text  \
0      oba1  Rave, you seem to know about as much about the...   
1      oba1  Rave, you seem to know about as much about the...   
2      oba1  Rave, you seem to know about as much about the...   
3      oba1  Rave, you seem to know about as much about the...   
4      oba1  Rave, you seem to know about as much about the...   
...     ...                                                ...   
7915  oba99  War in the Middle East.  so your mad that he's...   
7916  oba99  War in the Middle East.  so your mad that he's...   
7917  oba99  War in the Middle East.  so your mad that he's...   
7918  oba99  War in the Middle East.  so your mad that he's...   
7919  oba99  War in the Middle East.  so your mad that he's...   

                    label  present  
0               p-economy        0  
1                   p-War        0  
2           p-republicans        0  
3     p-decision_policies        0  
4               p-qual

In [59]:
golden_df = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/yru_abortion_with_negatives.csv')
golden_df = golden_df[golden_df['present'] == 0]
print(golden_df)
argument_labels = golden_df['label'].tolist() 
print(argument_labels)
with open('/Users/guida/llm_argument_tasks/output_files/llama3/yru_abortion_identification_with_negatives.json', 'r') as f:
    predictions = json.load(f)
    print(predictions)

# Add argument labels to predictions
"""ArithmeticErrorfor i, pred in enumerate(predictions):
    pred['argument'] = argument_labels[i]
    print(pred)
# Save the updated predictions
with open('/Users/guida/llm_argument_tasks/output_files/llama3/yru_obama_identification.json', 'w') as f:
   json.dump(predictions, f, indent=2)"""

        id                                               text  \
1      ab1  Once again...  1. In all instances abortion is...   
3      ab1  Once again...  1. In all instances abortion is...   
4      ab1  Once again...  1. In all instances abortion is...   
5      ab1  Once again...  1. In all instances abortion is...   
6      ab1  Once again...  1. In all instances abortion is...   
...    ...                                                ...   
6684  ab99  Plants have systems which are alike our nervou...   
6685  ab99  Plants have systems which are alike our nervou...   
6687  ab99  Plants have systems which are alike our nervou...   
6688  ab99  Plants have systems which are alike our nervou...   
6689  ab99  Plants have systems which are alike our nervou...   

                     label  present  
1                   p-rape        0  
3          p-mother_danger        0  
4     p-baby_ill_treatment        0  
5             p-birth_ctrl        0  
6             p-not_murder   

"ArithmeticErrorfor i, pred in enumerate(predictions):\n    pred['argument'] = argument_labels[i]\n    print(pred)\n# Save the updated predictions\nwith open('/Users/guida/llm_argument_tasks/output_files/llama3/yru_obama_identification.json', 'w') as f:\n   json.dump(predictions, f, indent=2)"

## Clean data for Task 2- get labels 3 out

In [9]:
gm = pd.read_csv("GM_structured.csv")

gm = gm[gm['label'] != 3]
gm.to_csv("GM_structured_no_3.csv")

In [11]:
ugip = pd.read_csv("UGIP_structured.csv")

ugip = ugip[ugip['label'] != 3]
ugip.to_csv("UGIP_structured_no_3.csv")

In [None]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_all_arguments.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/zero-shot/comarg_gm_argument_identification.jsonl'

evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

           id  label_gold  label_pred
0     100arg1           0           0
1     100arg2           0           0
2     100arg3           0           0
3     100arg4           1           1
4     100arg5           1           0
...       ...         ...         ...
1381    9arg3           0           0
1382    9arg4           0           1
1383    9arg5           1           0
1384    9arg6           1           1
1385    9arg7           0           0

[1386 rows x 3 columns]
              precision    recall  f1-score   support

           0       0.80      0.89      0.84       950
           1       0.67      0.50      0.58       436

    accuracy                           0.77      1386
   macro avg       0.73      0.70      0.71      1386
weighted avg       0.76      0.77      0.76      1386



## UGIP

In [None]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_all_arguments.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/zero-shot/comarg_ugip_argument_identification.jsonl'

evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.90      0.71      0.79      1778
           1       0.26      0.56      0.35       322

    accuracy                           0.68      2100
   macro avg       0.58      0.63      0.57      2100
weighted avg       0.80      0.68      0.72      2100



## Fine tuning BERT: preprocess COMARG and YRU with binary labels, one dataset

In [27]:
def map_labels(row):
    if row['label'] in [1, 2, 4, 5]:  # Check if label is in this list
        return 1
    elif row['label'] == 3:  # Check if label is 3
        return 0
    else:
        return row['label']

In [4]:
def map_labels_polarity(row):
    if row['label'] in [1, 2]:  # Map 1 and 2 to 1
        return 1
    elif row['label'] in [4, 5]:  # Map 4 and 5 to 5
        return 5
    else:
        return row['label']

In [31]:
yru_obama = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/yru_obama_with_negatives.csv')
yru_marijuana = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/yru_marijuana_with_negatives.csv')
yru_gayRights = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/yru_gayRights_with_negatives.csv')
yru_abortion = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/yru_abortion_with_negatives.csv')

UGIP_structured = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_main.csv')
GM_structured = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv')

yru_data = pd.concat([yru_obama, yru_marijuana, yru_gayRights, yru_abortion], ignore_index=True)

comarg_data = pd.concat([UGIP_structured, GM_structured], ignore_index=True)

comarg_df_renamed = comarg_data.rename(columns={
    'id': 'id',
    'comment_text': 'text',
    'argument_text': 'argument',
    'label': 'label'
})

# Rename the columns of yru_df to match the structure of comarg_df
yru_df_renamed = yru_data.rename(columns={
    'id': 'id',
    'text': 'text',
    'label': 'argument',  # corresponds to argument_text in comarg_df
    'present': 'label'    # corresponds to label in comarg_df
})

combined_df = pd.concat([comarg_df_renamed, yru_df_renamed], ignore_index=True)
combined_df = combined_df[combined_df['label'] != 3]
combined_df['label'] = combined_df.apply(map_labels, axis=1)


In [32]:
combined_df.to_csv('task1_finetune_data_no_pol_without3.csv')

In [22]:
arguments_list = combined_df['argument'].unique().tolist()
arguments_list

['Removing under god would promote religious tolerance',
 'Separation of state and religion',
 'America is based on democracy and the pledge should reflect the belief of the American majority.',
 'Under God  is part of American tradition and history',
 'Implies ultimate power on the part of the state',
 'Likely to be seen as a state sanctioned condemnation of religion',
 'It is discriminatory to refuse gay couples the right to marry',
 'Marriage should be between a man and a woman',
 'Major world religions are against gay marriages',
 'Gay marriage undermines the institution of marriage, leading to an increase in out of wedlock births and divorce rates',
 'Marriage is about more than procreation, therefore gay couples should not be denied the right to marry due to their biology.',
 'Gay couples can declare their union without resort to marriage',
 'Gay couples should be able to take advantage of the fiscal and legal benefits of marriage',
 'p-economy',
 'p-War',
 'p-republicans',
 'p-d

In [14]:
gm_data = pd.read_csv('GM_structured_no_3.csv', index_col=0)
ugip_data = pd.read_csv('UGIP_structured_no_3.csv',index_col=0)

In [15]:
task2_pol = pd.concat([gm_data, ugip_data], ignore_index=True)
task2_pol.to_csv('task2_pol_finetune.csv', index=False)

                id                                       comment_text  \
0            1arg4  I am pro because I believe, not only should he...   
1            1arg7  I am pro because I believe, not only should he...   
2            2arg7  In answer to It is wrong to create fatherless ...   
3            3arg5  There is a mixed up message in culture today t...   
4            3arg7  There is a mixed up message in culture today t...   
..             ...                                                ...   
753  414721686arg1  Religion has nothing in common with the basis ...   
754  414721686arg2  Religion has nothing in common with the basis ...   
755  414721686arg3  Religion has nothing in common with the basis ...   
756  414721954arg1  We are a nation formed by men of faith, reflec...   
757  414721680arg1  We should follow the Constitution and keep Chu...   

                                         argument_text  label  
0    It is discriminatory to refuse gay couples the...     

In [16]:
task2_pol['label'] = task2_pol.apply(map_labels_polarity, axis=1)
task2_pol.to_csv('task2_nopol_finetune.csv', index=False)

## Task 3 Roberta fine tuning

In [3]:
yru_obama = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/yru_obama_main.csv')
yru_marijuana = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/yru_marijuana_main.csv')
yru_gayRights = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/yru_gayRights_main.csv')
yru_abortion = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/yru_abortion_main.csv')

yru_data = yru_data = pd.concat([yru_obama, yru_marijuana, yru_gayRights, yru_abortion], ignore_index=True)

In [7]:
yru_data

Unnamed: 0,id,text,label,line,uid
0,oba1,"Rave, you seem to know about as much about the...",p-health,"Rave, you seem to know about as much about the...",oba1arg0
1,oba10,You have to support Obama by default. Yes it i...,p-quality,Obama is a teachable intellectual who has the ...,oba10arg0
2,oba10,You have to support Obama by default. Yes it i...,p-republicans,You have to support Obama by default. Yes it i...,oba10arg1
3,oba100,Seriously? Bush raised the debt by two BILLION...,c-economy,Bush raised the debt by two BILLION for the wa...,oba100arg0
4,oba101,Did anyone know that or joke for a president h...,c-economy,How is spending more money going to help our d...,oba101arg0
...,...,...,...,...,...
2823,ab97,HELL NO! IF YOU WERE GROWN ENOUGH TO SPREAD YO...,c-bad_4_mom,YOU WILL HAVE TO LIVE WITH THE GUILT FOREVER!!...,ab97arg0
2824,ab97,HELL NO! IF YOU WERE GROWN ENOUGH TO SPREAD YO...,c-kill,KILLING A INNOCENT BABY ISN'T GONNA JUST GO AW...,ab97arg1
2825,ab97,HELL NO! IF YOU WERE GROWN ENOUGH TO SPREAD YO...,c-sex,IF YOU WERE GROWN ENOUGH TO SPREAD YOUR FUCKIN...,ab97arg2
2826,ab98,"Its not a child , its a fetus. It has no feeli...",p-not_human,"Its not a child , its a fetus. It has no feeli...",ab98arg0


In [70]:
qa_data =[]

for _, row in yru_data.iterrows():
    qa_data.append({
        'id': row['id'],  # Use the ID from the row
        'context': row['text'],  # The context of the comment
        'question': row['label'],  # The label as the question
        'answers': [
            {
                'text': row['line'],  # The answer (line)
                'answer_start': row['text'].find(row['line'])  # Calculate the start position of the answer
            }
        ]
    })
    
with open('task3_qa_data.json', 'w') as f:
    json.dump(qa_data, f, indent=2)

In [71]:
with open("task3_qa_data.json", "r") as read_file:
    data = json.load(read_file)

# Initialize an empty list to hold the reformatted data
formatted_data = []

# Loop through each example in the original data
for item in data:
    # Reformat each item into the desired structure
    context = item['context']
    qas = []
    
    # Assuming there's only one question and one answer per item
    question = item['question']
    answer = item['answers'][0]['text']
    answer_start = item['answers'][0]['answer_start']
    
    # Each Q&A is formatted into a dictionary
    qas.append({
        'id': item['id'],
        'is_impossible': False,  # Set this to False as per your model format
        'question': question,
        'answers': [{'text': answer, 'answer_start': answer_start}]
    })
    
    # Append the formatted example to the list
    formatted_data.append({
        'context': context,
        'qas': qas
    })

# Save the formatted data into a JSON file
with open('task2_formatted.json', 'w') as f:
    json.dump(formatted_data, f, indent=2)

In [64]:
for fold, (train_idx, test_idx) in enumerate(kf.split(data), 1):
    train_data = [data[i] for i in train_idx]
    test_data = [data[i] for i in test_idx]
    
    print(f"Fold {fold}: Train data size: {len(train_data)}, Test data size: {len(test_data)}")
    # Check the first few entries of train and test to ensure they are correct
    print("Train sample:", train_data[:2])  # Check a few training examples
    print("Test sample:", test_data[:2])    # Check a few testing examples


Fold 1: Train data size: 2262, Test data size: 566
Train sample: [{'id': 'oba1', 'context': "Rave, you seem to know about as much about the Healthcare Bill as you do grammar.  This bill will  decrease  the national debt by half a trillion over ten years. It's  good  for the economy.  You see, uninsured people currently only have the option of going to an emergency room when very sick. This bill insures the uninsured, meaning that hospital visit will no longer be taken care of by tax payers. Next, insured people are more likely to see a doctor, which in turn leads to people catching diseases like cancer before they are life-threatening, and before they cost hundreds of thousands to treat. Which again saves money.  Then there is the whole human life thing. We are the only Western nation without some kind of Universal Healthcare, and we have the worst overall healthcare in the western world and we pay the most for that crappy healthcare.  link   You sound very uninformed. Maybe you're you