In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from openai import OpenAI
import os
import re
import time
import csv
from collections import Counter

In [2]:
def filter_dataframe_l(df, word_list):
    """Return the rows of ``df`` whose 'Text' value contains every word in ``word_list``.

    Each word is matched as a literal, case-insensitive substring (regex
    metacharacters in the word are escaped).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Text' column.
    word_list : iterable of str
        Substrings that must all be present for a row to be kept. An empty
        list keeps every row.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    # Compile once up front instead of once per row.
    patterns = [re.compile(re.escape(word), re.IGNORECASE) for word in word_list]

    def _contains_all(cell):
        # Non-string cells (NaN / None) cannot contain a word; treating them as
        # "no match" avoids the TypeError re.search would raise on them.
        return all(
            isinstance(cell, str) and pattern.search(cell) is not None
            for pattern in patterns
        )

    # astype(bool) guarantees a boolean mask even when the frame is empty.
    condition = df['Text'].map(_contains_all).astype(bool)

    return df[condition]

In [3]:
def process_text(text):
    """Normalise a sentence and wrap it as a one-element list literal string.

    Steps:
      1. Every comma, double quote and single quote is replaced by
         space + backslash + the character + space.
      2. The text is trimmed to the span from the first to the last
         "boundary" character (a letter, '<', '>' or '.'); if there is no
         such character the whole text is kept. Surrounding whitespace is
         then stripped.
      3. A '.' is appended and the result is wrapped between [' and '].

    Parameters
    ----------
    text : str
        Raw sentence, possibly containing <ARG0>/<ARG1>/<SIG0> style tags.

    Returns
    -------
    str
        The processed sentence between the two-character markers [' and '].
    """
    # Characters to check for
    punctuation_chars = (',', '"', "'")

    def _is_boundary(char):
        # Characters that may start or end the kept span.
        return char.isalpha() or char in ('<', '>', '.')

    # NOTE(review): a backslash is emitted before each punctuation character in
    # addition to the surrounding spaces. The original spelled this with the
    # invalid escape "\{" inside the f-string; it is written explicitly here
    # so the output is unchanged. Confirm the backslash is actually wanted.
    padded = ''.join(
        f' \\{char} ' if char in punctuation_chars else char
        for char in text
    )

    # Index of the first boundary character (0 when there is none).
    start_index = next(
        (i for i, char in enumerate(padded) if _is_boundary(char)),
        0,
    )

    # One past the index of the last boundary character (len when there is none).
    end_index = next(
        (i + 1 for i in range(len(padded) - 1, -1, -1) if _is_boundary(padded[i])),
        len(padded),
    )

    # Extract the span between the first and last boundary characters
    result = padded[start_index:end_index].strip()

    # NOTE(review): '.' is itself a boundary character, so input that already
    # ends with a period comes out with two. Kept as-is; confirm intent.
    result += '.'

    return "['{}']".format(result)

In [4]:
def remove_wrapping(text):
    """Strip a leading [' and trailing '] from ``text`` when both are present.

    Anything that is not a string, or a string that lacks either marker, is
    returned unchanged.
    """
    is_wrapped = (
        isinstance(text, str)
        and text.startswith("['")
        and text.endswith("']")
    )
    # Drop two characters from each end only when both markers were found.
    return text[2:-2] if is_wrapped else text

In [7]:
# Examples to be added to the corpus (presumably synthetic, going by the file
# name -- confirm). Each 'Text' value is normalised and wrapped by process_text.
df = pd.read_csv('CNC_synth_copy2.csv')
df['Text'] = df['Text'].map(process_text)

# Existing Causal News Corpus training split that the new rows are appended to.
df_o = pd.read_csv('CausalNewsCorpus/data/V2/train_subtask2_grouped.csv')

In [5]:
# Opening and closing annotation tags; used with filter_dataframe_l to keep
# only rows whose text contains all six of them.
filter_list = [
    '<ARG0>', '<ARG1>', '<SIG0>',
    '</ARG0>', '</ARG1>', '</SIG0>',
]

In [8]:

# ---------------------------------------------------------------------------
# Build rows in the layout of the original corpus and append them to it.
# Produces: df_ff (filtered rows + metadata), df_ff2 (final columns, cleaned
# tag spacing) and result_df2 (df_o followed by df_ff2).
# ---------------------------------------------------------------------------

# Column order of the rows appended to the original corpus.
OUTPUT_COLUMNS = ['corpus', 'doc_id', 'sent_id', 'eg_id', 'index', 'text',
                  'causal_text_w_pairs', 'num_rs']

# Literal (non-regex) spacing fixes for 'causal_text_w_pairs'.
# ORDER MATTERS: later entries rely on the output of earlier ones, so this list
# reproduces the original sequence of replacements exactly.
TAG_SPACING_FIXES = [
    ('> ', '>'),
    ('><', '> <'),
    ('</SIG0>', '</SIG0> '),
    (' </ARG0> </SIG0>', '</ARG0> </SIG0>'),
    (' </ARG0> <SIG0>', '</ARG0> <SIG0>'),
    (' </ARG1> <SIG0>', '</ARG1> <SIG0>'),
    ('<ARG0> <SIG0>', '<ARG0><SIG0>'),
    ('<ARG1> <SIG0>', '<ARG1><SIG0>'),
    (' </ARG0> <ARG1><SIG0>', '</ARG0> <ARG1><SIG0>'),
    (' </ARG0> <ARG1></SIG0>', '</ARG0> <ARG1></SIG0>'),
    (' </ARG0> </ARG1></SIG0>', '</ARG0> </ARG1></SIG0>'),
    ('</ARG0>', '</ARG0> '),
    ('</ARG1>', '</ARG1> '),
    ('</ARG1> <SIG0>', '</ARG1><SIG0>'),
    (' </ARG1> <ARG0><SIG0>', '</ARG1> <ARG0><SIG0>'),
]

# Keep only rows that contain every tag in filter_list.
df_ff = filter_dataframe_l(df, filter_list)

# Drop rows in which these tag pairs appear directly adjacent to each other.
# The patterns are plain substrings, hence regex=False.
for adjacent_tags in ('</ARG0><SIG0>', '</ARG0></SIG0>', '</ARG1><SIG0>'):
    df_ff = df_ff[~df_ff['Text'].str.contains(adjacent_tags, regex=False)]
# Work on an independent copy so the column assignments below are unambiguous.
df_ff = df_ff.copy()

# Running counters used to build the id columns.
# NOTE(review): the start values 10000 / 1000 / 100 are presumably chosen so
# the new ids do not collide with ids in the original corpus -- confirm.
n_rows = len(df_ff)
sent_ids = range(10000, 10000 + n_rows)
doc_numbers = range(1000, 1000 + n_rows)
index_numbers = range(100, 100 + n_rows)

# Adding a new column 'sent_id' starting from 10000 and incrementing by 1
df_ff['sent_id'] = sent_ids

# Adding a new column 'corpus' with constant value 'cnc'
df_ff['corpus'] = 'cnc'

# Adding a new column 'eg_id' with constant value 0
df_ff['eg_id'] = 0

# 'doc_id' looks like 'train_1000_10000'; both numbers increase by 1 per row.
df_ff['doc_id'] = [f'train_{i}_{j}' for i, j in zip(doc_numbers, sent_ids)]

# 'index' looks like 'cnc_train_100_1000_10000_10'; the three numbers increase
# by 1 per row and the trailing '_10' is constant.
df_ff['index'] = [
    f'cnc_train_{i}_{j}_{k}_10'
    for i, j, k in zip(index_numbers, doc_numbers, sent_ids)
]

df_ff = df_ff.rename(columns={'Text': 'causal_text_w_pairs'})

# Plain-text version of each sentence: the annotated text with all
# ARG0 / ARG1 / SIG0 tags removed.
df_ff['text'] = df_ff['causal_text_w_pairs'].str.replace(
    r'<(/?ARG[01]|/?SIG0)>', '', regex=True
)
df_ff['num_rs'] = 1

# Explicit copy: df_ff2 is modified below and must not be a view/alias of df_ff
# (the original aliased it and then assigned into a column-subset slice, which
# triggers pandas' SettingWithCopyWarning).
df_ff2 = df_ff[OUTPUT_COLUMNS].copy()

# NOTE(review): 'text' is run through process_text a second time here (its
# input already carries the wrapper and trailing period added when 'Text' was
# first processed); the wrapper is then removed again. Kept as in the original.
df_ff2['text'] = df_ff2['text'].apply(process_text).apply(remove_wrapping)
df_ff2['text'] = df_ff2['text'].str.replace(r'\s{2,}', ' ', regex=True)

# Collapse whitespace runs, then apply the ordered literal spacing fixes.
pairs_text = df_ff2['causal_text_w_pairs'].str.replace(r'\s{2,}', ' ', regex=True)
for old, new in TAG_SPACING_FIXES:
    pairs_text = pairs_text.str.replace(old, new, regex=False)
df_ff2['causal_text_w_pairs'] = pairs_text

print(len(df_ff2))
result_df2 = pd.concat([df_o, df_ff2], ignore_index=True)
#result_df2.to_csv('CNC_chatgpt+orig_output_no_space6.csv', index=None)

794
