### Creating the datasets

In [2]:
import pandas as pd

# Read the tab-separated source file into a DataFrame.
file_path = "../data/cad_v1_1.tsv"
df = pd.read_csv(file_path, sep="\t")


In [6]:
def fetch_parent_info(row, df, parent_infos, level=0):
    """Collect the text and author of every ancestor of ``row`` into ``parent_infos``.

    Starting from ``row['info_id.parent']``, the matching row of ``df`` (matched
    on the ``info_id`` column) is looked up. When the id itself is not present,
    the ``"<id>-title"`` and then the ``"<id>-post"`` variants are tried. For each
    ancestor found, ``parent_text_level_<n>`` and ``parent_user_level_<n>`` are
    written to ``parent_infos`` from that row's ``meta_text`` and ``meta_author``,
    and the walk continues from the ancestor's own ``info_id.parent``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with an
            ``'info_id.parent'`` entry and, optionally, an ``'info_id'`` entry.
        df: DataFrame with ``info_id``, ``info_id.parent``, ``meta_text`` and
            ``meta_author`` columns.
        parent_infos: Dict that is filled in place.
        level: Number used for the first ancestor's keys; each further
            ancestor uses the next integer.

    Returns:
        None. Results are delivered through ``parent_infos``.
    """
    # Build the id set once per call instead of rescanning the column for
    # every membership test at every level.
    known_ids = set(df['info_id'])
    # Ids already on the current chain; seeded with the starting row so that a
    # link resolving back to it is recognised as a cycle.
    seen = {row.get('info_id')}
    current = row
    while True:
        parent_id = current['info_id.parent']
        if pd.isna(parent_id):
            # Missing parent: stop here rather than probing "nan-title"/"nan-post".
            return
        if parent_id not in known_ids:
            # Fall back to the suffixed variants, '-title' first, then '-post'.
            for suffix in ('-title', '-post'):
                candidate = f"{parent_id}{suffix}"
                if candidate in known_ids:
                    parent_id = candidate
                    break
            else:
                # No more parents to fetch
                return
        if parent_id in seen:
            # Cyclic or self-referential link: stop instead of looping forever.
            return
        seen.add(parent_id)
        # With duplicate ids the first matching row is used.
        current = df.loc[df['info_id'] == parent_id].iloc[0]
        parent_infos[f'parent_text_level_{level}'] = current['meta_text']
        parent_infos[f'parent_user_level_{level}'] = current['meta_author']
        level += 1


In [8]:
# Upper bound on the number of ancestor levels that get pre-created columns.
max_levels = 15
for level in range(max_levels):
    # One text column and one user column per level, created empty.
    for column in (f'parent_text_level_{level}', f'parent_user_level_{level}'):
        df[column] = None

# Walk each row's parent chain and copy the collected values into that
# row's cells, one column at a time.
for row_label, record in df.iterrows():
    parent_infos = {}
    fetch_parent_info(record, df, parent_infos)
    for column, cell in parent_infos.items():
        df.at[row_label, column] = cell

In [9]:
# Remove the 'parent_comments' and 'parent_comments_fetched' columns when
# they exist; labels that are absent are skipped without raising.
df.drop(
    columns=['parent_comments', 'parent_comments_fetched'],
    errors='ignore',
    inplace=True,
)


In [10]:
# Partition the rows by the value of the 'split' column.
df_train, df_dev, df_test = (
    df[df['split'] == split_name] for split_name in ('train', 'dev', 'test')
)

In [11]:
# Write the full frame first, then each split, all without the index column.
outputs = (
    ('../data/cad_v1_1_parents.csv', df),
    ("../data/train.csv", df_train),
    ("../data/dev.csv", df_dev),
    ("../data/test.csv", df_test),
)
for path, frame in outputs:
    frame.to_csv(path, index=False)