In [1]:
print('kernel_on')

kernel_on


# Exploratory Analysis on DialogRE:
- **Objective**: Analyze dialogue relations data from JSON files & estimate data synthesization costs
- **Process**: Convert data to DataFrame, flatten Relations column, and extract `relation_type`
- **Analysis**: Study distribution of `relation_type`, with a special focus on excluding `no_relation`

In [8]:
from src.paths import LOCAL_PROCESSED_DATA_PATH
from src.statistics import get_counts_and_percentages

from pathlib import Path
import pandas as pd
import json
import glob
import math

# Get a list of all json files in the directory, excluding 'dev'
for folder in ("with-no-relation", "ternary", "binary"):
    files = [Path(f) for f in glob.glob(str(LOCAL_PROCESSED_DATA_PATH / f"dialog-re-{folder}/*.json"))]
    
    # Create an empty DataFrame
    df1 = pd.DataFrame(columns=["Dialogue", "Relations", "Origin"])
    
    # Loop over all json files in the directory
    for file_name in files:
        with open(file_name, 'r') as file:
            data = json.load(file)
    
            # Convert the data to a DataFrame
            df_temp = pd.DataFrame(data, columns=["Dialogue", "Relations"])
    
            # Add a new column to this DataFrame for the origin
            df_temp["Origin"] = file_name.stem  # This will get just the file name without the extension
    
            # Append the temporary DataFrame to the main DataFrame
            df1 = pd.concat([df1, df_temp], ignore_index=True)
    
    # flatten Relations column
    relations = df1['Relations'].apply(pd.Series).stack().reset_index(drop=True)
    
    # create new dataframe
    df2 = pd.DataFrame({'Relations': relations})
    
    # extract 'relation_type' from dictionary and add it as a new column in df2
    df2['relation_type'] = df2['Relations'].apply(lambda x: x['r'][0])
    
    # drop the 'Relations' column
    df2 = df2.drop('Relations', axis=1)
    
    # Now, we can count the amount of each 'relation_type'
    relation_stats = get_counts_and_percentages(df2, ['relation_type'])
    
    relation_stats

Unnamed: 0_level_0,Counts,%
relation_type,Unnamed: 1_level_1,Unnamed: 2_level_1
no_relation,16489,62.8
with_relation,9749,37.2


Unnamed: 0_level_0,Counts,%
relation_type,Unnamed: 1_level_1,Unnamed: 2_level_1
no_relation,16489,62.8
with_relation,9749,37.2
