In [116]:
# Smoke test: confirms the kernel is alive before running the analysis
print("kernel_on")

kernel_on


# Exploratory Analysis of DialogRE
- **Objective**: Analyze dialogue relations data from JSON files
- **Process**: Convert data to DataFrame, flatten Relations column, and extract `relation_type`
- **Analysis**: Study distribution of `relation_type`, with a special focus on excluding `no_relation`

In [93]:
from src.paths import LOCAL_PROCESSED_DATA_PATH
from src.statistics import get_pct_value_counts
import pandas as pd
import json
from pathlib import Path
import glob

# Collect every JSON split in the directory (all splits are loaded, including 'dev').
# glob returns paths in arbitrary order, so sort them to keep the row order reproducible.
files = sorted(
    Path(f) for f in glob.glob(str(LOCAL_PROCESSED_DATA_PATH / "dialog-re-fixed-relations/*.json"))
)

# Build one DataFrame per file and concatenate once at the end
# (calling pd.concat inside the loop re-copies all accumulated rows on every iteration).
frames = []
for file_name in files:
    # JSON is UTF-8 by specification; do not rely on the platform default encoding
    with open(file_name, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Each record is read as a two-item row: (Dialogue, Relations)
    df_temp = pd.DataFrame(data, columns=["Dialogue", "Relations"])

    # Tag every row with the split it came from (file name without the extension)
    df_temp["Origin"] = file_name.stem

    frames.append(df_temp)

# pd.concat raises on an empty list, so fall back to an empty frame with the expected columns
if frames:
    df1 = pd.concat(frames, ignore_index=True)
else:
    df1 = pd.DataFrame(columns=["Dialogue", "Relations", "Origin"])
df1


Unnamed: 0,Dialogue,Relations,Origin
0,"[Speaker 1: Hey!, Speaker 2: Hey., Speaker 3: ...","[{'y': 'casting director', 'x': 'Ann', 'rid': ...",dev
1,"[Speaker 1, Speaker 2: Hi, Speaker 3: Hi! Hey ...","[{'y': 'Speaker 2', 'x': 'Speaker 1', 'rid': [...",dev
2,"[Speaker 1, Speaker 2: Hi!, Speaker 3: Hey!, S...","[{'y': 'man', 'x': 'Speaker 4', 'rid': [37], '...",dev
3,[Speaker 1: Wow! It looks like we got a lot of...,"[{'y': 'baby', 'x': 'Speaker 2', 'rid': [37], ...",dev
4,"[Speaker 1: Now, Mom, everything's going fine,...","[{'y': '26', 'x': 'Speaker 1', 'rid': [25], 'r...",dev
...,...,...,...
1783,"[Speaker 1: Nice camoflauge man, for a minute ...","[{'y': 'Speaker 1', 'x': 'Speaker 2', 'rid': [...",train
1784,"[Speaker 1: Well, I'm sure you'll teach her a ...","[{'y': 'Sir', 'x': 'Speaker 1', 'rid': [37], '...",train
1785,[Speaker 1: You know what? I can't even worry ...,"[{'y': 'baby', 'x': 'Speaker 1', 'rid': [37], ...",train
1786,"[Speaker 1: And cut. Hey, Butt Guy, what the h...","[{'y': 'Butt Guy', 'x': 'Speaker 2', 'rid': [3...",train


In [95]:
# Number and share of dialogues contributed by each source file
get_pct_value_counts(df1['Origin'])

Unnamed: 0_level_0,count,proportion
Origin,Unnamed: 1_level_1,Unnamed: 2_level_1
train,1073,0.600112
dev,358,0.200224
test,357,0.199664


In [98]:
# import necessary modules
import pandas as pd
from itertools import chain

# Flatten the Relations column of df1: one row per relation dict.
# explode() emits NaN for a dialogue whose relation list is empty; dropna() removes
# those placeholders so only real relation dicts remain.
relations = df1['Relations'].explode().dropna().reset_index(drop=True)

# Keep only the first entry of each relation's 'r' list as its relation_type.
# NOTE(review): a relation carrying several labels is counted once, under its
# first label — confirm this is the intended way to count.
df2 = pd.DataFrame({'relation_type': relations.map(lambda relation: relation['r'][0])})

# Count and proportion of each relation_type
relation_stats = get_pct_value_counts(df2['relation_type'])

relation_stats

Unnamed: 0_level_0,count,proportion
relation_type,Unnamed: 1_level_1,Unnamed: 2_level_1
no_relation,16489,0.62844
per:alternate_names,2136,0.081409
unanswerable,2099,0.079998
per:girl/boyfriend,736,0.028051
per:positive_impression,657,0.02504
per:friends,648,0.024697
per:title,414,0.015779
per:spouse,316,0.012044
per:siblings,303,0.011548
per:parents,270,0.01029


In [118]:
# Recompute the distribution over the rows whose label is anything but 'no_relation'
mask = df2['relation_type'].ne('no_relation')
relation_stats_original = get_pct_value_counts(df2.loc[mask, 'relation_type'])
relation_stats_original

Unnamed: 0_level_0,count,proportion
relation_type,Unnamed: 1_level_1,Unnamed: 2_level_1
per:alternate_names,2136,0.219099
unanswerable,2099,0.215304
per:girl/boyfriend,736,0.075495
per:positive_impression,657,0.067392
per:friends,648,0.066468
per:title,414,0.042466
per:spouse,316,0.032414
per:siblings,303,0.03108
per:parents,270,0.027695
per:children,266,0.027285


In [130]:
# Approximate the size of each sample as the length of its text rendering:
# str(Dialogue) + "," + str(Relations), i.e. the Python repr of both values,
# brackets and quotes included.
# Iterating the two columns directly avoids constructing a Series for every row,
# which is what DataFrame.apply(axis=1) does.
df1['char_count'] = [
    len(f"{dialogue},{relations}")
    for dialogue, relations in zip(df1['Dialogue'], df1['Relations'])
]

df1

Unnamed: 0,Dialogue,Relations,Origin,char_count
0,"[Speaker 1: Hey!, Speaker 2: Hey., Speaker 3: ...","[{'y': 'casting director', 'x': 'Ann', 'rid': ...",dev,9235
1,"[Speaker 1, Speaker 2: Hi, Speaker 3: Hi! Hey ...","[{'y': 'Speaker 2', 'x': 'Speaker 1', 'rid': [...",dev,2195
2,"[Speaker 1, Speaker 2: Hi!, Speaker 3: Hey!, S...","[{'y': 'man', 'x': 'Speaker 4', 'rid': [37], '...",dev,8808
3,[Speaker 1: Wow! It looks like we got a lot of...,"[{'y': 'baby', 'x': 'Speaker 2', 'rid': [37], ...",dev,1959
4,"[Speaker 1: Now, Mom, everything's going fine,...","[{'y': '26', 'x': 'Speaker 1', 'rid': [25], 'r...",dev,964
...,...,...,...,...
1783,"[Speaker 1: Nice camoflauge man, for a minute ...","[{'y': 'Speaker 1', 'x': 'Speaker 2', 'rid': [...",train,2660
1784,"[Speaker 1: Well, I'm sure you'll teach her a ...","[{'y': 'Sir', 'x': 'Speaker 1', 'rid': [37], '...",train,1984
1785,[Speaker 1: You know what? I can't even worry ...,"[{'y': 'baby', 'x': 'Speaker 1', 'rid': [37], ...",train,1437
1786,"[Speaker 1: And cut. Hey, Butt Guy, what the h...","[{'y': 'Butt Guy', 'x': 'Speaker 2', 'rid': [3...",train,655


In [None]:
# Box plot of the per-sample character counts
df1.plot.box(y='char_count')

In [139]:
# Summary statistics (count, mean, std, min, quartiles, max) of the character counts
stats = df1.loc[:, 'char_count'].describe()
stats

count     1788.000000
mean      2661.460291
std       2044.283050
min        211.000000
25%       1310.750000
50%       2109.500000
75%       3381.250000
max      18144.000000
Name: char_count, dtype: float64

In [166]:
# Back-of-the-envelope cost of synthesizing N_SAMPLES samples of average size.
N_SAMPLES = 1000               # number of samples to synthesize
CHARS_PER_TOKEN = 5            # rough characters-per-token heuristic used for this estimate
PRICE_PER_1K_TOKENS = 0.0015   # USD per 1,000 tokens for 4K context (ChatGPT)

# Average sample size in characters, converted to an approximate token count
mean_character_count = df1['char_count'].mean()
average_tokens = mean_character_count / CHARS_PER_TOKEN
total_tokens = average_tokens * N_SAMPLES

# Cost calculation
# The published rate is quoted per 1,000 tokens, so convert it to a per-token price;
# multiplying the token count by 0.0015 directly would overstate the cost 1000-fold.
cost_per_token = PRICE_PER_1K_TOKENS / 1000
estimated_cost = total_tokens * cost_per_token

# Display the estimated cost
print(f"The estimated cost for synthesizing {N_SAMPLES} samples of data is ${estimated_cost:.2f}")

The estimated cost for synthesizing 1000 samples of data is $0.00


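**Conclusion**: The relation types are heavily imbalanced. `no_relation` accounts for about 63% of all relations, and even after excluding it, `per:alternate_names` and `unanswerable` together make up about 43% of the remainder. This class imbalance is a problem that should be tackled.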