In [54]:
print('kernel_on')

kernel_on


# DialogRE Relation Types Breakdown According Kitwood:
- **Objective**: Analyze dialogue relations data and map them to Kitwood's Psychological Needs.

In [55]:
from src.paths import LOCAL_PROCESSED_DATA_PATH
from src.statistics import get_counts_and_percentages

from pathlib import Path
import pandas as pd
import json
import glob
import math

# Get a list of all json files in the directory, excluding 'dev'
files = [Path(f) for f in glob.glob(str(LOCAL_PROCESSED_DATA_PATH / "dialog-re-fixed-relations/*.json"))]
# files = [Path(f) for f in glob.glob(str(LOCAL_PROCESSED_DATA_PATH / "dialog-re-ternary/*.json"))]


# Create an empty DataFrame
df1 = pd.DataFrame(columns=["Dialogue", "Relations", "Origin"])

# Loop over all json files in the directory
for file_name in files:
    with open(file_name, 'r') as file:
        data = json.load(file)

        # Convert the data to a DataFrame
        df_temp = pd.DataFrame(data, columns=["Dialogue", "Relations"])

        # Add a new column to this DataFrame for the origin
        df_temp["Origin"] = file_name.stem  # This will get just the file name without the extension

        # Append the temporary DataFrame to the main DataFrame
        df1 = pd.concat([df1, df_temp], ignore_index=True)
df1

Unnamed: 0,Dialogue,Relations,Origin
0,"[Speaker 1: Hey!, Speaker 2: Hey., Speaker 3: ...","[{'y': 'casting director', 'x': 'Ann', 'rid': ...",dev
1,"[Speaker 1, Speaker 2: Hi, Speaker 3: Hi! Hey ...","[{'y': 'Speaker 2', 'x': 'Speaker 1', 'rid': [...",dev
2,"[Speaker 1, Speaker 2: Hi!, Speaker 3: Hey!, S...","[{'y': 'man', 'x': 'Speaker 4', 'rid': [37], '...",dev
3,[Speaker 1: Wow! It looks like we got a lot of...,"[{'y': 'baby', 'x': 'Speaker 2', 'rid': [37], ...",dev
4,"[Speaker 1: Now, Mom, everything's going fine,...","[{'y': '26', 'x': 'Speaker 1', 'rid': [25], 'r...",dev
...,...,...,...
1783,"[Speaker 1: Nice camoflauge man, for a minute ...","[{'y': 'Speaker 1', 'x': 'Speaker 2', 'rid': [...",train
1784,"[Speaker 1: Well, I'm sure you'll teach her a ...","[{'y': 'Sir', 'x': 'Speaker 1', 'rid': [37], '...",train
1785,[Speaker 1: You know what? I can't even worry ...,"[{'y': 'baby', 'x': 'Speaker 1', 'rid': [37], ...",train
1786,"[Speaker 1: And cut. Hey, Butt Guy, what the h...","[{'y': 'Butt Guy', 'x': 'Speaker 2', 'rid': [3...",train


In [56]:
get_counts_and_percentages(df1, ['Origin'])

Unnamed: 0_level_0,Counts,%
Origin,Unnamed: 1_level_1,Unnamed: 2_level_1
train,1073,60.0
dev,358,20.0
test,357,20.0


In [59]:
# flatten Relations column
relations = df1['Relations'].apply(pd.Series).stack().reset_index(drop=True)

# create new dataframe
df2 = pd.DataFrame({'Relations': relations})

# extract 'relation_type' from dictionary and add it as a new column in df2
df2['relation_type'] = df2['Relations'].apply(lambda x: x['r'][0])

# drop the 'Relations' column
df2 = df2.drop('Relations', axis=1)

# Now, we can count the amount of each 'relation_type'
relation_stats = get_counts_and_percentages(df2, ['relation_type'])

relation_stats.sum()

Counts    26238.0
%            99.9
dtype: float64

In [119]:
# define relation_type groups according to Kitwood's framework
group_dict = {
    'per:alternate_names': 'Identity',
    'per:girl/boyfriend': 'Attachment',
    'per:positive_impression': 'Comfort',
    'per:friends': 'Attachment',
    'per:title': 'Identity',
    'per:spouse': 'Attachment',
    'per:siblings': 'Attachment',
    'per:parents': 'Attachment',
    'per:children': 'Attachment',
    'per:negative_impression': 'Comfort',
    'per:roommate': 'Attachment',
    'per:alumni': 'Occupation',
    'per:other_family': 'Attachment',
    'gpe:visitors_of_place': 'Inclusion',
    'per:visited_place': 'Inclusion',
    'per:works': 'Occupation',
    'per:client': 'Attachment',
    'gpe:residents_of_place': 'Inclusion',
    'per:place_of_residence': 'Inclusion',
    'per:age': 'Identity',
    'per:boss': 'Occupation',
    'per:employee_or_member_of': 'Occupation',
    'org:employees_or_members': 'Occupation',
    'per:place_of_work': 'Occupation',
    'per:acquaintance': 'Attachment',
    'per:subordinate': 'Occupation',
    'per:neighbor': 'Inclusion',
    'per:pet': 'Attachment',
    'per:dates': 'Attachment',
    'per:origin': 'Identity',
    'per:schools_attended': 'Occupation',
    'org:students': 'Occupation',
    'per:major': 'Identity',
    'per:date_of_birth': 'Identity',
    'gpe:births_in_place': 'Others',
    'per:place_of_birth': 'Identity',
    'unanswerable': 'Others',
    'no_relation': 'Others'
}

# gets Kitwood's groups stats
mask = df2['relation_type'].apply(lambda x: x not in ['unanswerable','no_relation'])
df2['group'] = df2['relation_type'].map(group_dict)
breakdown = get_counts_and_percentages(df2[mask], ['group'])
breakdown

Unnamed: 0_level_0,Counts,%
group,Unnamed: 1_level_1,Unnamed: 2_level_1
Attachment,3088,40.4
Identity,2667,34.9
Comfort,879,11.5
Occupation,607,7.9
Inclusion,408,5.3
Others,1,0.0


In [124]:
# presents stats breakdown of Kitwood's groups  (excluding no relation)
breakdown = get_counts_and_percentages(df2[mask], ['group', 'relation_type']).sort_values('group')

for g in breakdown.index.get_level_values(0).unique():
    tmp = breakdown.loc[breakdown.index.get_level_values(0) == g]
    print(f"Group: {g} | Sample Count: {int(tmp.sum()['Counts']):,} ({tmp.sum()['%']:.1f}%)")
    display(tmp)

Group: Attachment | Sample Count: 3,088 (40.4%)


Unnamed: 0_level_0,Unnamed: 1_level_0,Counts,%
group,relation_type,Unnamed: 2_level_1,Unnamed: 3_level_1
Attachment,per:roommate,193,2.5
Attachment,per:pet,48,0.6
Attachment,per:client,87,1.1
Attachment,per:dates,35,0.5
Attachment,per:other_family,120,1.6
Attachment,per:children,266,3.5
Attachment,per:parents,270,3.5
Attachment,per:acquaintance,66,0.9
Attachment,per:spouse,316,4.1
Attachment,per:friends,648,8.5


Group: Comfort | Sample Count: 879 (11.5%)


Unnamed: 0_level_0,Unnamed: 1_level_0,Counts,%
group,relation_type,Unnamed: 2_level_1,Unnamed: 3_level_1
Comfort,per:negative_impression,222,2.9
Comfort,per:positive_impression,657,8.6


Group: Identity | Sample Count: 2,667 (34.8%)


Unnamed: 0_level_0,Unnamed: 1_level_0,Counts,%
group,relation_type,Unnamed: 2_level_1,Unnamed: 3_level_1
Identity,per:date_of_birth,6,0.1
Identity,per:title,414,5.4
Identity,per:major,6,0.1
Identity,per:origin,26,0.3
Identity,per:place_of_birth,1,0.0
Identity,per:age,78,1.0
Identity,per:alternate_names,2136,27.9


Group: Inclusion | Sample Count: 408 (5.3%)


Unnamed: 0_level_0,Unnamed: 1_level_0,Counts,%
group,relation_type,Unnamed: 2_level_1,Unnamed: 3_level_1
Inclusion,per:neighbor,56,0.7
Inclusion,per:place_of_residence,84,1.1
Inclusion,gpe:residents_of_place,84,1.1
Inclusion,gpe:visitors_of_place,92,1.2
Inclusion,per:visited_place,92,1.2


Group: Occupation | Sample Count: 607 (7.8%)


Unnamed: 0_level_0,Unnamed: 1_level_0,Counts,%
group,relation_type,Unnamed: 2_level_1,Unnamed: 3_level_1
Occupation,per:place_of_work,71,0.9
Occupation,org:employees_or_members,72,0.9
Occupation,per:subordinate,63,0.8
Occupation,per:boss,72,0.9
Occupation,per:works,89,1.2
Occupation,org:students,8,0.1
Occupation,per:schools_attended,8,0.1
Occupation,per:alumni,152,2.0
Occupation,per:employee_or_member_of,72,0.9


Group: Others | Sample Count: 1 (0.0%)


Unnamed: 0_level_0,Unnamed: 1_level_0,Counts,%
group,relation_type,Unnamed: 2_level_1,Unnamed: 3_level_1
Others,gpe:births_in_place,1,0.0


In [127]:
# presents stats breakdown of Kitwood's groups  (including no relation)
breakdown_no_relation = get_counts_and_percentages(df2, ['group', 'relation_type']).sort_values('group')

for g in breakdown_no_relation.index.get_level_values(0).unique():
    tmp = breakdown_no_relation.loc[breakdown_no_relation.index.get_level_values(0) == g]
    print(f"Group: {g} | Sample Count: {int(tmp.sum()['Counts']):,} ({tmp.sum()['%']:.1f}%)")
    display(tmp)

Group: Attachment | Sample Count: 3,088 (11.8%)


Unnamed: 0_level_0,Unnamed: 1_level_0,Counts,%
group,relation_type,Unnamed: 2_level_1,Unnamed: 3_level_1
Attachment,per:client,87,0.3
Attachment,per:pet,48,0.2
Attachment,per:other_family,120,0.5
Attachment,per:dates,35,0.1
Attachment,per:roommate,193,0.7
Attachment,per:children,266,1.0
Attachment,per:parents,270,1.0
Attachment,per:acquaintance,66,0.3
Attachment,per:siblings,303,1.2
Attachment,per:friends,648,2.5


Group: Comfort | Sample Count: 879 (3.3%)


Unnamed: 0_level_0,Unnamed: 1_level_0,Counts,%
group,relation_type,Unnamed: 2_level_1,Unnamed: 3_level_1
Comfort,per:negative_impression,222,0.8
Comfort,per:positive_impression,657,2.5


Group: Identity | Sample Count: 2,667 (10.1%)


Unnamed: 0_level_0,Unnamed: 1_level_0,Counts,%
group,relation_type,Unnamed: 2_level_1,Unnamed: 3_level_1
Identity,per:origin,26,0.1
Identity,per:major,6,0.0
Identity,per:date_of_birth,6,0.0
Identity,per:age,78,0.3
Identity,per:place_of_birth,1,0.0
Identity,per:alternate_names,2136,8.1
Identity,per:title,414,1.6


Group: Inclusion | Sample Count: 408 (1.6%)


Unnamed: 0_level_0,Unnamed: 1_level_0,Counts,%
group,relation_type,Unnamed: 2_level_1,Unnamed: 3_level_1
Inclusion,gpe:residents_of_place,84,0.3
Inclusion,gpe:visitors_of_place,92,0.4
Inclusion,per:visited_place,92,0.4
Inclusion,per:neighbor,56,0.2
Inclusion,per:place_of_residence,84,0.3


Group: Occupation | Sample Count: 607 (2.3%)


Unnamed: 0_level_0,Unnamed: 1_level_0,Counts,%
group,relation_type,Unnamed: 2_level_1,Unnamed: 3_level_1
Occupation,per:place_of_work,71,0.3
Occupation,per:boss,72,0.3
Occupation,per:subordinate,63,0.2
Occupation,per:employee_or_member_of,72,0.3
Occupation,per:alumni,152,0.6
Occupation,per:works,89,0.3
Occupation,org:students,8,0.0
Occupation,per:schools_attended,8,0.0
Occupation,org:employees_or_members,72,0.3


Group: Others | Sample Count: 18,589 (70.8%)


Unnamed: 0_level_0,Unnamed: 1_level_0,Counts,%
group,relation_type,Unnamed: 2_level_1,Unnamed: 3_level_1
Others,gpe:births_in_place,1,0.0
Others,unanswerable,2099,8.0
Others,no_relation,16489,62.8


`Conclusion`: Relation types with insufficient examples can be addressed by either synthesizing additional data or consolidating them into a new "other" relation category.