In [None]:
import os, json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, Markdown, Latex

In [None]:
## The dataset used comes from https://github.com/cricketclub/gridspace-stanford-harper-valley
# first download the dataset:

!git clone https://github.com/cricketclub/gridspace-stanford-harper-valley.git

In [None]:
# Root of the cloned dataset; each kind of file sits in its own sub-directory.
main_path = "gridspace-stanford-harper-valley/data"
audio_path, transcript_path, metadata_path = (
    os.path.join(main_path, subdir) for subdir in ("audio", "transcript", "metadata")
)

# Per-file records, keyed by file name; populated further down.
data_by_audio = dict()

# Analysis

## Get emotion statistics

In [None]:
### Transcription data - Emotions
speaker_roles = ["agent", "caller"]
emotions = ['positive', 'neutral', 'negative']

# A speaker "ends positively" when, on their LAST utterance, the positive score
# is above the first threshold and the negative score is below the second one.
positive_end_min_positive = 0.45
positive_end_max_negative = 0.25
positive_idx = emotions.index('positive')
negative_idx = emotions.index('negative')

# sorted(): os.listdir returns names in arbitrary order; a fixed order keeps the
# row order of everything built from data_by_audio stable between runs.
for md_file in sorted(os.listdir(transcript_path)):
    file_data = {}
    file_data['transcript_path'] = os.path.join(transcript_path, md_file)
    with open(file_data['transcript_path'], 'r', encoding='utf-8') as f:
        data = json.load(f)

    # One [positive, neutral, negative] row per utterance, grouped by speaker.
    emotion_categorisation_by_speaker = {role: [] for role in speaker_roles}
    for dd in data:
        emotion = dd['emotion']
        emotion_categorisation_by_speaker[dd['speaker_role']].append([emotion[emot] for emot in emotions])

    for role in speaker_roles:
        role_emotion = np.array(emotion_categorisation_by_speaker[role])

        if role_emotion.size == 0:
            # This speaker has no utterance in the file: the reductions below would
            # raise on an empty array, so record "no data" instead of aborting.
            for emot in emotions:
                for stat_name in ('min', 'max', 'mean'):
                    file_data[f'{role}_{emot}_{stat_name}'] = np.nan
            file_data[f'{role}_positive_end'] = 0
            continue

        # Reduce over utterances once per statistic instead of once per emotion.
        stats = {
            'min': role_emotion.min(axis=0),
            'max': role_emotion.max(axis=0),
            'mean': role_emotion.mean(axis=0),
        }
        for idx, emot in enumerate(emotions):
            for stat_name, values in stats.items():
                file_data[f'{role}_{emot}_{stat_name}'] = values[idx]

        ## positive ending ? positive>0.45 and negative<0.25 on the last utterance
        last_utterance = role_emotion[-1]
        ends_positive = (last_utterance[positive_idx] > positive_end_min_positive
                         and last_utterance[negative_idx] < positive_end_max_negative)
        file_data[f'{role}_positive_end'] = int(ends_positive)
    data_by_audio[md_file] = file_data

## Get metadata information: task for the agent

In [None]:
## ADD Metadata info to data

for md_file in os.listdir(metadata_path):
    md_full_path = os.path.join(metadata_path, md_file)
    with open(md_full_path, 'r') as f:
        data = json.load(f)

    # Only the first task type is stored: per the original author's check, no
    # metadata file lists more than one task (re-verify if the dataset changes).
    task_types = [tsk["task_type"] for tsk in data["tasks"]]

    # The record must already exist, i.e. it was created from the transcript
    # file carrying the same name.
    record = data_by_audio[md_file]
    record["metadata_path"] = md_full_path
    record["task_type"] = task_types[0]
    for itm in ("end_time_ms", "sid", "start_time_ms"):
        record[itm] = data[itm]

## Save into a DataFrame (to easily get statistics and descriptions)

In [None]:
# Show one complete record as a sanity check. The key is taken from the dict
# itself rather than from a loop variable left over by an earlier cell, and the
# fence is opened on its own line and closed so the JSON renders as a code block.
sample_key = next(iter(data_by_audio))
display(Markdown(f"```json\n{json.dumps(data_by_audio[sample_key], indent=4)}\n```"))

In [None]:
# One row per entry of data_by_audio: orient='index' turns the outer keys
# (file names) into the row index and the inner keys into columns.
df = pd.DataFrame.from_dict(data_by_audio, orient='index')
# Peek at the first two rows only.
df.head(2)

In [None]:
# Count the calls whose caller ends positively and report their share.
print('Number of positive endings:')
pos_ends = np.sum(df['caller_positive_end'])
# The ratio is scaled by 100 so the number matches the "%" unit printed after it.
print(f"{pos_ends}, {100 * pos_ends / len(df):.2f} %")

## Analyse audio files statistics

In [None]:
# Summary statistics of the collected columns (pandas' defaults decide which
# dtypes are included).
df.describe()

In [None]:
def draw_plot(data, cols, plot_filter, figsize=(6,3), show_boxplot=False):
    """Overlay one histogram per column of ``data`` on a single figure.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to plot.
    cols : sequence of str
        Names of the columns of ``data`` to draw; each one gets its own colour.
    plot_filter : str
        Label inserted in the figure titles.
    figsize : tuple, optional
        Figure size forwarded to ``plt.subplots``.
    show_boxplot : bool, optional
        When True, a box-plot panel (one box per column) is drawn above the
        histograms. Defaults to False, i.e. histograms only.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Colors (one per column)
    colors = plt.cm.tab20(np.linspace(0, 1, len(cols)))

    if show_boxplot:
        fig, (ax_box, ax_hist) = plt.subplots(2, 1, figsize=figsize, sharex=False)

        # -------------------------
        # 1. BOXPLOT SUBPLOT
        # -------------------------
        # NaNs are dropped per column so that a missing value in one column
        # does not discard the whole row for the other columns.
        bp = ax_box.boxplot([data[col].dropna().to_numpy() for col in cols],
                            patch_artist=True)

        for patch, color in zip(bp['boxes'], colors):
            patch.set_facecolor(color)

        ax_box.set_title(f"Boxplots of {plot_filter} Columns")
        ax_box.set_xticks(range(1, len(cols) + 1))
        ax_box.set_xticklabels(cols, rotation=45, ha='right')

        # Legend for boxplots
        legend_patches = [plt.Rectangle((0,0),1,1, color=c) for c in colors]
        ax_box.legend(legend_patches, cols, title="Columns", loc='upper right')

    else:
        fig, ax_hist = plt.subplots(figsize=figsize)

    # -------------------------
    # 2. HISTOGRAM SUBPLOT
    # -------------------------
    for col, color in zip(cols, colors):
        # Read from the `data` argument so the function plots what it is given
        # instead of depending on a notebook-level variable.
        ax_hist.hist(data[col].dropna(), bins=30, alpha=0.5,
                     label=col, color=color)

    ax_hist.set_title(f"Histograms of {plot_filter} Columns")
    ax_hist.set_xlabel("Value")
    ax_hist.set_ylabel("Frequency")
    ax_hist.legend(title="Columns", loc='upper right')

    plt.tight_layout()
    plt.show()

In [None]:
# One figure per (speaker, emotion) pair. Columns are picked by substring match,
# so the "<speaker>_positive_end" flag is drawn together with the positive scores.
speaker_emotion_pairs = [
    (spk, emo) for spk in ("agent", "caller") for emo in ("positive", "negative")
]
for speaker, emotion in speaker_emotion_pairs:
    plot_filter = f"{speaker}_{emotion}"
    cols = [name for name in df.columns if plot_filter in name]
    display(Markdown(f"# {plot_filter}"))
    draw_plot(df[cols], cols, plot_filter)


In [None]:
## plot histogram of the tasks:
# Same column-selection recipe as for the emotion plots, on a wider figure.
plot_filter = "task_type"
cols = list(filter(lambda name: plot_filter in name, df.columns))
display(Markdown(f"# {plot_filter}"))
draw_plot(df[cols], cols, plot_filter, figsize=(12,4))

## Pick representative samples
To pick representative samples, each call is assigned a code built by concatenating the following parts:
- task_type: each task type is mapped to a number
- caller overall sentiment: 0: negative, 1: positive, 2: neutral
- caller positive sentiment at the end of the call: 0: no, 1: yes

In [None]:
# Give every task type a numeric id. The types are sorted first so the ids do
# not depend on the order of the rows, which follows the arbitrary order in
# which the files were listed. key=str keeps the sort total even if a missing
# value sits among the names.
task_types_ids = {tsk: idx for idx, tsk in enumerate(sorted(df.task_type.unique(), key=str))}

def get_code(row):
    """Build the grouping code of one call.

    The code is the concatenation of three parts:
      1. the numeric id of the call's task type (looked up in ``task_types_ids``),
      2. a caller sentiment digit: "0" negative, "1" positive, "2" neutral,
      3. the ``caller_positive_end`` flag (0 or 1).

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``task_type``, ``caller_negative_mean``,
        ``caller_negative_max``, ``caller_positive_mean`` and
        ``caller_positive_end``.

    Returns
    -------
    str
        The concatenated code.

    Raises
    ------
    KeyError
        If the task type is not present in ``task_types_ids``.
    """
    ## task_type code
    task_part = str(task_types_ids[row['task_type']])

    ## caller sentiment code
    ng_mean = row['caller_negative_mean']
    ng_max = row['caller_negative_max']
    pos_mean = row['caller_positive_mean']
    if ng_mean > 0.32 and ng_max > 0.25:
        sentiment_part = "0"  ## negative
    elif pos_mean > 0.35:
        sentiment_part = "1"  ## positive
    else:
        sentiment_part = "2"  ## neutral

    ## caller positive sentiment at the end of the call:
    # int() first: a flag that arrives as a float (1.0) would otherwise be
    # rendered as "1.0" and break the one-digit layout of this part.
    end_part = str(int(row['caller_positive_end']))
    return task_part + sentiment_part + end_part

# Compute the code of every row (axis=1 passes one row at a time to get_code)
# and store it in a new "code" column.
df["code"] = df.apply(get_code, axis=1)
# Peek at the first two rows to check the new column.
df.head(2)

In [None]:
# Distinct codes together with the number of rows carrying each of them.
np.unique(df.code, return_counts=True)

In [None]:
num_of_representative_samples = 10
# Fixed at one row per code. Alternative left by the author:
#   len(df['code'].unique()) // num_of_representative_samples
samples_per_code_type = 1
print(samples_per_code_type)

## Delete problematic files (agent and caller are swapped, and may be incorrect):
list_discarded = ['0cae8d624dc04b47', 'd73be060c5994ae3', '50f57d300f104fa3']
print('data before discard: ', len(df))
keep_mask = ~df["sid"].isin(list_discarded)
df = df.loc[keep_mask]
print('data after discard: ', len(df))

# Draw the same number of rows from every code group, with a fixed seed.
# (Earlier variant sampled with frac=num_of_representative_samples instead of n=...)
per_code_groups = df.groupby("code", group_keys=False)
selected_samples = per_code_groups.sample(n=samples_per_code_type, random_state=42, replace=True)
selected_samples.to_csv('selected_samples.csv')

## select only negative and positive (disabled alternative):
##   restrict df to code in ["00", "11"] before sampling and write 'selected_samples_2.csv'

# Size of the selection next to the number of distinct codes left in df.
len(selected_samples), len(df["code"].unique())


In [None]:
# Show only the identifying columns of the selected rows.
selected_samples[['sid', 'code', 'task_type']]