# Compare Step by Step data sets

Compare resurrected items across the data sets generated with k=5, k=10, and k=20 samples for GPT-3.5-turbo and with k=5 samples for GPT-4o.

## Load sbs data

In [2]:
import csv

from utils import group_entropies_by_dialogue_id, has_resurrected_items, group_sbs_data_by_dialogue_id

In [3]:
five_samples_data_path = "./data/generation/8_mcrae/sbs_entropy_k_five.csv"
ten_samples_data_path = "./data/generation/8_mcrae/sbs_entropy_k_ten.csv"
twenty_samples_data_path = "./data/generation/8_mcrae/sbs_entropy_k_twenty.csv"
five_gpt4o_samples_data_path = "./data/generation/8_mcrae/sbs_entropy_k_five_gpt4o.csv"


def _load_entropies(csv_path):
  """Read one entropy CSV and return its entropies grouped by dialogue id.

  The file is opened in a `with` block so the handle is released even when
  parsing or grouping raises.

  Args:
    csv_path: path of a comma-delimited CSV file with a header row.

  Returns:
    The first element of the pair returned by `group_entropies_by_dialogue_id`;
    the second element is not used in this notebook and is discarded.
  """
  with open(csv_path, 'r', newline='') as csv_file:
    reader = csv.DictReader(csv_file, delimiter=",")
    # Grouping must finish while the file is still open: the reader is lazy.
    entropies, _ = group_entropies_by_dialogue_id(reader)
  return entropies


five_entropies = _load_entropies(five_samples_data_path)
ten_entropies = _load_entropies(ten_samples_data_path)
twenty_entropies = _load_entropies(twenty_samples_data_path)
five_gpt4o_entropies = _load_entropies(five_gpt4o_samples_data_path)

## Count dialogues with resurrected items

In [5]:
def _count_dialogues_with_resurrection(entropies_by_dialogue):
  """Count the entries for which `has_resurrected_items` returns a truthy value."""
  return sum(
    1
    for key in entropies_by_dialogue
    if has_resurrected_items(entropies_by_dialogue[key])
  )


# One counter per data set, in the same order in which they are printed below.
five_samples_resurrected_items_count = _count_dialogues_with_resurrection(five_entropies)
ten_samples_resurrected_items_count = _count_dialogues_with_resurrection(ten_entropies)
twenty_samples_resurrected_items_count = _count_dialogues_with_resurrection(twenty_entropies)
five_gpt4o_samples_resurrected_items_count = _count_dialogues_with_resurrection(five_gpt4o_entropies)

print(
  five_samples_resurrected_items_count,
  ten_samples_resurrected_items_count,
  twenty_samples_resurrected_items_count,
  five_gpt4o_samples_resurrected_items_count,
)

78 77 82 22


## Compute resurrected items percentage

In [6]:
# (label, number of dialogues with resurrected items, grouped entropies) per data set.
# The "percentage" value is the count divided by the number of grouped dialogues,
# i.e. a fraction in [0, 1] rather than a value out of 100.
_summary_rows = (
  ("five_samples", five_samples_resurrected_items_count, five_entropies),
  ("ten_samples", ten_samples_resurrected_items_count, ten_entropies),
  ("twenty_samples", twenty_samples_resurrected_items_count, twenty_entropies),
  ("five_samples_gpt4o", five_gpt4o_samples_resurrected_items_count, five_gpt4o_entropies),
)

output = {
  label: {
    "resurrected_items_count": resurrected_count,
    "percentage": resurrected_count / len(entropies),
  }
  for label, resurrected_count, entropies in _summary_rows
}

print(output)

{'five_samples': {'resurrected_items_count': 78, 'percentage': 0.8863636363636364}, 'ten_samples': {'resurrected_items_count': 77, 'percentage': 0.875}, 'twenty_samples': {'resurrected_items_count': 82, 'percentage': 0.9318181818181818}, 'five_samples_gpt4o': {'resurrected_items_count': 22, 'percentage': 0.25}}


## Compute probability of resurrected items

### Grouping sbs data by dialogue id

In [7]:
five_samples_sbs_data_path = "./data/generation/8_mcrae/dialogues_sbs_k_five_distr_ver2.csv"
ten_samples_sbs_data_path = "./data/generation/8_mcrae/dialogues_sbs_k_ten_distr_ver2.csv"
twenty_samples_sbs_data_path = "./data/generation/8_mcrae/dialogues_sbs_k_twenty_distr.csv"
five_gpt4o_samples_sbs_data_path = "./data/generation/8_mcrae/dialogues_sbs_k_five_gpt4o.csv"


def _load_sbs_data(csv_path):
  """Read one step-by-step CSV and return its rows grouped by dialogue id.

  The original cell opened four files and never closed them; the `with`
  block here releases each handle as soon as grouping is done.

  Args:
    csv_path: path of a comma-delimited CSV file with a header row.

  Returns:
    Whatever `group_sbs_data_by_dialogue_id` returns for the file's rows.
  """
  with open(csv_path, 'r', newline='') as csv_file:
    reader = csv.DictReader(csv_file, delimiter=",")
    # Grouping must finish while the file is still open: the reader is lazy.
    return group_sbs_data_by_dialogue_id(reader)


# Group the rows of each data set by dialogue id.
five_data = _load_sbs_data(five_samples_sbs_data_path)
ten_data = _load_sbs_data(ten_samples_sbs_data_path)
twenty_data = _load_sbs_data(twenty_samples_sbs_data_path)
five_gpt4o_data = _load_sbs_data(five_gpt4o_samples_sbs_data_path)
print(five_gpt4o_data)

[{'dialogue_id': 0, 'intra_dialogues': [{'intra_dialogue_id': 0, 'p_distribuition': {'elk': 0.25, 'chicken': 0.0, 'robin': 0.0, 'starling': 0.0, 'fox': 0.25, 'partridge': 0.0, 'hamster': 0.25, 'buffalo': 0.25}}, {'intra_dialogue_id': 1, 'p_distribuition': {'elk': 0.25, 'chicken': 0.0, 'robin': 0.0, 'starling': 0.0, 'fox': 0.25, 'partridge': 0.0, 'hamster': 0.25, 'buffalo': 0.25}}, {'intra_dialogue_id': 2, 'p_distribuition': {'elk': 0.3333, 'chicken': 0.0, 'robin': 0.0, 'starling': 0.0, 'fox': 0.3333, 'partridge': 0.0, 'hamster': 0.0, 'buffalo': 0.3333}}, {'intra_dialogue_id': 3, 'p_distribuition': {'elk': 0.0, 'chicken': 0.0, 'robin': 0.0, 'starling': 0.0, 'fox': 1.0, 'partridge': 0.0, 'hamster': 0.0, 'buffalo': 0.0}}, {'intra_dialogue_id': 4, 'p_distribuition': {'elk': 0.0, 'chicken': 0.0, 'robin': 0.0, 'starling': 0.0, 'fox': 1.0, 'partridge': 0.0, 'hamster': 0.0, 'buffalo': 0.0}}, {'intra_dialogue_id': 5, 'p_distribuition': {'elk': 0.0, 'chicken': 0.0, 'robin': 0.0, 'starling': 0.0,

### For each dialogue step, find resurrected items and compute the sum of their probabilities

In [8]:
def compute_resurrected_items_p_sum(grouped_data):
  """Annotate every dialogue step with its resurrected items and their probability mass.

  An item counts as resurrected at a step when its probability was exactly 0
  at the previous step of the same dialogue and is greater than 0 at the
  current step. The first step of a dialogue has no predecessor, so it always
  gets an empty list and a probability of 0.

  Args:
    grouped_data: list of dialogue dicts. Each dialogue has a "dialogue_id"
      and an "intra_dialogues" list of steps; each step has a
      "p_distribuition" dict mapping item -> probability. Every item of a
      step must also be present in the previous step's distribution.

  Returns:
    A pair `(grouped_data, resurrected_p_average)`:
      * `grouped_data` is the SAME list object that was passed in; each step
        has been given the keys "resurrected_items" and "resurrected_items_p"
        in place.
      * `resurrected_p_average` is the mean resurrected probability mass over
        all non-first steps of all dialogues, or 0.0 when there is no such
        step (empty input, or only single-step dialogues).

  Raises:
    KeyError: if a step lacks a required key, or an item of a step is missing
      from the previous step's distribution.
  """
  p_summatory = 0
  counter = 0
  for dialogue in grouped_data:
    print("DIALOGUE: , ", dialogue["dialogue_id"])
    intra_dialogues = dialogue["intra_dialogues"]
    for i, step in enumerate(intra_dialogues):
      resurrected_items = []
      resurrected_p = 0

      if i != 0:
        current_distr = step["p_distribuition"]
        previous_distr = intra_dialogues[i - 1]["p_distribuition"]

        # Previously at zero probability and now strictly positive.
        resurrected_items = [
          item
          for item in current_distr
          if previous_distr[item] == 0 and current_distr[item] > 0
        ]
        resurrected_p = sum(current_distr[item] for item in resurrected_items)

        p_summatory += resurrected_p
        counter += 1

      # Annotate in place: callers keep using the list they passed in.
      step["resurrected_items"] = resurrected_items
      step["resurrected_items_p"] = resurrected_p

  # No step had a predecessor, so there is nothing to average.
  if counter == 0:
    return grouped_data, 0.0

  resurrected_p_average = p_summatory / counter

  return grouped_data, resurrected_p_average

# Each call annotates the list it receives in place and returns that same list,
# so every *_analyzed_data name refers to the same object as its *_data input.
five_analyzed_data, five_r_p_average = compute_resurrected_items_p_sum(five_data)
ten_analyzed_data, ten_r_p_average = compute_resurrected_items_p_sum(ten_data)
twenty_analyzed_data, twenty_r_p_average = compute_resurrected_items_p_sum(twenty_data)
five_gpt4o_analyzed_data, five_gpt4o_r_p_average = compute_resurrected_items_p_sum(five_gpt4o_data)

# Average resurrected probability mass per data set, then the annotated GPT-4o data.
_r_p_averages = (five_r_p_average, ten_r_p_average, twenty_r_p_average, five_gpt4o_r_p_average)
print(*_r_p_averages)
print(five_gpt4o_analyzed_data)

DIALOGUE: ,  0
DIALOGUE: ,  1
DIALOGUE: ,  2
DIALOGUE: ,  3
DIALOGUE: ,  4
DIALOGUE: ,  5
DIALOGUE: ,  6
DIALOGUE: ,  7
DIALOGUE: ,  8
DIALOGUE: ,  9
DIALOGUE: ,  10
DIALOGUE: ,  11
DIALOGUE: ,  12
DIALOGUE: ,  13
DIALOGUE: ,  14
DIALOGUE: ,  15
DIALOGUE: ,  16
DIALOGUE: ,  17
DIALOGUE: ,  18
DIALOGUE: ,  19
DIALOGUE: ,  20
DIALOGUE: ,  21
DIALOGUE: ,  22
DIALOGUE: ,  23
DIALOGUE: ,  24
DIALOGUE: ,  25
DIALOGUE: ,  26
DIALOGUE: ,  27
DIALOGUE: ,  28
DIALOGUE: ,  29
DIALOGUE: ,  30
DIALOGUE: ,  31
DIALOGUE: ,  32
DIALOGUE: ,  33
DIALOGUE: ,  34
DIALOGUE: ,  35
DIALOGUE: ,  36
DIALOGUE: ,  37
DIALOGUE: ,  38
DIALOGUE: ,  39
DIALOGUE: ,  40
DIALOGUE: ,  41
DIALOGUE: ,  42
DIALOGUE: ,  43
DIALOGUE: ,  44
DIALOGUE: ,  45
DIALOGUE: ,  46
DIALOGUE: ,  47
DIALOGUE: ,  48
DIALOGUE: ,  49
DIALOGUE: ,  50
DIALOGUE: ,  51
DIALOGUE: ,  52
DIALOGUE: ,  53
DIALOGUE: ,  54
DIALOGUE: ,  55
DIALOGUE: ,  56
DIALOGUE: ,  57
DIALOGUE: ,  58
DIALOGUE: ,  59
DIALOGUE: ,  60
DIALOGUE: ,  61
DIALOGUE: ,  62
DI

In [9]:
def dump(path, grouped_data):
  """Write one CSV row per dialogue step, with resurrected items factored out.

  Each row holds, in order: the dialogue id, the step's "intra_dialogue_id",
  the step's probability distribution with the resurrected items removed and
  a "resurrected_items_p" entry appended, the list of resurrected items, and
  their summed probability. Dicts and lists are written as their `str()` form.

  The distribution is rebuilt as a new dict for every row, so `grouped_data`
  is left untouched and the function can be called again on the same data.
  NOTE(review): the previous version edited each step's "p_distribuition" in
  place; confirm nothing downstream relied on that side effect.

  Args:
    path: destination file; it is created or truncated.
    grouped_data: list of dialogue dicts whose steps already carry the
      "resurrected_items" and "resurrected_items_p" keys.

  Raises:
    KeyError: if a dialogue or step lacks one of the keys read here.
    OSError: if the file cannot be opened for writing.
  """
  # newline="" lets the csv module control line endings, as its documentation requires.
  with open(path, "w", newline="") as f:
    writer = csv.writer(f)
    for dialogue in grouped_data:
      dialogue_id = dialogue["dialogue_id"]
      for step in dialogue["intra_dialogues"]:
        resurrected_items = step["resurrected_items"]
        resurrected_p = step["resurrected_items_p"]
        excluded = set(resurrected_items)

        # Copy rather than pop, keeping the order of the remaining items.
        distribution = {
          item: p
          for item, p in step["p_distribuition"].items()
          if item not in excluded
        }
        distribution["resurrected_items_p"] = resurrected_p

        writer.writerow([
          dialogue_id,
          step["intra_dialogue_id"],
          distribution,
          resurrected_items,
          resurrected_p
        ])
        
# Write the annotated data of every data set next to its source CSV.
# The GPT-4o call now passes the *_analyzed_data name like the other three;
# it is the list returned by compute_resurrected_items_p_sum for five_gpt4o_data.
dump("./data/generation/8_mcrae/dialogues_sbs_k_five_distr_w_resurr_p.csv", five_analyzed_data)
dump("./data/generation/8_mcrae/dialogues_sbs_k_ten_distr_w_resurr_p.csv", ten_analyzed_data)
dump("./data/generation/8_mcrae/dialogues_sbs_k_twenty_distr_w_resurr_p.csv", twenty_analyzed_data)
dump("./data/generation/8_mcrae/dialogues_sbs_k_five_gpt4o_distr_w_resurr_p.csv", five_gpt4o_analyzed_data)