# Compare Step by Step data sets

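Compare resurrected items between the data sets generated with k=5 and k=10 samples. In the probability analysis below, an item counts as "resurrected" at a dialogue step when its probability was 0 at the previous step and is greater than 0 at the current step.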

## Load sbs data

In [1]:
import csv

from utils import group_entropies_by_dialogue_id, has_resurrected_items, group_sbs_data_by_dialogue_id

In [2]:
five_samples_data_path = "./data/generation/8_mcrae/sbs_entropy_k_five.csv"
ten_samples_data_path = "./data/generation/8_mcrae/sbs_entropy_k_ten.csv"

# Open both entropy files in a single `with` statement so that they are closed
# even if parsing or grouping raises. newline='' is the mode the csv module
# asks for when it is handed a file object.
with open(five_samples_data_path, 'r', newline='') as five_rf, \
        open(ten_samples_data_path, 'r', newline='') as ten_rf:
    five_reader = csv.DictReader(five_rf, delimiter=",")
    ten_reader = csv.DictReader(ten_rf, delimiter=",")

    # The helper returns a pair; only its first element (entropies grouped by
    # dialogue id) is used in this notebook, the second one is discarded.
    five_entropies, _ = group_entropies_by_dialogue_id(five_reader)
    ten_entropies, _ = group_entropies_by_dialogue_id(ten_reader)


## Count dialogues with resurrected items

In [3]:
# Count, for each data set, the dialogues for which has_resurrected_items()
# returns a truthy value when given that dialogue's grouped entropies.
five_samples_resurrected_items_count = sum(
    1
    for key in five_entropies
    if has_resurrected_items(five_entropies[key])
)
ten_samples_resurrected_items_count = sum(
    1
    for key in ten_entropies
    if has_resurrected_items(ten_entropies[key])
)

print(five_samples_resurrected_items_count, ten_samples_resurrected_items_count)

46 44


## Compute resurrected items percentage

In [4]:
# Summary per data set: the number of dialogues with resurrected items and
# that number divided by the number of dialogues in the data set.
# NOTE: "percentage" holds a fraction (count / total), not a value scaled by 100.
output = {
    label: {
        "resurrected_items_count": count,
        "percentage": count / len(entropies),
    }
    for label, count, entropies in (
        ("five_samples", five_samples_resurrected_items_count, five_entropies),
        ("ten_samples", ten_samples_resurrected_items_count, ten_entropies),
    )
}

print(output)

{'five_samples': {'resurrected_items_count': 46, 'percentage': 0.9019607843137255}, 'ten_samples': {'resurrected_items_count': 44, 'percentage': 0.9777777777777777}}


## Compute probability of resurrected items

### Grouping sbs data by dialogue id

In [5]:
five_samples_sbs_data_path = "./data/generation/8_mcrae/dialogues_sbs_k_five_distr.csv"
ten_samples_sbs_data_path = "./data/generation/8_mcrae/dialogues_sbs_k_ten_distr.csv"

# Group the rows of each file by dialogue id. Each file is opened in a `with`
# block so the handle is released once grouping is done (or if it raises);
# previously both handles were left open for the rest of the session.
# NOTE(review): this assumes group_sbs_data_by_dialogue_id consumes the reader
# fully before returning (the printed result is a materialized list) - confirm.
with open(five_samples_sbs_data_path, 'r', newline='') as five_rf:
    five_reader = csv.DictReader(five_rf, delimiter=",")
    five_data = group_sbs_data_by_dialogue_id(five_reader)

with open(ten_samples_sbs_data_path, 'r', newline='') as ten_rf:
    ten_reader = csv.DictReader(ten_rf, delimiter=",")
    ten_data = group_sbs_data_by_dialogue_id(ten_reader)

print(ten_data)

[{'dialogue_id': 0, 'intra_dialogues': [{'intra_dialogue_id': 0, 'p_distribuition': {'elk': 0.25, 'chicken': 0.0, 'robin': 0.0, 'starling': 0.0, 'fox': 0.25, 'partridge': 0.0, 'hamster': 0.25, 'buffalo': 0.25}}, {'intra_dialogue_id': 1, 'p_distribuition': {'elk': 0.2273, 'chicken': 0.0909, 'robin': 0.0, 'starling': 0.0, 'fox': 0.2273, 'partridge': 0.0, 'hamster': 0.2273, 'buffalo': 0.2273}}, {'intra_dialogue_id': 2, 'p_distribuition': {'elk': 0.2703, 'chicken': 0.0811, 'robin': 0.0, 'starling': 0.0, 'fox': 0.2703, 'partridge': 0.0, 'hamster': 0.1081, 'buffalo': 0.2703}}, {'intra_dialogue_id': 3, 'p_distribuition': {'elk': 0.0435, 'chicken': 0.0435, 'robin': 0.0, 'starling': 0.087, 'fox': 0.4348, 'partridge': 0.0435, 'hamster': 0.3043, 'buffalo': 0.0435}}, {'intra_dialogue_id': 4, 'p_distribuition': {'elk': 0.3226, 'chicken': 0.0, 'robin': 0.0, 'starling': 0.0323, 'fox': 0.3226, 'partridge': 0.0, 'hamster': 0.2903, 'buffalo': 0.0323}}, {'intra_dialogue_id': 5, 'p_distribuition': {'elk':

### For each dialogue step, find resurrected items and compute the sum of their probabilities

In [6]:
def compute_resurrected_items_p_sum(grouped_data):
    """Annotate every dialogue step with its resurrected items and their mass.

    An item is resurrected at a step when its probability in the previous
    step's "p_distribuition" equals 0 and its probability in the current
    step's distribution is greater than that. The first step of a dialogue
    has no predecessor, so it always gets an empty list and a total of 0.

    Each step dict is modified in place: "resurrected_items" receives the list
    of resurrected item names (in the distribution's iteration order) and
    "resurrected_items_p" the sum of their current probabilities.

    Args:
        grouped_data: sequence of dialogue dicts, each holding a list of step
            dicts under "intra_dialogues".

    Returns:
        The same ``grouped_data`` object that was passed in.
    """
    for dialogue in grouped_data:
        earlier_step = None

        for step in dialogue["intra_dialogues"]:
            revived = []
            revived_mass = 0

            if earlier_step is not None:
                now = step["p_distribuition"]
                before = earlier_step["p_distribuition"]

                revived = [
                    name
                    for name in now
                    if before[name] == 0 and now[name] > before[name]
                ]

                # Plain left-to-right accumulation starting from the int 0.
                for name in revived:
                    revived_mass += now[name]

            step["resurrected_items"] = revived
            step["resurrected_items_p"] = revived_mass
            earlier_step = step

    return grouped_data

# compute_resurrected_items_p_sum adds its annotations to the step dicts in
# place and returns the very object it was given, so five_analyzed_data and
# ten_analyzed_data are the same lists as five_data and ten_data.
five_analyzed_data = compute_resurrected_items_p_sum(five_data)
ten_analyzed_data = compute_resurrected_items_p_sum(ten_data)
print(five_analyzed_data)

[{'dialogue_id': 0, 'intra_dialogues': [{'intra_dialogue_id': 0, 'p_distribuition': {'elk': 0.25, 'chicken': 0.0, 'robin': 0.0, 'starling': 0.0, 'fox': 0.25, 'partridge': 0.0, 'hamster': 0.25, 'buffalo': 0.25}, 'resurrected_items': [], 'resurrected_items_p': 0}, {'intra_dialogue_id': 1, 'p_distribuition': {'elk': 0.25, 'chicken': 0.0, 'robin': 0.0, 'starling': 0.0, 'fox': 0.25, 'partridge': 0.0, 'hamster': 0.25, 'buffalo': 0.25}, 'resurrected_items': [], 'resurrected_items_p': 0}, {'intra_dialogue_id': 2, 'p_distribuition': {'elk': 0.3333, 'chicken': 0.0, 'robin': 0.0, 'starling': 0.0, 'fox': 0.3333, 'partridge': 0.0, 'hamster': 0.0, 'buffalo': 0.3333}, 'resurrected_items': [], 'resurrected_items_p': 0}, {'intra_dialogue_id': 3, 'p_distribuition': {'elk': 0.0, 'chicken': 0.0, 'robin': 0.0, 'starling': 0.3, 'fox': 0.2, 'partridge': 0.0, 'hamster': 0.5, 'buffalo': 0.0}, 'resurrected_items': ['starling', 'hamster'], 'resurrected_items_p': 0.8}, {'intra_dialogue_id': 4, 'p_distribuition': 

In [7]:
def dump(path, grouped_data):
    """Append one CSV row per dialogue step to the file at ``path``.

    Each row holds, in order: the dialogue id, the step's "intra_dialogue_id",
    the step's probability distribution with the resurrected items removed and
    a "resurrected_items_p" entry added, the list of resurrected items, and
    the summed probability of the resurrected items.

    The collapsed distribution is built on a copy, so the dicts inside
    ``grouped_data`` are left untouched and the function can be called more
    than once on the same data. The file is opened once, in append mode, and
    only when there is at least one row to write.

    Args:
        path: destination CSV file; rows are appended to existing content.
        grouped_data: iterable of dialogue dicts with "dialogue_id" and
            "intra_dialogues"; every step needs "intra_dialogue_id",
            "p_distribuition", "resurrected_items" and "resurrected_items_p".
    """
    rows = []
    for dialogue in grouped_data:
        dialogue_id = dialogue["dialogue_id"]
        for step in dialogue["intra_dialogues"]:
            resurrected = set(step["resurrected_items"])

            # Copy instead of popping from the caller's dict: the caller's
            # distribution must survive so the data can be reused or re-dumped.
            collapsed = {
                item: p
                for item, p in step["p_distribuition"].items()
                if item not in resurrected
            }
            collapsed["resurrected_items_p"] = step["resurrected_items_p"]

            rows.append([
                dialogue_id,
                step["intra_dialogue_id"],
                collapsed,
                step["resurrected_items"],
                step["resurrected_items_p"],
            ])

    if not rows:
        # Nothing to write; do not create an empty file.
        return

    # newline="" lets the csv writer control line endings itself; without it
    # extra blank lines can appear on platforms that translate "\n".
    with open(path, "a", newline="") as f:
        csv.writer(f).writerows(rows)
        
# dump() opens its target in append mode, so running this cell again adds the
# rows to whatever the output files already contain.
dump("./data/generation/8_mcrae/dialogues_sbs_k_five_distr_w_resurr_p.csv", five_analyzed_data)
dump("./data/generation/8_mcrae/dialogues_sbs_k_ten_distr_w_resurr_p.csv", ten_analyzed_data)