In [1]:
import pandas as pd

In [47]:
df = pd.read_csv("clean_results_no_duplicates.csv")

In [48]:
df = df.drop('Unnamed: 0', axis=1)

In [49]:
df = df[df['subject'] != 36]

In [14]:
df

Unnamed: 0,subject,trial,temperature,likert_score,evaluation_prompt,sentece,dataset
0,0,0,0,4,1,Hidden meaning transforms unparalleled abstrac...,0
1,0,1,0,3,1,Good health imparts reality to subtle creativity.,0
2,0,2,0,4,1,Wholeness quiets infinite phenomena.,0
3,0,3,0,2,1,The future explains irrational facts.,0
4,0,4,0,3,1,Imagination is inside exponential space time e...,0
...,...,...,...,...,...,...,...
20898,35,765,1,4,10,At the centre of your being you have the answe...,3
20899,35,766,1,4,10,A wet person does not fear the rain.,3
20900,35,767,1,4,10,Forgiveness means letting go of the hope for a...,3
20901,35,768,1,4,10,Only those who will risk going too far can pos...,3


In [5]:
df["subject"].value_counts()

subject
15    1232
33    1210
0     1118
26    1100
22    1100
9      955
31     940
21     922
25     886
11     881
13     880
30     880
32     869
10     818
16     770
35     770
23     687
6      684
14     585
12     475
4      440
34     420
36     416
8      362
3      349
19     334
7      256
1      250
5      248
17     142
20      80
24      60
27      40
28      40
29      40
2       40
18      40
Name: count, dtype: int64

In [7]:
len(df)

21319

In [16]:
(2 * 10 * 110)

2200

In [50]:
no_score_df = df.drop('likert_score', axis=1)
all_measured_experiments = no_score_df.to_dict(orient='records')

In [24]:
from rate_sentences import load_sentences, assign_dataset_to_sentences

In [25]:
bs_sentences = assign_dataset_to_sentences(load_sentences("./data/BS.xlsx"), "0")
bs_sentences_new = assign_dataset_to_sentences(
    load_sentences("./data/New_BS.xlsx"), "1"
)
bs_sentences_generated = assign_dataset_to_sentences(
    load_sentences("./data/BS_generated.xlsx", use_header=True), "2"
)
motivational_sentences = assign_dataset_to_sentences(
    load_sentences("./data/Motivational.xlsx"), "3"
)
mundane_sentences = assign_dataset_to_sentences(
    load_sentences("./data/Mundane.xlsx"), "4"
)

all_sentences = (
    bs_sentences
    + bs_sentences_generated
    + bs_sentences_new
    + mundane_sentences
    + motivational_sentences
)

In [33]:
from prompts import EVALUATION_PROMPTS_DICT

TEMPERATURES = {"0": 0.1, "1": 0.7}
NUM_SUBJECTS = 35

NUM_SENTENCES = len(all_sentences)
NUM_TEMPERATURES = len(TEMPERATURES)
NUM_EVALUATION_PROMPTS = len(EVALUATION_PROMPTS_DICT)
TOTAL_TRIALS = (
    NUM_SUBJECTS * NUM_SENTENCES * NUM_TEMPERATURES * NUM_EVALUATION_PROMPTS
)

In [86]:
all_experiments = []

for subject in range(NUM_SUBJECTS):
    trial = 0
    for (temperature_type, temperature_value) in TEMPERATURES.items():
        for (evaluation_type, evaluation_prompt) in EVALUATION_PROMPTS_DICT.items():
            for num_sentence in range(NUM_SENTENCES):
                sentence = all_sentences[num_sentence]["sentence"]
                dataset = all_sentences[num_sentence]["dataset"]
                experiment = {
                    "subject": subject,
                    "trial": trial,
                    "temperature": int(temperature_type),
                    "evaluation_prompt": int(evaluation_type),
                    "sentence": sentence,
                    "dataset": int(dataset)
                }
                all_experiments.append(experiment)
                trial += 1

In [87]:
df_all_experiments = pd.DataFrame(all_experiments)
df_all_measured_experiments = pd.DataFrame(all_measured_experiments)

In [54]:
df_all_experiments.columns

Index(['subject', 'trial', 'temperature', 'evaluation_prompt', 'sentence',
       'dataset'],
      dtype='object')

In [59]:
cols_to_cast = ['subject', 'trial', 'temperature', 'evaluation_prompt', 'dataset']
for df in [df_all_experiments, df_all_measured_experiments]:
    for col in cols_to_cast:
        df[col] = df[col].astype(int)

In [60]:
df_all_measured_experiments.columns.equals(df_all_experiments.columns)

True

In [68]:
len(df_all_experiments) + len(df_all_measured_experiments)

97903

In [92]:
merged = pd.concat([df_all_experiments, df_all_measured_experiments])

In [125]:
merged = merged.drop('trial', axis=1)

In [126]:
to_do = merged.drop_duplicates(keep=False)

In [147]:
to_do = to_do[to_do['subject'] != 35]
first_batch = list(to_do["subject"].value_counts().tail(10).index)
first_batch = to_do[to_do['subject'].isin(first_batch)]

first_batch.to_csv('first_batch.csv', index=False)

In [65]:
df_all_measured_experiments

Unnamed: 0,subject,trial,temperature,evaluation_prompt,sentence,dataset
0,0,0,0,1,Hidden meaning transforms unparalleled abstrac...,0
1,0,1,0,1,Good health imparts reality to subtle creativity.,0
2,0,2,0,1,Wholeness quiets infinite phenomena.,0
3,0,3,0,1,The future explains irrational facts.,0
4,0,4,0,1,Imagination is inside exponential space time e...,0
...,...,...,...,...,...,...
20898,35,765,1,10,At the centre of your being you have the answe...,3
20899,35,766,1,10,A wet person does not fear the rain.,3
20900,35,767,1,10,Forgiveness means letting go of the hope for a...,3
20901,35,768,1,10,Only those who will risk going too far can pos...,3


In [66]:
df_all_experiments

Unnamed: 0,subject,trial,temperature,evaluation_prompt,sentence,dataset
0,0,0,0,1,Hidden meaning transforms unparalleled abstrac...,0
1,0,1,0,1,Good health imparts reality to subtle creativity.,0
2,0,2,0,1,Wholeness quiets infinite phenomena.,0
3,0,3,0,1,The future explains irrational facts.,0
4,0,4,0,1,Imagination is inside exponential space time e...,0
...,...,...,...,...,...,...
76995,34,2195,1,10,At the centre of your being you have the answe...,3
76996,34,2196,1,10,A wet person does not fear the rain.,3
76997,34,2197,1,10,Forgiveness means letting go of the hope for a...,3
76998,34,2198,1,10,Only those who will risk going too far can pos...,3


In [88]:
print(all_experiments[0])
print(all_measured_experiments[0])

{'subject': 0, 'trial': 0, 'temperature': 0, 'evaluation_prompt': 1, 'sentence': 'Hidden meaning transforms unparalleled abstract beauty.', 'dataset': 0}
{'subject': 0, 'trial': 0, 'temperature': 0, 'evaluation_prompt': 1, 'sentence': 'Hidden meaning transforms unparalleled abstract beauty.', 'dataset': 0}


In [112]:
not_in_all = []
for experiment in all_measured_experiments:
    if experiment in all_experiments:
        pass
    else:
        not_in_all.append(experiment)

In [114]:
len(not_in_all)

18451

In [115]:
not_in_all[0]

{'subject': 0,
 'trial': 880,
 'temperature': 0,
 'evaluation_prompt': 9,
 'sentence': 'Most people enjoy some sort of music.',
 'dataset': 4}

In [124]:
all_experiments[880:1000]

[{'subject': 0,
  'trial': 880,
  'temperature': 0,
  'evaluation_prompt': 9,
  'sentence': 'Hidden meaning transforms unparalleled abstract beauty.',
  'dataset': 0},
 {'subject': 0,
  'trial': 881,
  'temperature': 0,
  'evaluation_prompt': 9,
  'sentence': 'Good health imparts reality to subtle creativity.',
  'dataset': 0},
 {'subject': 0,
  'trial': 882,
  'temperature': 0,
  'evaluation_prompt': 9,
  'sentence': 'Wholeness quiets infinite phenomena.',
  'dataset': 0},
 {'subject': 0,
  'trial': 883,
  'temperature': 0,
  'evaluation_prompt': 9,
  'sentence': 'The future explains irrational facts.',
  'dataset': 0},
 {'subject': 0,
  'trial': 884,
  'temperature': 0,
  'evaluation_prompt': 9,
  'sentence': 'Imagination is inside exponential space time events.',
  'dataset': 0},
 {'subject': 0,
  'trial': 885,
  'temperature': 0,
  'evaluation_prompt': 9,
  'sentence': 'Your consciousness gives rise to a jumble of neural networks.',
  'dataset': 0},
 {'subject': 0,
  'trial': 886,


In [97]:
all = all_experiments + all_measured_experiments

In [109]:
pd.DataFrame(all).drop_duplicates(keep=False)

Unnamed: 0,subject,temperature,evaluation_prompt,dataset,sentence,trial
0,16,0,6,1,The grid is buzzing with superpositions of pos...,630
1,21,1,6,1,The ego imparts reality to the flow of photons.,1724
2,26,1,3,3,At the centre of your being you have the answe...,1425
3,6,1,7,2,Mind and matter are subtle and dense vibration...,1815
4,0,1,1,0,We are in the midst of a self-aware blossoming...,1110
...,...,...,...,...,...,...
95437,34,0,2,3,"Your teacher can open the door, but you must e...",210
95438,20,1,6,2,Your consciousness gives rise to a jumble of n...,1685
95439,5,1,10,2,Every material quasiparticle is a relationship...,2147
95440,18,0,9,0,Hidden meaning transforms unparalleled abstrac...,880


In [99]:
all = [dict(s) for s in set(frozenset(d.items()) for d in all)]

In [102]:
len(all)

95442

In [53]:
comparison_result = df_all_experiments.compare(df_all_measured_experiments, keep_shape=True)
different_rows = comparison_result[comparison_result['_merge'] != 'both']

ValueError: Can only compare identically-labeled (both index and columns) DataFrame objects

In [90]:
differences = [d for d in all_experiments if d not in all_measured_experiments] + [d for d in all_measured_experiments if d not in all_experiments]

In [91]:
len(differences)

93000

In [96]:
set_list1 = set(tuple(sorted(d.items())) for d in sorted(all_experiments))
set_list2 = set(tuple(sorted(d.items())) for d in sorted(all_measured_experiments))

TypeError: '<' not supported between instances of 'dict' and 'dict'

In [108]:
L=[
    {'id': 1, 'name': 'john', 'age': 34},
    {'id': 1, 'name': 'john', 'age': 34},
    {'id': 2, 'name': 'hanna', 'age': 30},
]
pd.DataFrame(L).drop_duplicates(keep=False)

Unnamed: 0,id,name,age
2,2,hanna,30
