In [2]:
import json
import pandas as pd

with open("data_with_prompts_2_with_label_field.json", "rb") as d:
   actual_tagging = json.loads(d.read())

df_gt = pd.DataFrame(actual_tagging)
df_gt = df_gt.drop_duplicates(subset=["input", "label"])
df_gt = df_gt.reset_index(drop=True)
df_gt = df_gt[["label", "input", "output"]]
# save to json
df_gt.to_json("gt_annotation.json", orient="records")

In [3]:
import pandas as pd
df_gt = pd.read_json("gt_annotation.json", orient="records")
df_gt

Unnamed: 0,label,input,output
0,Artifacts,"Then Sarah had an idea, another dream.",[]
1,Behavioral,"Then Sarah had an idea, another dream.",[]
2,Biological Kind,"Then Sarah had an idea, another dream.",[Sarah]
3,Mental State,"Then Sarah had an idea, another dream.","[dream, idea]"
4,Non-Living Kind,"Then Sarah had an idea, another dream.",[]
...,...,...,...
5107,Non-Living Kind,That made him feel better.,[]
5108,Normative Feature,That made him feel better.,[better]
5109,Perceptual,That made him feel better.,[]
5110,Social Kind/Role,That made him feel better.,[]


In [12]:
import re
# find start and end index of the label in the input text
def find_start_end_index(input_text, label):
    start_index = input_text.find(label)
    if start_index == -1:
        return -1, -1
    elif input_text.find(label, start_index + 1) != -1:
        # if the label is present more than once in the input text
        return -2, -2

    end_index = start_index + len(label)

    return start_index, end_index

# create a list of dictionaries with start and end index of the label in the input text
def create_labels_list(df):
    labels = []
    for _,row in df.iterrows():
        for word in row['output']:
            input_text = row['input']
            start_indices = [m.start() for m in re.finditer(word, input_text)]
            for start_index in start_indices:
                end_index = start_index + len(word)
                labels.append(
                    {
                        "sentence": row['input'], 
                        "word": word,
                        "label": row['label'], 
                        "start": start_index, 
                        "end": end_index, 
                        "sure": len(start_indices) == 1
                    }
                )


    return labels

labels = create_labels_list(df_gt)
# create a dataframe from the list of dictionaries
df_labels = pd.DataFrame(labels)
df_labels

Unnamed: 0,sentence,word,label,start,end,sure
0,"Then Sarah had an idea, another dream.",Sarah,Biological Kind,5,10,True
1,"Then Sarah had an idea, another dream.",dream,Mental State,32,37,True
2,"Then Sarah had an idea, another dream.",idea,Mental State,18,22,True
3,"“Wow, what a big house you have!” says Bailey.",house,Artifacts,17,22,True
4,"“Wow, what a big house you have!” says Bailey.",have,Behavioral,27,31,True
...,...,...,...,...,...,...
5884,I Am Positive.,Positive,Mental State,5,13,True
5885,That made him feel better.,made,Behavioral,5,9,True
5886,That made him feel better.,him,Biological Kind,10,13,True
5887,That made him feel better.,feel,Mental State,14,18,True


In [17]:
df_labels = df_labels.drop_duplicates(subset=["sentence", "word", "label", "start", "end"])
df_labels = df_labels.reset_index(drop=True)
df_labels

Unnamed: 0,sentence,word,label,start,end,sure
0,"Then Sarah had an idea, another dream.",Sarah,Biological Kind,5,10,True
1,"Then Sarah had an idea, another dream.",dream,Mental State,32,37,True
2,"Then Sarah had an idea, another dream.",idea,Mental State,18,22,True
3,"“Wow, what a big house you have!” says Bailey.",house,Artifacts,17,22,True
4,"“Wow, what a big house you have!” says Bailey.",have,Behavioral,27,31,True
...,...,...,...,...,...,...
4510,I Am Positive.,Positive,Mental State,5,13,True
4511,That made him feel better.,made,Behavioral,5,9,True
4512,That made him feel better.,him,Biological Kind,10,13,True
4513,That made him feel better.,feel,Mental State,14,18,True


In [18]:
# find all sure = False labels and save to csv
df_not_sure = df_labels[df_labels["sure"] == False]
df_not_sure

Unnamed: 0,sentence,word,label,start,end,sure
30,Glossary content (kuhn-TENT) To feel content m...,content,Mental State,9,16,False
31,Glossary content (kuhn-TENT) To feel content m...,content,Mental State,37,44,False
32,Glossary content (kuhn-TENT) To feel content m...,feel,Mental State,32,36,False
33,Glossary content (kuhn-TENT) To feel content m...,feel,Mental State,54,58,False
51,A seahorse scuttled… a raindrop puddled… An ow...,owl,Biological Kind,44,47,False
...,...,...,...,...,...,...
4453,"""You shall put on your things and help me, and...",me,Biological Kind,57,59,False
4454,"""You shall put on your things and help me, and...",me,Biological Kind,78,80,False
4473,"Then someone came and asked me, Can we paint t...",me,Biological Kind,7,9,False
4474,"Then someone came and asked me, Can we paint t...",me,Biological Kind,15,17,False


In [26]:
# df_labels[df_labels["sentence"] == "lt is Ellie,eagerly splashing on her bellyEllie enjoys the hot, hot sun."]["sentence"] = "It is Ellie,eagerly splashing on her belly Ellie enjoys the hot, hot sun."
df_not_sure = df_labels[df_labels["word"] == "judges"]

df_not_sure

Unnamed: 0,sentence,word,label,start,end,sure
284,We need one more letter to help judge the cont...,judges,Social Kind/Role,177,183,True


In [19]:
df_not_sure.to_csv("not_sure_labels.csv")

In [14]:
# drop row 56 and print the row
print(df_labels.iloc[56])
df_labels = df_labels.drop(56)



sentence    A seahorse scuttled… a raindrop puddled… An ow...
word                                                      owl
label                                         Biological Kind
start                                                      49
end                                                        52
sure                                                    False
Name: 56, dtype: object


In [18]:
# find all start=-1
for _,row in df_labels.iterrows():
    if row['start'] == -2:
        print(row['sentence'])
        print(row['word'])
        print(row['label'])
        print(row['start'])
        print(row['end'])
        print()

Glossary content (kuhn-TENT) To feel content means to feel happy and peaceful.
content
Mental State
-2
-2

Glossary content (kuhn-TENT) To feel content means to feel happy and peaceful.
feel
Mental State
-2
-2

Glossary content (kuhn-TENT) To feel content means to feel happy and peaceful.
content
Mental State
-2
-2

Glossary content (kuhn-TENT) To feel content means to feel happy and peaceful.
feel
Mental State
-2
-2

A seahorse scuttled… a raindrop puddled… An owl howled, sad, at the moon.
owl
Biological Kind
-2
-2

Now, I wanted just to tell you, and it didn't take long the way I feel about you is a kind of a song.
I
Biological Kind
-2
-2

Now, I wanted just to tell you, and it didn't take long the way I feel about you is a kind of a song.
you
Biological Kind
-2
-2

Now, I wanted just to tell you, and it didn't take long the way I feel about you is a kind of a song.
I
Biological Kind
-2
-2

Now, I wanted just to tell you, and it didn't take long the way I feel about you is a kind of 

## Clean the data

In [28]:
df_not_sure = pd.read_csv("not_sure_labels.csv", index_col=0)
df_not_sure

Unnamed: 0,sentence,word,label,start,end,sure
30,Glossary content (kuhn-TENT) To feel content m...,content,Mental State,9,16,False
31,Glossary content (kuhn-TENT) To feel content m...,content,Mental State,37,44,False
32,Glossary content (kuhn-TENT) To feel content m...,feel,Mental State,32,36,False
33,Glossary content (kuhn-TENT) To feel content m...,feel,Mental State,54,58,False
51,A seahorse scuttled… a raindrop puddled… An ow...,owl,Biological Kind,44,47,False
...,...,...,...,...,...,...
4418,no Is he in the piano?,he,Biological Kind,6,8,False
4449,"""You shall put on your things and help me, and...",we,Biological Kind,52,54,False
4450,"""You shall put on your things and help me, and...",we,Biological Kind,66,68,False
4452,"""You shall put on your things and help me, and...",me,Biological Kind,39,41,False


In [33]:
df_labels.iloc[51]

sentence    A seahorse scuttled… a raindrop puddled… An ow...
word                                                      owl
label                                         Biological Kind
start                                                      44
end                                                        47
sure                                                    False
Name: 51, dtype: object

In [32]:
# for all rows in df_not_sure, replace the df_labels row with the df_not_sure row
for i, row in df_not_sure.iterrows():
    print(type(i))
    print(type(row))
    df_labels.iloc[i] = row
    df_labels.iloc[i]["sure"] = True
df_labels

30


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

## Assign id to each sentence

In [2]:
# read final_data.json
import json
import pandas as pd
with open("final_data.json", "rb") as d:
    final_data = json.loads(d.read())

df_final = pd.DataFrame(final_data)
df_final

Unnamed: 0,sentence,word,label,start,end
0,"Then Sarah had an idea, another dream.",Sarah,Biological Kind,5,10
1,"Then Sarah had an idea, another dream.",dream,Mental State,32,37
2,"Then Sarah had an idea, another dream.",idea,Mental State,18,22
3,"“Wow, what a big house you have!” says Bailey.",house,Artifacts,17,22
4,"“Wow, what a big house you have!” says Bailey.",have,Behavioral,27,31
...,...,...,...,...,...
4121,I Am Positive.,Positive,Mental State,5,13
4122,That made him feel better.,made,Behavioral,5,9
4123,That made him feel better.,him,Biological Kind,10,13
4124,That made him feel better.,feel,Mental State,14,18


In [6]:
# Assign id to each different sentence
df_final["sentence_id"] = df_final.groupby("sentence").ngroup()
df_final[df_final['sentence'] == "We need one more letter to help judge the contestants, and you’ll be perfect!” said the Letter J to the Letter F. Without saying a word, the Letter F sat down next to the other judges and watched the letters perform, one by one."]

Unnamed: 0,sentence,word,label,start,end,sentence_id
251,We need one more letter to help judge the cont...,letter,Anthropomorphized,17,23,465
252,We need one more letter to help judge the cont...,letters,Anthropomorphized,200,207,465
253,We need one more letter to help judge the cont...,letters,Artifacts,200,207,465
254,We need one more letter to help judge the cont...,word,Artifacts,131,135,465
255,We need one more letter to help judge the cont...,watched,Behavioral,188,195,465
256,We need one more letter to help judge the cont...,said,Behavioral,79,83,465
257,We need one more letter to help judge the cont...,sat,Behavioral,150,153,465
258,We need one more letter to help judge the cont...,perform,Behavioral,208,215,465
259,We need one more letter to help judge the cont...,help,Behavioral,27,31,465
260,We need one more letter to help judge the cont...,saying,Behavioral,122,128,465


In [5]:
# save to json
df_final.to_json("final_data.json", orient="records")