In [1]:
%load_ext autoreload
%autoreload 2

In [8]:
from transformers import pipeline

triplet_extractor = pipeline('text2text-generation', model='Babelscape/rebel-large', tokenizer='Babelscape/rebel-large')


# Function to parse the generated text and extract the triplets
def extract_triplets(text):
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'subject': subject.strip(), 'relation': relation.strip(),'object': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'subject': subject.strip(), 'relation': relation.strip(),'object': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'subject': subject.strip(), 'relation': relation.strip(),'object': object_.strip()})
    return triplets

In [9]:
input_text = "Punta Cana is a resort town in the municipality of Higuey, in La Altagracia Province, the eastern most province of the Dominican Republic"
extracted_text = triplet_extractor.tokenizer.batch_decode([triplet_extractor(input_text, return_tensors=True, return_text=False)[0]["generated_token_ids"]])

print(extracted_text[0])
extracted_triplets = extract_triplets(extracted_text[0])
extracted_triplets

<s><triplet> Punta Cana <subj> La Altagracia Province <obj> located in the administrative territorial entity <subj> Dominican Republic <obj> country <triplet> Higuey <subj> La Altagracia Province <obj> located in the administrative territorial entity <subj> Dominican Republic <obj> country <triplet> La Altagracia Province <subj> Dominican Republic <obj> country <triplet> Dominican Republic <subj> La Altagracia Province <obj> contains administrative territorial entity</s>


[{'subject': 'Punta Cana',
  'relation': 'located in the administrative territorial entity',
  'object': 'La Altagracia Province'},
 {'subject': 'Punta Cana',
  'relation': 'country',
  'object': 'Dominican Republic'},
 {'subject': 'Higuey',
  'relation': 'located in the administrative territorial entity',
  'object': 'La Altagracia Province'},
 {'subject': 'Higuey', 'relation': 'country', 'object': 'Dominican Republic'},
 {'subject': 'La Altagracia Province',
  'relation': 'country',
  'object': 'Dominican Republic'},
 {'subject': 'Dominican Republic',
  'relation': 'contains administrative territorial entity',
  'object': 'La Altagracia Province'}]

In [10]:
[print(t) for t in extracted_triplets]

{'subject': 'Punta Cana', 'relation': 'located in the administrative territorial entity', 'object': 'La Altagracia Province'}
{'subject': 'Punta Cana', 'relation': 'country', 'object': 'Dominican Republic'}
{'subject': 'Higuey', 'relation': 'located in the administrative territorial entity', 'object': 'La Altagracia Province'}
{'subject': 'Higuey', 'relation': 'country', 'object': 'Dominican Republic'}
{'subject': 'La Altagracia Province', 'relation': 'country', 'object': 'Dominican Republic'}
{'subject': 'Dominican Republic', 'relation': 'contains administrative territorial entity', 'object': 'La Altagracia Province'}


[None, None, None, None, None, None]

In [11]:
input_dialogue =  [
   "Speaker 1: Hey!",
   "Speaker 2: Hey.",
   "Speaker 3: Hey, man. What's up?",
   "Speaker 1: Maybe you can tell me. My agent would like to know why I didn't show up at the audition I didn't know I had today. The first good thing she gets me in weeks. How could you not give me the message?!",
   "Speaker 3: Well, I'll tell ya I do enjoy guilt, but, ah, it wasn't me.",
   "Speaker 2: Yes, it was! It was him! Uh huh! Okay, it was me!",
   "Speaker 1: How is it you?",
   "Speaker 2: Well, it was just, it was all so crazy, you know. I mean, Chandler was in the closet, counting to 10, and he was up to 7 and I hadn't found a place to hide yet. I-I-I meant to tell you, and I wrote it all down on my hand. See, all of it.",
   "Speaker 1: Yep, that's my audition.",
   "Speaker 4: See, now this is why I keep notepads everywhere.",
   "Speaker 2: Yep, and that's why we don't invite you to play.",
   "Speaker 5: What is the great tragedy here? You go get yourself another appointment.",
   "Speaker 1: Well, Estelle tried, you know. The casting director told her that I missed my chance.",
   "Speaker 2: That is unfair. I'll call her and tell her it was totally my fault.",
   "Speaker 1: Pheebs, you can't do that. The casting director doesn't talk to friends, she only talks to agents.",
   "Speaker 2: What a sad little life she must lead. Okay, ooh.",
   "Speaker 1: What, what are you doing? What are you doing?",
   "Speaker 2: No, no, no, I know, I know, ooh. 'Hi, this is Katelynn, from Phoebe Buffay's office. Um, is um, Ann there for Phoebe, she'll know what it's about.'",
   "Speaker 1: Hang up, hang up.",
   "Speaker 2: 'Annie! Hi. Listen we got a problem with Joey Tribbiani, apparently he missed his audition. Who did you speak to in my office? Estelle, no, I don't know what I'm going to do with her. No. All right, so your husband leaves and burns down the apartment, the world does not stop.'",
   "Speaker 3: Is anybody else scared?",
   "Speaker 2: 'Right, well look, um, if Joey loses this audition, that is it for Estelle. I don't care! Annie you are a doll, what time can you see him?' I need a pen.",
   "Speaker 3: Get the woman a pad! Get the woman a pad! A pad! A pad!",
   "Speaker 4: Oh, now you want a pad."
  ]
  
  
input_text = "\n".join(input_dialogue)
extracted_text = triplet_extractor.tokenizer.batch_decode([triplet_extractor(input_text, return_tensors=True, return_text=False)[0]["generated_token_ids"]])

print(extracted_text[0])
extracted_triplets = extract_triplets(extracted_text[0])
extracted_triplets

<s><triplet> My agent would like to know why I didn't show up at the audition I didn't know I had today. I-I-I meant to tell you, and I wrote it all down on my hand. I don't care! Annie you are a doll, what time can you see him <subj> my audition <obj> present in work <triplet> my audition <subj> My agent would like to know why I didn't show up at the audition I didn't know I had today. I-I-I meant to tell you, and I wrote it all down on my hand. I don't care! Annie you</s>


[{'subject': "My agent would like to know why I didn't show up at the audition I didn't know I had today. I-I-I meant to tell you, and I wrote it all down on my hand. I don't care! Annie you are a doll, what time can you see him",
  'relation': 'present in work',
  'object': 'my audition'}]