# Data Preprocessing : Synthetic Dataset Generation

### Dependencies

In [None]:
# Install spaCy, which is used below to parse the captions.
!pip install spacy

In [6]:
import pandas as pd

In [None]:
# Clone the TGIF release repository; the caption TSV loaded below is read from its data/ directory.
!git clone https://github.com/raingo/TGIF-Release/

In [8]:
# Load the TGIF captions. Column names are supplied here, so every line of the
# tab-separated file is read as a data row of (url, desc).
TGIF_TSV_PATH = '/content/TGIF-Release/data/tgif-v1.0.tsv'
df = pd.read_csv(TGIF_TSV_PATH, sep='\t', names=['url', 'desc'])

In [None]:
# Preview the first rows to check that the file was parsed into the two columns.
df.head()

## Synthetic Question-Answer pair generation

We use a T5 transformer that has been fine-tuned on the *question generation* task. This simplifies the pipeline: we only need to extract a variety of potential answers from the text descriptions, and the question generation model produces a matching question for each one.


We define a variety of answers such as:

1. living entities (for entity recognition)
2. nouns (for object recognition)
3. verbs (for action recognition)
4. prepositional phrases (related to object detection capabilities)
5. colors (descriptions of objects)

In [11]:
import urllib.request
from bs4 import BeautifulSoup

def getcolors():
    """Return the lowercase color names listed on the w3schools color page.

    The page is downloaded and parsed only on the first call. The parsed names
    are cached on the function object, so later calls (for example one per
    caption) do not touch the network again.

    Returns:
        list[str]: A fresh list of color names. CamelCase entries are split
        into words, so a cell reading "AliceBlue" becomes "alice blue".

    Raises:
        urllib.error.URLError: If the page cannot be fetched on the first call.
    """
    cached = getattr(getcolors, "_cache", None)
    if cached is None:
        # `with` closes the HTTP response; the timeout keeps a stalled
        # connection from hanging the notebook indefinitely.
        with urllib.request.urlopen(
            'http://www.w3schools.com/colors/colors_names.asp', timeout=30
        ) as response:
            html = response.read()
        soup = BeautifulSoup(html, 'html.parser')
        children = [item.findChildren() for item in soup.find_all('tr')]
        colors = [
            # Insert a space before every capital letter, then lowercase.
            ''.join(
                ' ' + x if 'A' <= x <= 'Z' else x
                for x in item[0].text.replace(u'\xa0', '')
            ).strip().lower()
            for item in children
        ]
        # The first row is dropped (presumably the table header -- confirm
        # against the page if the layout changes).
        cached = getcolors._cache = tuple(colors[1:])
    # Hand out a copy so callers cannot mutate the cached names.
    return list(cached)

In [None]:
import spacy
from transformers import AutoModelWithLMHead, AutoTokenizer
from collections import Counter
import random

# Load the spaCy English pipeline used to parse the captions.
nlp = spacy.load("en_core_web_sm")

# Load the T5 question-generation checkpoint and its tokenizer.
# AutoModelWithLMHead is deprecated in transformers. T5 is an encoder-decoder
# model, so AutoModelForSeq2SeqLM (the class this notebook also uses for REBEL)
# is the supported way to load it.
from transformers import AutoModelForSeq2SeqLM

QG_CHECKPOINT = "mrm8488/t5-base-finetuned-question-generation-ap"
tokenizer = AutoTokenizer.from_pretrained(QG_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(QG_CHECKPOINT)

In [13]:
def get_question(answer, context, max_length=64):
    """Generate a question about `context` that targets `answer`.

    Args:
        answer: Answer span the generated question should point at.
        context: Text the question is generated from.
        max_length: Forwarded to `model.generate` as the generation limit.

    Returns:
        str: The first generated sequence, decoded without special tokens.
    """
    prompt = f"answer: {answer}  context: {context} </s>"
    encoded = tokenizer([prompt], return_tensors='pt')

    generated = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=max_length,
    )

    # A single prompt was submitted, so only the first sequence is relevant.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def extract_potential_answers(description):
    """Collect candidate answer spans from a caption.

    Candidates come from the spaCy parse of `description`:

    * named entities and noun chunks,
    * every verb, plus the span from the root verb to its right edge when
      that edge is not the last token of the document,
    * "adjective noun" pairs,
    * prepositional phrases (the preposition followed by its subtree),
    * the full sentence around any token that is a known color name.

    Args:
        description: Caption text to analyse.

    Returns:
        list[str]: Unique candidates in first-seen order.
    """
    doc = nlp(description)
    potential_answers = []

    potential_answers.extend(ent.text for ent in doc.ents)
    potential_answers.extend(chunk.text for chunk in doc.noun_chunks)

    for token in doc:
        if token.pos_ == "VERB":
            potential_answers.append(token.text)
            if token.dep_ == "ROOT" and token.right_edge.i < len(doc) - 1:
                potential_answers.append(doc[token.i:token.right_edge.i + 1].text)

    # An attributive adjective is an `amod` dependent of the noun it modifies,
    # so the noun sits on token.head. Searching the adjective's children (as
    # this code used to) almost never finds it.
    potential_answers.extend(
        f"{token.text} {token.head.text}"
        for token in doc
        if token.pos_ == "ADJ"
        and token.dep_ == "amod"
        and token.head.pos_ == "NOUN"
    )

    for token in doc:
        if token.pos_ == "ADP" and token.dep_ == "prep":
            # Preposition first, then the rest of its subtree in document order.
            rest = [child.text for child in token.subtree if child != token]
            potential_answers.append(" ".join([token.text, *rest]))

    # getcolors() is called once per description; a set makes the per-token
    # membership test O(1).
    colors = set(getcolors())
    potential_answers.extend(
        token.sent.text for token in doc if token.text.lower() in colors
    )

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result (and any seeded sampling from it) is reproducible across runs.
    return list(dict.fromkeys(potential_answers))

def generate_qa_pairs(description):
    """Build (question, answer) pairs for a single caption.

    Up to five candidate answers are sampled from the caption and a question
    is generated for each. Two of the five generic summary prompts are then
    added, each paired with the whole description as its answer.

    Args:
        description: Caption text.

    Returns:
        list[tuple[str, str]]: Generated pairs first, then the two summary pairs.
    """
    candidates = extract_potential_answers(description)
    chosen = random.sample(candidates, min(5, len(candidates)))
    qa_pairs = [(get_question(answer, description), answer) for answer in chosen]

    # summary based questions
    summary_prompts = [
        "What is happening in the image?",
        "Can you describe the scene?",
        "What's the main focus of this description?",
        "Who are the main characters or objects in this scene?",
        "What's the most striking feature of this description?",
    ]
    # random sampling for diversity
    picked_prompts = random.sample(summary_prompts, 2)
    qa_pairs.extend((prompt, description) for prompt in picked_prompts)

    return qa_pairs

In [None]:
import json
import shutil
from tqdm import tqdm

# Number of QA triplets written to each numbered checkpoint file.
CHECKPOINT_SIZE = 2500
# Google Drive folder that receives a copy of every checkpoint.
DRIVE_DIR = "/content/drive/MyDrive"


def _save_checkpoint(triplets, filename, message):
    """Write `triplets` to `filename` as JSON, report it, and copy it to Drive.

    The copy uses the very path that was just written, so it does not depend
    on the working directory being /content.
    """
    with open(filename, 'w') as f:
        json.dump({'qa_triplets': triplets}, f)
    print(message)
    shutil.copy(filename, DRIVE_DIR)


all_qa_triplets = []
urls = []
tag = 1

# The 'url' column is optional; use None for every row when it is absent.
url_values = df['url'] if 'url' in df.columns else [None] * len(df)

for url, description in tqdm(zip(url_values, df['desc']), total=len(df)):
    urls.append(url)

    # pandas yields NaN (a float) for a missing description. Skip such rows,
    # and blank ones, instead of crashing the whole run inside the parser.
    if not isinstance(description, str) or not description.strip():
        continue

    for question, answer in generate_qa_pairs(description):
        all_qa_triplets.append({'question': question, 'answer': answer, 'url': url})

    # Save a checkpoint each time CHECKPOINT_SIZE qa pairs have accumulated.
    while len(all_qa_triplets) >= CHECKPOINT_SIZE:
        _save_checkpoint(
            all_qa_triplets[:CHECKPOINT_SIZE],
            f'checkpoint_{tag}.json',
            f"Checkpoint saved: checkpoint_{tag}.json",
        )
        all_qa_triplets = all_qa_triplets[CHECKPOINT_SIZE:]
        tag += 1

# Flush the remainder. It is copied to Drive like every other checkpoint, so
# the tail of the dataset is not left only on the local disk.
if all_qa_triplets:
    _save_checkpoint(
        all_qa_triplets,
        'checkpoint_final.json',
        "Final checkpoint saved: checkpoint_final.json",
    )

## Knowledge Base and Subject-Predicate-Object triplets

Generating Subject-Predicate-Object triplets from the natural language descriptions could help train our model on action recognition, which is one of the hardest parts of VideoQA. Owing to time constraints, we were unable to generate reasonable triplets.

In [None]:
# taken from https://www.nlplanet.org/course-practical-nlp/02-practical-nlp-first-tasks/16-knowledge-graph-from-text

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import math
import torch

In [None]:
# Load the REBEL checkpoint and its tokenizer.
# NOTE: this rebinds the notebook-level `tokenizer` and `model` names that
# get_question() also reads, so the question-generation model must be loaded
# again before get_question() is used after this cell has run.
REBEL_CHECKPOINT = "Babelscape/rebel-large"
tokenizer = AutoTokenizer.from_pretrained(REBEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(REBEL_CHECKPOINT)

In [None]:
# from https://huggingface.co/Babelscape/rebel-large
def extract_relations_from_model_output(text):
    """Parse a decoded, marker-delimited model output into relation dicts.

    The text is scanned token by token. `<triplet>` starts the head text,
    `<subj>` starts the tail text and `<obj>` starts the relation type text.
    The sentinel strings `<s>`, `<pad>` and `</s>` are removed first.

    Args:
        text: Decoded sequence, special tokens included.

    Returns:
        list[dict]: Relations as {'head', 'type', 'tail'} dicts, in the order
        they are completed.
    """
    cleaned = text.strip()
    for sentinel in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(sentinel, "")

    # Words collected so far for each field of the relation being built.
    parts = {'head': [], 'type': [], 'tail': []}
    target = None  # field receiving plain tokens; None until a marker is seen
    found = []

    def emit():
        # Snapshot the current fields as one relation.
        found.append({field: " ".join(words) for field, words in parts.items()})

    for tok in cleaned.split():
        if tok == "<triplet>":
            # A new head begins; flush the relation in progress, if any.
            if parts['type']:
                emit()
                parts['type'] = []
            parts['head'] = []
            target = 'head'
        elif tok == "<subj>":
            # Same head, new tail: flush but keep the head.
            if parts['type']:
                emit()
            parts['tail'] = []
            target = 'tail'
        elif tok == "<obj>":
            parts['type'] = []
            target = 'type'
        elif target is not None:
            parts[target].append(tok)

    # Flush the trailing relation only when it is complete.
    if all(parts.values()):
        emit()
    return found

In [None]:
class KB():
    """Small knowledge base: a list of unique head/type/tail relation dicts."""

    def __init__(self):
        # Relations in insertion order; add_relation keeps them unique.
        self.relations = []

    def are_relations_equal(self, r1, r2):
        """Return True when both relations share head, type and tail."""
        for field in ("head", "type", "tail"):
            if r1[field] == r2[field]:
                continue
            return False
        return True

    def exists_relation(self, r1):
        """Return True when an equal relation is already stored."""
        for known in self.relations:
            if self.are_relations_equal(r1, known):
                return True
        return False

    def add_relation(self, r):
        """Store `r` unless an equal relation is already present."""
        if self.exists_relation(r):
            return
        self.relations.append(r)

    def print(self):
        """Print every stored relation under a "Relations:" header."""
        print("Relations:")
        for relation in self.relations:
            print(f"  {relation}")

In [None]:
# build a knowledge base from text
def from_small_text_to_kb(text, verbose=False):
    """Extract relations from a short text and collect them in a KB.

    Args:
        text: Input handed to the tokenizer (truncated at 512 tokens).
        verbose: When true, print the number of input tokens.

    Returns:
        KB: Knowledge base holding the unique relations parsed from every
        returned sequence.
    """
    # Tokenize the text
    encoded = tokenizer(
        text,
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    if verbose:
        print(f"Num tokens: {len(encoded['input_ids'][0])}")

    # Generate with three beams and keep all three sequences.
    outputs = model.generate(
        **encoded,
        max_length=216,
        length_penalty=0,
        num_beams=3,
        num_return_sequences=3,
    )
    # Special tokens are kept because the relation parser relies on the
    # <triplet>/<subj>/<obj> markers.
    predictions = tokenizer.batch_decode(outputs, skip_special_tokens=False)

    # Create the kb; KB.add_relation drops duplicates across sequences.
    kb = KB()
    for prediction in predictions:
        for relation in extract_relations_from_model_output(prediction):
            kb.add_relation(relation)

    return kb

In [None]:
# Smoke test: extract relations from one sample sentence and print them.
text = "A little kitten is playing with a dog on the floor in the house"

kb = from_small_text_to_kb(text, verbose=True)
kb.print()

In [None]:
# Trying textacy for subject-verb-object triple extraction

In [None]:
# Install textacy, tried below as an alternative way to extract subject-verb-object triples.
!pip install textacy

In [None]:
import spacy
import textacy

# Reloads the same en_core_web_sm pipeline that was loaded earlier in the notebook.
nlp = spacy.load("en_core_web_sm")

# NOTE: `text` now holds a spaCy Doc, not the raw string assigned in the REBEL demo cell.
text = nlp(u'A group of men are dancing')

# The result is consumed with list() in the next cell.
text_ext = textacy.extract.subject_verb_object_triples(text)

In [None]:
# Materialise the extracted triples so the notebook displays them.
list(text_ext)