# Main

In [None]:
# Notebook configuration: which project is processed, where its CSV/JSON
# files live, and which issue columns are concatenated into one text field.
PROJECT_NAME = "FLUME"
DATA_DIR = "data/FLUME"
TEXT_FEATURES = [
    "title",
    "description",
    "summary",
]

In [None]:
import pandas as pd
import json
import numpy as np
import nltk
from tqdm import tqdm
import spacy
# Load the small English spaCy pipeline once; the scene-graph functions
# defined later in this notebook call this shared module-level `nlp` object.
nlp = spacy.load("en_core_web_sm")
from nltk.tokenize import word_tokenize
# Fetch the NLTK 'punkt' tokenizer data (the captured cell output shows the
# package being downloaded and unzipped).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Train

In [None]:
def create_train_scene_graph(text, word2index):
    """Extract (subject, verb, object) triples from ``text`` and grow the vocabulary.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Every
    non-punctuation token whose text is not yet a key of ``word2index`` is
    added with the next free index (``len(word2index)``).

    Within each sentence the function remembers:
      * the subject: a token whose dependency label contains ``"subj"`` but
        is not ``"nsubjpass"``;
      * the object: a token whose dependency label contains ``"obj"``;
      * the verb: the token whose dependency label contains ``"ROOT"``.
    Later matches overwrite earlier ones, and a triple is emitted only when
    all three slots are non-empty.

    Args:
        text: Raw text to parse.
        word2index: Dict mapping token text to an integer index. It is
            updated in place.

    Returns:
        A ``(scene_graph, word2index)`` tuple. ``scene_graph`` is a list of
        ``(subject, verb, object)`` string tuples, at most one per sentence;
        ``word2index`` is the same dict object that was passed in.
    """
    doc = nlp(text)
    scene_graph = []

    for sentence in doc.sents:
        subject = ""
        verb = ""
        obj = ""  # named `obj` so the builtin `object` is not shadowed
        for token in sentence:
            dep = token.dep_
            if dep == 'punct':
                continue
            # The vocabulary is keyed by token *text*. Testing the Token
            # object itself against the dict never matches a string key, so
            # repeated words would be re-assigned to len(word2index) and end
            # up sharing an index with the next new word.
            word = token.text
            if word not in word2index:
                word2index[word] = len(word2index)
            if "subj" in dep and dep != 'nsubjpass':
                subject = word
            elif "obj" in dep:
                obj = word
            elif "ROOT" in dep:
                verb = word
        if subject and verb and obj:
            scene_graph.append((subject, verb, obj))
    return scene_graph, word2index

In [None]:
# Read the training issues and replace missing cells with a single space so
# that the string concatenation below never meets a NaN.
issues = pd.read_csv(f"{DATA_DIR}/train_issues.csv").fillna(" ")

# Build the combined "text" column: start from the first configured feature
# and append each remaining one, separated by ". ".
first_feature = TEXT_FEATURES[0]
issues["text"] = issues[first_feature]
for feature in TEXT_FEATURES[1:]:
    issues["text"] = issues["text"] + ". " + issues[feature]

In [None]:
# Parallel arrays: the combined text of each issue and the identifier stored
# in the CSV's unnamed first column ("Unnamed: 0").
texts, keys = issues["text"].values, issues["Unnamed: 0"].values

In [None]:
# Start the vocabulary with the four reserved special tokens; every training
# text then extends it while its scene graph is extracted.
word2index = {'<pad>': 0, '<unk>': 1, '<start>': 2, '<end>': 3}
scene_graphs = []
for text in tqdm(texts):
    scene_graph, word2index = create_train_scene_graph(text, word2index)
    record = {"text": text, "rels": scene_graph}
    scene_graphs.append(record)

100%|██████████| 2521/2521 [01:01<00:00, 40.91it/s]


In [None]:
# Key each scene-graph record by its issue identifier.
# `keys` comes from `.values`, so integer ids would be numpy.int64 scalars,
# which json.dump rejects as dict keys (TypeError). `.tolist()` converts them
# to native Python values and leaves string ids unchanged.
# NOTE: this rebinds `scene_graphs` from a list to a dict, so re-run the cell
# that builds the list before re-running this one.
scene_graphs = dict(zip(keys.tolist(), scene_graphs))

In [None]:
# Persist the training scene graphs as JSON inside the data directory.
train_graphs_path = f"{DATA_DIR}/train_scene_graphs.json"
with open(train_graphs_path, "w") as out_file:
    json.dump(scene_graphs, out_file)

In [None]:
# Persist the vocabulary built from the training texts; the test section of
# this notebook reads this file back.
vocab_path = f"{DATA_DIR}/word2index.json"
with open(vocab_path, "w") as out_file:
    json.dump(word2index, out_file)

# Test

In [None]:
def create_test_scene_graph(text, word2index):
    """Extract (subject, verb, object) triples from ``text`` using a fixed vocabulary.

    The text is parsed with the module-level spaCy pipeline ``nlp``.
    Punctuation tokens and tokens whose text is not a key of ``word2index``
    are skipped, so only in-vocabulary words can fill a slot. ``word2index``
    is read but never modified.

    Within each sentence the function remembers:
      * the subject: a token whose dependency label contains ``"subj"`` but
        is not ``"nsubjpass"``;
      * the object: a token whose dependency label contains ``"obj"``;
      * the verb: the token whose dependency label contains ``"ROOT"``.
    Later matches overwrite earlier ones, and a triple is emitted only when
    all three slots are non-empty.

    Args:
        text: Raw text to parse.
        word2index: Dict mapping known token text to an integer index.

    Returns:
        A list of ``(subject, verb, object)`` string tuples, at most one per
        sentence.
    """
    doc = nlp(text)
    scene_graph = []

    for sentence in doc.sents:
        subject = ""
        verb = ""
        obj = ""  # named `obj` so the builtin `object` is not shadowed
        for token in sentence:
            dep = token.dep_
            if dep == 'punct':
                continue
            # The vocabulary is keyed by token *text*. Testing the Token
            # object itself against the dict never matches a string key, so
            # every token would be skipped and the result would always be
            # empty.
            word = token.text
            if word not in word2index:
                continue
            if "subj" in dep and dep != 'nsubjpass':
                subject = word
            elif "obj" in dep:
                obj = word
            elif "ROOT" in dep:
                verb = word
        if subject and verb and obj:
            scene_graph.append((subject, verb, obj))
    return scene_graph

In [None]:
# Read the test issues and replace missing cells with a single space so that
# the string concatenation below never meets a NaN.
issues = pd.read_csv(f"{DATA_DIR}/test_issues.csv").fillna(" ")

# Build the combined "text" column: start from the first configured feature
# and append each remaining one, separated by ". ".
first_feature = TEXT_FEATURES[0]
issues["text"] = issues[first_feature]
for feature in TEXT_FEATURES[1:]:
    issues["text"] = issues["text"] + ". " + issues[feature]

In [None]:
# Parallel arrays: the combined text of each issue and the identifier stored
# in the CSV's unnamed first column ("Unnamed: 0").
texts, keys = issues["text"].values, issues["Unnamed: 0"].values

In [None]:
# Reload the vocabulary saved by the training section, then extract a scene
# graph for every test text against that fixed vocabulary.
with open(f"{DATA_DIR}/word2index.json", 'r') as file:
    word2index = json.load(file)

scene_graphs = []
for text in tqdm(texts):
    scene_graph = create_test_scene_graph(text, word2index)
    record = {"text": text, "rels": scene_graph}
    scene_graphs.append(record)

100%|██████████| 934/934 [00:32<00:00, 28.76it/s]


In [None]:
# Key each scene-graph record by its issue identifier.
# `keys` comes from `.values`, so integer ids would be numpy.int64 scalars,
# which json.dump rejects as dict keys (TypeError). `.tolist()` converts them
# to native Python values and leaves string ids unchanged.
# NOTE: this rebinds `scene_graphs` from a list to a dict, so re-run the cell
# that builds the list before re-running this one.
scene_graphs = dict(zip(keys.tolist(), scene_graphs))

In [None]:
# Persist the test scene graphs as JSON inside the data directory.
test_graphs_path = f"{DATA_DIR}/test_scene_graphs.json"
with open(test_graphs_path, "w") as out_file:
    json.dump(scene_graphs, out_file)