##### imSitu images can be downloaded from here: https://prior.allenai.org/projects/imsitu

In [10]:
!pip install transformers
!pip install spacy-transformers



In [11]:
# Download the English transformer pipeline that is loaded later with
# spacy.load("en_core_web_trf"). The installer output recommends restarting
# the runtime afterwards so that the new package's dependencies are reloaded.
!python -m spacy download en_core_web_trf

  _torch_pytree._register_pytree_node(
Collecting en-core-web-trf==3.7.3
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [12]:
import json
import regex as re
import pandas as pd
import itertools
import os
from collections import Counter
from google.colab import drive
import spacy
import spacy_transformers

In [13]:
# Load spaCy's transformer-based English pipeline. identify_roles() runs it on
# each caption and reads the dependency labels and part-of-speech tags it assigns.
nlp = spacy.load("en_core_web_trf")

  model.load_state_dict(torch.load(filelike, map_location=device))


In [14]:
# Mount Google Drive at /content/drive; working_dir (defined below) lives under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
# Folder on the mounted Drive that holds the imSitu annotation files.
# It must contain the following files (from the imSitu dataset):
#   train.json
#   dev.json
#   test.json
#   imsitu_space.json
# The generated captions also reference images under
# <working_dir>/of500_images_resized/.
working_dir = '/content/drive/My Drive/deep_learning_project/submition/data/imSitu'

In [16]:
def fix_articles(sentence):
    """Make every lowercase "a"/"an" agree with the word that follows it.

    An article followed by a word whose first letter is one of
    ``aeiouAEIOU`` becomes "an"; otherwise it becomes "a". The decision is
    based on spelling only, not on pronunciation. Capitalised articles
    ("A", "An") are not matched and are left untouched. Any run of
    whitespace between the article and the word is collapsed to one space.

    Args:
        sentence: Text to correct.

    Returns:
        The corrected text.
    """
    def _agree(found):
        # Group 2 is the alphabetic word right after the article.
        following = found.group(2)
        chosen = 'an' if following[0] in 'aeiouAEIOU' else 'a'
        return chosen + ' ' + following

    return re.sub(r'\b(a|an)\s+([a-zA-Z]+)', _agree, sentence)

def get_full_phrase(token):
    """Return the text of `token` together with its direct left children.

    Only the immediate left children (``token.lefts``) are included, not
    their own descendants. The pieces are ordered by their position in the
    sentence (``.i``) and joined with single spaces.
    """
    in_sentence_order = sorted([token, *token.lefts], key=lambda member: member.i)
    return ' '.join(member.text for member in in_sentence_order)

def identify_roles(sentence):
    """Split a sentence into agent, action, patient and leftover words.

    The sentence is parsed with the module-level ``nlp`` pipeline and every
    token is classified once, with this precedence:

      * dependency label "nsubj" -> agent (the token plus its left children)
      * dependency label "dobj"  -> patient (the token plus its left children)
      * part of speech "VERB"    -> action (the token text only)
      * anything else            -> candidate for the remainder

    If several tokens qualify for the same role, the last one determines the
    returned string.

    Returns:
        A tuple ``(agent, action, patient, remainder)``. The first three are
        strings, empty when that role was not found. ``remainder`` is a list
        of the leftover token texts in sentence order, excluding tokens that
        belong to any subject or object phrase and excluding "." tokens.
    """
    parsed = nlp(sentence)

    agent, action, patient = "", "", ""
    subject_span, object_span = set(), set()
    verb_token = None
    leftovers = []

    for tok in parsed:
        label = tok.dep_
        if label == "nsubj":
            agent = get_full_phrase(tok)
            subject_span.update([tok, *tok.lefts])
        elif label == "dobj":
            patient = get_full_phrase(tok)
            object_span.update([tok, *tok.lefts])
        elif tok.pos_ == "VERB":
            action, verb_token = tok.text, tok
        else:
            leftovers.append(tok)

    # Left children of a subject/object are visited before their head, so they
    # land in `leftovers` first and have to be filtered out afterwards.
    claimed = subject_span | object_span
    remainder = [
        tok.text
        for tok in leftovers
        if tok not in claimed and tok != verb_token and tok.text != '.'
    ]

    return agent, action, patient, remainder

def switch_roles(sentence):
    """Rewrite `sentence` with its agent and patient phrases swapped.

    The result has the form "<Patient> <verb> <agent>[ <remainder>]." where
    the patient phrase is passed through ``str.capitalize``, the agent phrase
    is lowercased, and an "s" is appended to the verb unless it already ends
    in "s".

    Returns:
        The role-switched sentence, or the string "Failed" when
        identify_roles() finds no agent or no patient.
    """
    agent, action, patient, remainder = identify_roles(sentence)

    if not (agent and patient):
        return "Failed"

    verb = action if action.endswith('s') else action + 's'
    pieces = [patient.capitalize(), verb, agent.lower()]

    # Leftover words, if any, are appended after the swapped core.
    trailing = ' '.join(remainder).strip()
    if trailing:
        pieces.append(trailing)

    return ' '.join(pieces) + '.'


In [17]:
# Vocabulary file: accessed later through its 'nouns' and 'verbs' entries.
with open(os.path.join(working_dir, 'imsitu_space.json'), 'r') as file:
    imsitu_space_dict = json.load(file)

# Merge the three annotation splits into one dict keyed by image name.
# Later splits override earlier ones on a key clash (train < dev < test).
imsitu_dataset_dict = {}
for split_file in ('train.json', 'dev.json', 'test.json'):
    with open(os.path.join(working_dir, split_file), 'r') as file:
        imsitu_dataset_dict.update(json.load(file))

In [22]:
def _first_description(frames, abstract, nouns):
    """Fill a verb's abstract template with one gloss per role.

    Args:
        frames: List of dicts mapping a role name to a noun key (a falsy
            noun key means the role was left unfilled in that frame).
        abstract: Template sentence in which each role appears as its
            upper-cased name.
        nouns: Mapping from noun key to a dict with a 'gloss' list.

    Returns:
        The template with every placeholder replaced by the alphabetically
        first gloss collected for that role, or None when the image is
        filtered out: some role has no gloss in any frame, or the same gloss
        word occurs under more than one role.
    """
    # Union of gloss words per role across all frames.
    role_glosses = {}
    for frame in frames:
        for role, noun_key in frame.items():
            glosses = nouns[noun_key]['gloss'] if noun_key else []
            role_glosses.setdefault(role, set()).update(glosses)

    # A role that was never filled leaves nothing to substitute.
    if not all(role_glosses.values()):
        return None

    # Skip images in which one word is used for two different roles.
    seen = set()
    for glosses in role_glosses.values():
        if not seen.isdisjoint(glosses):
            return None
        seen |= glosses

    fillers = {role.upper(): min(glosses) for role, glosses in role_glosses.items()}
    if not fillers:
        return abstract

    # Substitute all placeholders in one pass, longest name first, so that a
    # placeholder which is a prefix of another (e.g. X and XPART) cannot
    # corrupt the longer one, and inserted text is never re-scanned.
    longest_first = sorted(fillers, key=len, reverse=True)
    placeholder_re = re.compile('|'.join(re.escape(name) for name in longest_first))
    return placeholder_re.sub(lambda found: fillers[found.group(0)], abstract)


imSitu_filtering_output_filter = f"{working_dir}/../../output/imSitu_filtering_output"
os.makedirs(imSitu_filtering_output_filter, exist_ok=True)

# Regex pattern to extract the part before the underscore (the verb name)
pattern = r'^[^_]+'

# Collect rows in a plain list and build the DataFrame once at the end;
# concatenating inside the loop copies the whole frame on every iteration.
caption_rows = []
for image_key, annotation in imsitu_dataset_dict.items():
    group_key = re.match(pattern, image_key).group()
    sentence = _first_description(
        annotation['frames'],
        imsitu_space_dict['verbs'][group_key]['abstract'],
        imsitu_space_dict['nouns'],
    )
    if sentence is None:
        continue

    caption = fix_articles(sentence).capitalize()
    caption_rows.append(
        {"file_name": f"{working_dir}/of500_images_resized/{image_key}", "text": caption}
    )

df_sorted = pd.DataFrame(caption_rows, columns=["file_name", "text"])

# Persist the intermediate result so the next step can restart from disk.
df_sorted.to_csv(os.path.join(imSitu_filtering_output_filter, 'output.csv'), index=False)



0
1
2
3
4
5
6
7
8
9
10
11


In [23]:
# Reload the captions saved by the previous step and add a role-switched
# version of each one; rows that switch_roles() could not rewrite are dropped.
captions_csv = os.path.join(imSitu_filtering_output_filter, 'output.csv')
reversed_csv = os.path.join(imSitu_filtering_output_filter, "output_with_reverse.csv")

df_sorted = pd.read_csv(captions_csv)
df_sorted['reverse_text'] = df_sorted['text'].apply(switch_roles)
df_sorted = df_sorted[df_sorted['reverse_text'] != 'Failed']
df_sorted.to_csv(reversed_csv, index=False)

  with torch.cuda.amp.autocast(self._mixed_precision):
