##### imSitu images can be downloaded from here: https://prior.allenai.org/projects/imsitu

In [32]:
import json
import regex as re
import pandas as pd
import itertools
import os
from collections import Counter
from google.colab import drive

In [33]:
# Mount Google Drive at /content/drive (Colab only); the working directory
# configured for this notebook lives under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
# Folder that must already contain the following files from the imSitu dataset:
#   train.json
#   dev.json
#   test.json
#   imsitu_space.json
# Keep the trailing slash: later cells build paths by appending directly to
# this string (e.g. f"{working_dir}data/").
working_dir = '/content/drive/My Drive/deep_learning_project/imSitu/'

In [35]:
def fix_articles(sentence):
    """Make every indefinite article agree with the first letter of the next word.

    Each "a"/"an" (also the capitalised "A"/"An") that is followed by
    whitespace and a word made of ASCII letters is rewritten as "an" when the
    word starts with a, e, i, o or u and as "a" otherwise. The article keeps
    the capitalisation of its first letter, and the whitespace between article
    and word is collapsed to a single space.

    The check looks at the first *letter* only, so sound-based exceptions such
    as "an hour" or "a university" are not recognised.

    Args:
        sentence: Text to correct.

    Returns:
        The sentence with mismatched articles replaced.
    """
    # Group 1: the article ("a", "an", "A", "An"); group 2: the following word.
    pattern = r'\b([aA]n?)\s+([a-zA-Z]+)'
    vowels = 'aeiouAEIOU'

    def correct_article(match):
        article = match.group(1)
        word = match.group(2)

        fixed = 'an' if word[0] in vowels else 'a'
        # Preserve a sentence-initial capital ("A apple" -> "An apple").
        if article[0].isupper():
            fixed = fixed.capitalize()
        return fixed + ' ' + word

    return re.sub(pattern, correct_article, sentence)

In [36]:
def _load_json(file_name):
    """Parse the JSON file `file_name` located in `working_dir` and return its content."""
    # JSON is UTF-8 by specification, so do not depend on the platform default encoding.
    with open(os.path.join(working_dir, file_name), 'r', encoding='utf-8') as file:
        return json.load(file)


# Vocabulary file; later cells read its 'nouns' and 'verbs' entries.
imsitu_space_dict = _load_json('imsitu_space.json')

# Merge the three splits into one dictionary keyed by image name. The splits are
# folded in one at a time (later files win on duplicate keys) so only one split
# is held in memory next to the merged result.
imsitu_dataset_dict = {}
for split_file in ('train.json', 'dev.json', 'test.json'):
    imsitu_dataset_dict.update(_load_json(split_file))

In [None]:
def _merge_role_glosses(frames, nouns):
    """Collect, per role, the unique gloss words found across all frames.

    Args:
        frames: List of dicts mapping a role name to a noun key. A falsy noun
            key contributes no words for that role.
        nouns: Mapping from noun key to an entry holding a 'gloss' list.

    Returns:
        Dict mapping each role (in first-seen order) to the sorted list of its
        unique gloss words; the list is empty if no frame supplied a noun.
    """
    merged = {}
    for frame in frames:
        for role, noun_key in frame.items():
            glosses = nouns[noun_key]['gloss'] if noun_key else []
            merged.setdefault(role, set()).update(glosses)
    return {role: sorted(words) for role, words in merged.items()}


def _has_cross_role_duplicates(role_glosses):
    """Return True when the same word appears in the gloss lists of two or more roles."""
    counts = Counter()
    for words in role_glosses.values():
        counts.update(set(words))  # count each word at most once per role
    return any(count > 1 for count in counts.values())


def _fill_placeholders(abstract, replacements):
    """Replace every placeholder of `abstract` with its word in a single pass.

    Args:
        abstract: Template sentence containing the placeholders.
        replacements: Mapping placeholder -> replacement word.

    Returns:
        The sentence with all placeholders substituted.
    """
    placeholders = [name for name in replacements if name]
    if not placeholders:
        return abstract
    # Longest first, so a placeholder that is a prefix of another one (e.g.
    # AGENT / AGENTPART) cannot clobber the longer name. A single pass also
    # guarantees that inserted words are never re-scanned for placeholders.
    placeholders.sort(key=len, reverse=True)
    alternation = '|'.join(re.escape(name) for name in placeholders)
    return re.sub(alternation, lambda match: replacements[match.group(0)], abstract)


os.makedirs(f"{working_dir}data/", exist_ok=True)

# Regex pattern to extract the part of the image name before the first underscore
pattern = r'^[^_]+'

nouns = imsitu_space_dict['nouns']
verbs = imsitu_space_dict['verbs']

# Rows are gathered in a plain list and turned into a DataFrame once at the end;
# concatenating inside the loop would copy the whole frame on every iteration.
records = []
for image_key, annotation in imsitu_dataset_dict.items():
    group_key = re.match(pattern, image_key).group()
    abstract = verbs[group_key]['abstract']
    role_glosses = _merge_role_glosses(annotation['frames'], nouns)

    # A role without any gloss leaves its placeholder unfillable: skip the image.
    if any(not words for words in role_glosses.values()):
        continue
    # Skip images where one word is used for more than one role.
    if _has_cross_role_duplicates(role_glosses):
        continue

    # The caption uses the first word (in sorted order) of every role;
    # placeholders in the abstract are the upper-cased role names.
    first_words = {role.upper(): words[0] for role, words in role_glosses.items()}
    caption = fix_articles(_fill_placeholders(abstract, first_words))
    # NOTE: str.capitalize() also lower-cases everything after the first character.
    caption = caption.capitalize()

    records.append({
        "file_name": f"{working_dir}data/full_resized_images_dataset/of500_images_resized/{image_key}",
        "text": caption,
    })

df_sorted = pd.DataFrame(records, columns=["file_name", "text"])
df_sorted.to_csv(f"{working_dir}data/output.csv", index=False)
