# Combine Pickled Embeddings and Image ID CSVs

__Date__: 21 March 2023

__Description__: The PhoMemes 2023 Challenge 2 on screenshots requires creating a run-file CSV that maps each image ID to a label indicating whether that image contains a screenshot. Here, we provide scaffolding for joining the provided embedding files with the per-actor CSV index files.

In [None]:
import os
import glob
import pickle
import pandas as pd

In [None]:
# First, download the encoded_pickle_indices.tgz archive, which contains CSV
# files for every actor in the testing dataset. Each row of a CSV contains an
# index and an image ID, so you can map a row of that actor's embedding matrix
# to a specific image.
#
# This file is available in Google Drive:
#   https://drive.google.com/file/d/1a_7OMeSXe8I7U2PfTd6KAgQIQp4sC_xy/view?usp=share_link
#
# Directory holding the per-actor index CSVs; the join loop looks up
# "<user_id>.csv" inside it. Presumably this is where the archive was
# extracted, relative to the working directory -- adjust if yours differs.
embedding_indices_path = "encoded_pickle_indices"

In [None]:
# Now download all 16 of the `*.partition.tar` files from Google Drive.
# Each partition contains separate hashed accounts and pickle files containing
# matrices of dense embeddings, where each row corresponds to a particular image.
#
# These files are available in Google Drive:
#   https://drive.google.com/drive/folders/1h8kNnNwN31NPuA-j-6aWih5WSZGU687q?usp=share_link


def extract_user_id(pickle_path):
    """Return the user ID encoded in an embedding pickle's file name.

    Embedding files are named ``<user_id>-effnet1.pickle``. Only the final
    path component is inspected, so the result does not depend on how deeply
    the embeddings directory is nested or on the platform's path separator.

    Args:
        pickle_path: Path to an embedding pickle, as yielded by ``glob``.

    Returns:
        The part of the file name that precedes the first ``"-"``.
    """
    return os.path.basename(pickle_path).partition("-")[0]


# Sort the matches so users are processed in a reproducible order; glob itself
# returns files in arbitrary (filesystem-dependent) order.
for f in sorted(glob.iglob("sample_embeddings/*-effnet1.pickle")):

    # Get the user ID from the file name
    user_id = extract_user_id(f)

    # Match user ID to a CSV in the encoded pickle indices
    this_index_path = os.path.join(embedding_indices_path, user_id + ".csv")
    this_index_df = pd.read_csv(this_index_path, index_col="index")

    # Load the dense embeddings.
    # NOTE: unpickling can execute arbitrary code -- only load pickle files
    # obtained from the official challenge download location.
    with open(f, "rb") as in_file:
        this_user_embeddings = pickle.load(in_file)

    # Now join embeddings and image IDs, leaving a dataframe indexed by image
    # ID whose rows are the embeddings. This relies on the CSV rows being in
    # the same order as the rows of the embedding matrix.
    this_user_embeddings_df = pd.DataFrame(this_user_embeddings, index=this_index_df["img_id"])

    # Do whatever processing here or accumulate all users' embeddings