In [56]:
# Import libraries
import pandas as pd
from torchvision import io

# --- Configuration -----------------------------------------------------------
lifelog_dir = '../Lifelog-6'  # root directory of the Lifelog-6 dataset
target_month = '202003'       # month (YYYYMM) from which topics are built
N_data = 200                  # documents per topic

# --- Output columns, grouped by kind -----------------------------------------
# Identification / bookkeeping columns
basic_columns = [
    'topic_id',
    'dmr_minute_id',
    'input_modal',
    'output_modal',
]

# Image payload column
image_columns = [
    'image',
]

# Sensor columns
sensor_columns = [
    'heart_rate(bpm)',
    'heart_rate_conf',
    'calories',
    'distance',
    'artist name',
    'song name',
    'album name',
    'sleep_level',
    'awake',
    'minutesToFallAsleep',
    'minutesAsleep',
    'minutesAwake',
    'minutesAfterWakeup',
    'timeInBed',
    'sleep_efficiency',
]

# Location columns
location_columns = [
    'stop',
    'new_lat',
    'new_lng',
    'foursquare_id',
    'original_name',
    'categories',
    'parent',
    'movement',
    'movement_prob',
    'city',
    'country',
    'new_timezone',
    'latitude',
    'longitude',
    'altitude',
    'semantic_name',
    'time_zone',
]

# Full, ordered column list: basic, then image, then sensor, then location
target_columns = [*basic_columns, *image_columns, *sensor_columns, *location_columns]

# Load Lifelog-6 dataset
# Two tables, both keyed by 'minute_id': the metadata table and the GPS table.
df_sen = pd.read_csv(f'{lifelog_dir}/lsc22_metadata.csv', low_memory=False)
df_loc = pd.read_csv(f'{lifelog_dir}/vaisl_gps.csv', low_memory=False)
# Keep only the first GPS row per minute_id so the GPS side of the join is unique.
df_loc = df_loc.drop_duplicates(subset=['minute_id'])
# Left join onto the GPS rows; when a column name exists in both tables the GPS
# column keeps its name and the metadata column receives the '_y' suffix.
df = pd.merge(df_loc, df_sen, how='left', on='minute_id', suffixes=('', '_y'))

# Rows without an ImageID are dropped. The explicit .copy() makes the result an
# independent frame, so the column assignments below never hit SettingWithCopyWarning.
df = df.dropna(subset=['ImageID']).copy()

# ImageID is parsed as 'YYYYMMDD_HHMMSS_000.jpg' (computed once).
df['local_time'] = pd.to_datetime(df['ImageID'], format='%Y%m%d_%H%M%S_000.jpg')

# Date parts are sliced straight from the ImageID string.
df['month'] = df['ImageID'].str[0:6]   # 'YYYYMM'
df['day'] = df['ImageID'].str[6:8]     # 'DD'
df['target_day'] = df['month'] + df['day']
# Image path on disk: <lifelog_dir>/images/<YYYYMM>/<DD>/<ImageID>
df['ImageID_full'] = lifelog_dir + '/images/' + df['month'] + '/' + df['day'] + '/' + df['ImageID']

# Restrict to the target month; .copy() again so the new column is written to an
# independent frame rather than to a filtered view of the full table.
df = df[df['month'] == target_month].copy()
# Sequential document id within the month: 1-based, zero-padded to 5 characters.
df['dmr_minute_id'] = [str(k).zfill(5) for k in range(1, len(df) + 1)]

In [61]:
# Create image to sensor topics
# One topic per distinct day: a random pool of documents from that day, of which
# one is chosen as the target. The target's image (and ImageID) is written to every
# row of the topic, and the qrels mark the target's minute_id with rel=1.
img2sen_seed = 0  # base seed so the sampled pools are reproducible across runs

topic_frames = []  # one frame per topic, concatenated once after the loop
qrel_frames = []   # one relevance table per topic

for i, day in enumerate(df['day'].unique(), start=1):
    day_docs = df[df['day'] == day]
    # sample() raises ValueError when asked for more rows than exist, so the pool
    # is capped at the number of documents available for that day.
    n_docs = min(N_data, len(day_docs))
    topic = day_docs.sample(n_docs, random_state=img2sen_seed + i).copy()
    target = topic.sample(1, random_state=0).copy()
    image = io.read_image(path=target['ImageID_full'].values[0])

    topic['topic_id'] = f'img2sen_{str(i).zfill(3)}'
    topic['input_modal'] = 'image'
    topic['output_modal'] = 'sensor'

    # Every row of the topic carries the target's image as the query.
    topic['ImageID'] = target['ImageID'].values[0]
    topic['image'] = [image.numpy()] * len(topic)

    topic_frames.append(topic)

    # Relevance judgements are built on a separate frame so that 'rel' does not
    # leak into the topic data.
    qrel = topic[['topic_id', 'dmr_minute_id', 'minute_id']].copy()
    qrel['rel'] = (qrel['minute_id'] == target['minute_id'].values[0]).astype('int64')
    qrel_frames.append(qrel)

# Single concat instead of growing the frames inside the loop; fall back to an
# empty frame when there were no days to process.
df_topics = pd.concat(topic_frames) if topic_frames else pd.DataFrame()  # For topic data
df_qrels = pd.concat(qrel_frames) if qrel_frames else pd.DataFrame()  # For qrel data

# Create sensor to image topics
# One topic per distinct day: a random pool of documents from that day, of which
# one is chosen as the target. The target's sensor and location values are written
# to every row of the topic, each row keeps its own image, and the qrels mark the
# target's minute_id with rel=1.
sen2img_seed = 1000  # base seed so the sampled pools are reproducible across runs

topic_frames = []  # one frame per topic, appended to df_topics once after the loop
qrel_frames = []   # one relevance table per topic

for i, day in enumerate(df['day'].unique(), start=1):
    day_docs = df[df['day'] == day]
    # sample() raises ValueError when asked for more rows than exist, so the pool
    # is capped at the number of documents available for that day.
    n_docs = min(N_data, len(day_docs))
    # .copy() so the column assignments below write to an independent frame.
    topic = day_docs.sample(n_docs, random_state=sen2img_seed + i).copy()
    target = topic.sample(1, random_state=0).copy()

    topic['topic_id'] = f'sen2img_{str(i).zfill(3)}'
    topic['input_modal'] = 'sensor'
    topic['output_modal'] = 'image'

    # Every row of the topic carries the target's sensor/location values as the query.
    for col in sensor_columns + location_columns:
        topic[col] = target[col].values[0]

    # Each row keeps its own image, loaded from disk.
    topic['image'] = [io.read_image(path=image_path).numpy() for image_path in topic['ImageID_full']]

    topic_frames.append(topic)

    # Relevance judgements are built on a separate frame so that 'rel' does not
    # leak into the topic data.
    qrel = topic[['topic_id', 'dmr_minute_id', 'minute_id']].copy()
    qrel['rel'] = (qrel['minute_id'] == target['minute_id'].values[0]).astype('int64')
    qrel_frames.append(qrel)

# Append everything in one concat instead of growing the frames inside the loop.
if topic_frames:
    df_topics = pd.concat([df_topics, *topic_frames])
    df_qrels = pd.concat([df_qrels, *qrel_frames])

In [62]:
# Write the generated benchmark to disk:
#   - topics, restricted to target_columns, as a JSON array of records
#   - relevance judgements as CSV without the index column
df_topics.loc[:, target_columns].to_json(path_or_buf='../input/input_test.json', orient='records')
df_qrels.to_csv(path_or_buf='../input/qrels_test.csv', index=False)