In [1]:
import ast
import glob
import json
import os

import pandas as pd
from IPython.display import display

In [2]:
# Data paths
# The dataset root defaults to the original mount point. Set the LSC_DATA_DIR
# environment variable to point the notebook at another location without
# editing this cell.
data_folder_path = os.environ.get('LSC_DATA_DIR', '/mnt/4TBSSD/nvtu/LSC23_Data_Mount')
metadata_path = os.path.join(data_folder_path, 'lsc22_metadata.csv')
visual_concept_path = os.path.join(data_folder_path, 'lsc22_visual_concepts.csv')
gps_data_path = os.path.join(data_folder_path, 'vaisl_gps.csv')

# Load data
# low_memory=False makes pandas read each file in one pass instead of in
# chunks, so column dtypes are inferred from the whole column.
metadata = pd.read_csv(metadata_path, low_memory=False)
vc_data = pd.read_csv(visual_concept_path, low_memory=False)
gps_data = pd.read_csv(gps_data_path, low_memory=False)

In [3]:
# Quick look at the three raw tables: column names first, then the first rows.
raw_frames = (metadata, vc_data, gps_data)

for frame in raw_frames:
    print(frame.columns)

# The metadata preview is rendered a second time after the GPS preview.
for frame in raw_frames + (metadata,):
    display(frame.head())

Index(['Unnamed: 0', 'minute_id', 'utc_time', 'local_time', 'latitude',
       'longitude', 'altitude', 'semantic_name', 'time_zone',
       'heart_rate(bpm)', 'heart_rate_conf', 'calories', 'distance',
       'artist name', 'song name', 'album name', 'sleep_level', 'awake',
       'minutesToFallAsleep', 'minutesAsleep', 'minutesAwake',
       'minutesAfterWakeup', 'timeInBed', 'sleep_efficiency', 'ImageID'],
      dtype='object')
Index(['Unnamed: 0', 'ImageID', 'Tags', 'OCR', 'Caption', 'CaptionScore'], dtype='object')
Index(['ImageID', 'minute_id', 'stop', 'new_lat', 'new_lng', 'semantic_name',
       'foursquare_id', 'original_name', 'categories', 'parent', 'movement',
       'movement_prob', 'city', 'country', 'new_timezone'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,minute_id,utc_time,local_time,latitude,longitude,altitude,semantic_name,time_zone,heart_rate(bpm),...,album name,sleep_level,awake,minutesToFallAsleep,minutesAsleep,minutesAwake,minutesAfterWakeup,timeInBed,sleep_efficiency,ImageID
0,0,20190101_0000,,,,,,,,85.0,...,,,,0.0,299.0,52.0,5.0,351.0,90.0,
1,1,20190101_0001,,,,,,,,88.0,...,Sleep: 111 Pieces Of Classical Music For Bedtime,,,0.0,299.0,52.0,5.0,351.0,90.0,
2,2,20190101_0002,,,,,,,,89.0,...,,,,0.0,299.0,52.0,5.0,351.0,90.0,
3,3,20190101_0003,,,,,,,,88.0,...,,,,0.0,299.0,52.0,5.0,351.0,90.0,
4,4,20190101_0004,,,,,,,,84.0,...,,,,0.0,299.0,52.0,5.0,351.0,90.0,


Unnamed: 0.1,Unnamed: 0,ImageID,Tags,OCR,Caption,CaptionScore
0,0,20000101_000113_000.jpg,"wall,bathroom,indoor,toilet,tile,tiled",,a mirror and a shelf,0.215212
1,1,20000101_000145_000.jpg,"text,wall,indoor",,a room with a tv and a window,0.440993
2,2,20000101_000217_000.jpg,"text,wall,indoor",,a wall with a light on it,0.465437
3,3,20000101_000249_000.jpg,text,,a wall with a light on it,0.464277
4,4,20000101_000321_000.jpg,"text,wall,indoor",,a wall with a light on it,0.456158


Unnamed: 0,ImageID,minute_id,stop,new_lat,new_lng,semantic_name,foursquare_id,original_name,categories,parent,movement,movement_prob,city,country,new_timezone
0,20190101_103717_000.jpg,20190101_1037,True,53.38998,-6.14576,HOME,,,,,Inside,0.981373,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin
1,20190101_103749_000.jpg,20190101_1037,True,53.38998,-6.14576,HOME,,,,,Inside,0.997463,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin
2,20190101_103821_000.jpg,20190101_1038,True,53.38998,-6.14576,HOME,,,,,Inside,0.875969,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin
3,20190101_103853_000.jpg,20190101_1038,True,53.38998,-6.14576,HOME,,,,,Inside,0.998937,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin
4,20190101_103925_000.jpg,20190101_1039,True,53.38998,-6.14576,HOME,,,,,Inside,0.756499,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin


Unnamed: 0.1,Unnamed: 0,minute_id,utc_time,local_time,latitude,longitude,altitude,semantic_name,time_zone,heart_rate(bpm),...,album name,sleep_level,awake,minutesToFallAsleep,minutesAsleep,minutesAwake,minutesAfterWakeup,timeInBed,sleep_efficiency,ImageID
0,0,20190101_0000,,,,,,,,85.0,...,,,,0.0,299.0,52.0,5.0,351.0,90.0,
1,1,20190101_0001,,,,,,,,88.0,...,Sleep: 111 Pieces Of Classical Music For Bedtime,,,0.0,299.0,52.0,5.0,351.0,90.0,
2,2,20190101_0002,,,,,,,,89.0,...,,,,0.0,299.0,52.0,5.0,351.0,90.0,
3,3,20190101_0003,,,,,,,,88.0,...,,,,0.0,299.0,52.0,5.0,351.0,90.0,
4,4,20190101_0004,,,,,,,,84.0,...,,,,0.0,299.0,52.0,5.0,351.0,90.0,


In [10]:
# Remove visual-concept rows whose ImageID starts with '2000'.
# NOTE(review): presumably these are images stamped with a reset/default
# camera date (the preview above shows IDs dated 2000-01-01) — confirm.
has_2000_prefix = vc_data['ImageID'].map(lambda image_id: image_id.startswith('2000'))
fault_idx = vc_data[has_2000_prefix].index
vc_data = vc_data.drop(index=fault_idx)

In [11]:
# Keep only the rows that reference at least one image.
metadata = metadata.drop(metadata[metadata['ImageID'].isnull()].index, axis=0)
metadata = metadata.rename({
    'ImageID': 'ImageIDs'
}, axis=1)

# Split the ImageIDs into ImageID: one output row per image.
# The column is parsed with ast.literal_eval instead of eval, so text coming
# from the CSV can only produce Python literals and is never executed as code.
# NOTE(review): assumes every ImageIDs cell is the string form of a list of
# file names — confirm against the raw CSV.
metadata = (
    metadata.assign(ImageID=metadata['ImageIDs'].apply(ast.literal_eval))
    .explode('ImageID')
    .reset_index(drop=True)
)

# Drop unused columns
# gps_data also lists minute_id and semantic_name among its columns, so
# removing them here presumably avoids duplicated columns after the merge.
metadata = metadata.drop([
    'ImageIDs', 'latitude', 'longitude', 'altitude', 'minute_id', 'semantic_name',
], axis=1)
vc_data = vc_data.drop('Unnamed: 0', axis=1)
metadata = metadata.drop('Unnamed: 0', axis=1)


Unnamed: 0,utc_time,local_time,time_zone,heart_rate(bpm),heart_rate_conf,calories,distance,artist name,song name,album name,sleep_level,awake,minutesToFallAsleep,minutesAsleep,minutesAwake,minutesAfterWakeup,timeInBed,sleep_efficiency,ImageID
0,,,,116.0,2.0,5.94,2070.0,,,,,,0.0,299.0,52.0,5.0,351.0,90.0,20190101_103717_000.jpg
1,,,,116.0,2.0,5.94,2070.0,,,,,,0.0,299.0,52.0,5.0,351.0,90.0,20190101_103749_000.jpg
2,,,,114.0,2.0,5.23,2930.0,,,,,,0.0,299.0,52.0,5.0,351.0,90.0,20190101_103821_000.jpg
3,,,,114.0,2.0,5.23,2930.0,,,,,,0.0,299.0,52.0,5.0,351.0,90.0,20190101_103853_000.jpg
4,,,,119.0,1.0,5.70,350.0,,,,,,0.0,299.0,52.0,5.0,351.0,90.0,20190101_103925_000.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
723324,2020-06-30 21:18:13,2020-06-30 22:18:13,Europe/Dublin,75.0,3.0,1.40,,,,,,,0.0,370.0,18.0,0.0,388.0,95.0,20200630_211803_000.jpg
723325,2020-06-30 21:19:31,2020-06-30 22:19:31,Europe/Dublin,76.0,3.0,1.40,,,,,,,0.0,370.0,18.0,0.0,388.0,95.0,20200630_211908_000.jpg
723326,2020-06-30 21:21:09,2020-06-30 22:21:09,Europe/Dublin,76.0,3.0,1.40,0.0,,,,,,0.0,370.0,18.0,0.0,388.0,95.0,20200630_212119_000.jpg
723327,2020-06-30 21:21:09,2020-06-30 22:21:09,Europe/Dublin,76.0,3.0,1.40,0.0,,,,,,0.0,370.0,18.0,0.0,388.0,95.0,20200630_212152_000.jpg


In [12]:
# Merge based on ImageID
overall_df = pd.merge(vc_data, metadata, on='ImageID', how='outer')
overall_df = pd.merge(overall_df, gps_data, on='ImageID', how='outer')
overall_df['ImageID'] = overall_df.apply(lambda x: x['ImageID'].split('.jpg')[0], axis=1)

In [13]:
# Total number of missing cells in the merged frame:
# count them per column, then add the per-column counts together.
missing_per_column = overall_df.isna().sum()
missing_per_column.sum()

8878352

In [None]:
# Show the merged frame as this cell's output.
overall_df

In [14]:
# Key every row by its image identifier so the export becomes a mapping of
# ImageID -> record.
overall_df = overall_df.set_index(keys='ImageID')

# Serialise through pandas first (orient="index" yields
# {index: {column: value}}), then parse the JSON text back into plain Python
# dicts; missing values come back as None.
json_dict = overall_df.to_json(orient='index')
parsed = json.loads(json_dict)

In [15]:
# Show the parsed ImageID -> record mapping as this cell's output.
# NOTE(review): this renders the whole dict (723329 entries according to the
# len() cell at the end) — consider previewing a single entry instead.
parsed

{'20190101_103717_000': {'Tags': 'indoor',
  'OCR': None,
  'Caption': 'a window with a curtain',
  'CaptionScore': 0.3678438365,
  'utc_time': None,
  'local_time': None,
  'time_zone': None,
  'heart_rate(bpm)': 116.0,
  'heart_rate_conf': 2.0,
  'calories': 5.94,
  'distance': 2070.0,
  'artist name': None,
  'song name': None,
  'album name': None,
  'sleep_level': None,
  'awake': None,
  'minutesToFallAsleep': 0.0,
  'minutesAsleep': 299.0,
  'minutesAwake': 52.0,
  'minutesAfterWakeup': 5.0,
  'timeInBed': 351.0,
  'sleep_efficiency': 90.0,
  'minute_id': '20190101_1037',
  'stop': True,
  'new_lat': 53.38998,
  'new_lng': -6.1457602,
  'semantic_name': 'HOME',
  'foursquare_id': None,
  'original_name': None,
  'categories': None,
  'parent': None,
  'movement': 'Inside',
  'movement_prob': 0.981372714,
  'city': 'Dublin, Ireland, Leinster',
  'country': 'Ireland',
  'new_timezone': 'Europe/Dublin'},
 '20190101_103749_000': {'Tags': 'wall,indoor,room,furniture',
  'OCR': None,


In [16]:
# Write the ImageID -> record mapping to disk. The context manager makes sure
# the file is flushed and closed as soon as the dump finishes, instead of
# leaving the handle open until it is garbage-collected.
with open('processed_metadata.json', 'w') as output_file:
    json.dump(parsed, output_file)

In [17]:
# Number of exported records (one dict key per ImageID).
len(parsed)

723329