In [1]:
import pandas as pd

In [2]:
# Read both CSVs with every column typed as string.
# dtype=str stops pandas from parsing values before the cast: with a post-hoc
# .astype(str), an integer id column containing a blank is read as float and
# ends up as '7.0', which would silently break the join on tag_id.
# keep_default_na=False keeps blank cells as '' (and values such as 'NA' or
# 'None' as written) instead of turning them into NaN / the literal string 'nan'.

tags = pd.read_csv('files/tags.csv', dtype=str, keep_default_na=False)
tags_imgs = pd.read_csv('files/tags_images.csv', dtype=str, keep_default_na=False)

In [3]:
# mild cleanup: strip line breaks out of the notes
# [\r\n] also removes the carriage return of Windows-style (\r\n) line endings,
# which a plain \n pattern would leave behind in the text
tags['notes'] = tags['notes'].replace(r'[\r\n]', '', regex=True)

In [4]:
# multiValDataFrame takes a dataframe df, a column name df_index and a column name multival_key.
# It merges records that share the same df_index value and replaces multival_key with a list of all
# the multival_key values found for that df_index value.
# E.g., for a data frame of plays with multiple character_id values per play_id, it returns a data frame
# with one record per play_id and one list of character_ids per play.

def multiValDataFrame(df, df_index, multival_key, dropna=False):
    """Collapse rows that share a ``df_index`` value into a single record.

    For every distinct value of ``df[df_index]`` (in order of first
    appearance) the result holds one row: each column keeps the value from
    the first matching row, except ``multival_key``, which becomes a list of
    that column's values across all matching rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame, possibly with several rows per ``df_index`` value.
    df_index : str
        Name of the column that identifies a record.
    multival_key : str
        Name of the column whose values are gathered into a list.
    dropna : bool, default False
        If True, missing values are left out of the gathered lists, so a
        record without any real value gets ``[]`` instead of ``[nan]``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``df_index`` value, with the same columns in the
        same order as ``df``. An empty input gives an empty frame that still
        carries those columns.
    """
    cols = list(df.columns)
    records = []

    # sort=False keeps the groups in order of first appearance;
    # dropna=False keeps rows whose key is missing together as one group
    # instead of silently discarding them.
    for _, group in df.groupby(df_index, sort=False, dropna=False):
        # Take each column from the first row of the group, one column at a
        # time so values are not coerced to a common dtype.
        record = {col: group[col].iloc[0] for col in cols}

        multi_vals = group[multival_key]
        if dropna:
            multi_vals = multi_vals.dropna()
        record[multival_key] = list(multi_vals)

        records.append(record)

    # Build the frame once from all records: DataFrame.append was removed in
    # pandas 2.0, and growing a frame row by row copies it on every iteration.
    return pd.DataFrame(records, columns=cols)

In [5]:
# Left-join tags_imgs onto the tag dataframe by tag_id (tags without a match are kept),
# then collapse the joined rows so that each tag_id has a single record
# whose image_id holds the list of all its image ids.
tags = multiValDataFrame(
    tags.merge(tags_imgs, how='left', on='tag_id'),
    'tag_id',
    'image_id',
)

In [6]:
# Write the combined tag records to a CSV file, leaving out the row index.
tags.to_csv(path_or_buf='tags_exp.csv', index=False, encoding='utf8')

In [7]:
# Write the combined tag records to a JSON file: one object per record,
# with non-ASCII characters written as-is rather than escaped.
tags.to_json(path_or_buf='tags_exp.json', force_ascii=False, orient='records')