In [None]:
import pandas as pd
import numpy as np

In [None]:
# pd.set_option('max_colwidth',1000)
# pd.set_option('max_seq_items','none')
def _read_str_csv(path):
    """Read a CSV with every cell kept as the text found in the file.

    ``dtype=str`` stops pandas from inferring numeric types, so an id column
    is never routed through float (which would render 12 as '12.0').
    ``keep_default_na=False`` keeps blank cells as '' rather than NaN, which
    a later ``astype(str)`` would have turned into the literal text 'nan'.
    """
    return pd.read_csv(path, dtype=str, keep_default_na=False)


# read csvs as data frames
images = _read_str_csv('files/imagesonline.csv')
characters_images = _read_str_csv('files/characters_images.csv')
kashira_images = _read_str_csv('files/kashira_images.csv')
performances_images = _read_str_csv('files/performances_images.csv')
performers_images = _read_str_csv('files/performers_images.csv')
plays_images = _read_str_csv('files/plays_images.csv')
productions_images = _read_str_csv('files/productions_images.csv')
scenes_images = _read_str_csv('files/scenes_images.csv')
tags_images = _read_str_csv('files/tags_images.csv')

In [None]:
# Remove the 'online' column. The axis is named via ``columns=`` because
# passing it positionally (``drop('online', 1)``) is rejected by pandas >= 2.0.
images = images.drop(columns='online')
len(images.index)

In [None]:
def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while updating a notebook progress widget.

    Parameters
    ----------
    sequence : iterable
        Items to pass through unchanged.
    every : int, optional
        Refresh the widget every ``every`` items. When the total size is
        known and ``every`` is omitted, it is 1 for up to 200 items and
        ``int(size / 200)`` otherwise. Required when the size is unknown.
    size : int, optional
        Total number of items; defaults to ``len(sequence)`` when available.
    name : str
        Label prefix shown next to the counter.

    Yields
    ------
    Each element of ``sequence``, in order.

    Raises
    ------
    AssertionError
        If the size cannot be determined and ``every`` was not given.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Objects without len() are treated as open-ended iterators.
    unknown_length = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unknown_length = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    if unknown_length:
        # No total to count towards: show a full bar in the 'info' style.
        bar = IntProgress(min=0, max=1, value=1)
        bar.bar_style = 'info'
    else:
        bar = IntProgress(min=0, max=size, value=0)
    caption = HTML()
    display(VBox(children=[caption, bar]))

    count = 0
    try:
        for count, item in enumerate(sequence, 1):
            refresh_due = count == 1 or count % every == 0
            if refresh_due and unknown_length:
                caption.value = '{name}: {index} / ?'.format(name=name, index=count)
            elif refresh_due:
                bar.value = count
                caption.value = u'{name}: {index} / {size}'.format(
                    name=name, index=count, size=size
                )
            yield item
    except BaseException:
        # Any failure (including the consumer closing the generator early)
        # turns the bar red before the exception propagates.
        bar.bar_style = 'danger'
        raise
    else:
        bar.bar_style = 'success'
        bar.value = count
        caption.value = "{name}: {index}".format(name=name, index=str(count or '?'))

In [None]:
def multiValDataFrame(df, table_id, multival_id):
    """Collapse ``df`` to one row per ``table_id``, gathering ``multival_id`` into a list.

    For each distinct ``table_id`` value (in order of first appearance) the
    result row takes every other column from the FIRST matching row of ``df``
    and stores all of that id's ``multival_id`` values as a Python list.
    Columns whose first-row value has an empty string representation are left
    out of that record and therefore appear as NaN in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame that may hold several rows per ``table_id``.
    table_id : str
        Column identifying the entity to collapse on.
    multival_id : str
        Column whose values are collected into a list per entity.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``table_id``; empty if ``df`` has no rows.

    Notes
    -----
    Records are accumulated in a list and the frame is built once, because
    ``DataFrame.append`` no longer exists in pandas >= 2.0 and was quadratic
    when called in a loop. Rows are grouped in a single ``groupby`` pass
    instead of re-filtering the whole frame for every id. Rows whose
    ``table_id`` is NaN are skipped by ``groupby``; the previous row-filter
    implementation raised IndexError on them.
    """
    columns = list(df.columns)
    # sort=False keeps groups in order of first appearance, like unique().
    grouped = df.groupby(table_id, sort=False)

    records = []
    for _, group in log_progress(grouped, size=grouped.ngroups):
        record = {}
        for col in columns:
            first_value = group[col].values[0]
            # Skip values that render as an empty string (e.g. '').
            if len(str(first_value)) > 0:
                record[col] = first_value
        record[multival_id] = list(group[multival_id])
        records.append(record)

    return pd.DataFrame(records)

In [None]:
# Left-join characters_images onto images by image_id; images without a
# match keep their row with missing values in the joined columns.
images = images.merge(characters_images, how='left', on='image_id')

In [None]:
# Back to one row per image, with its character_id values gathered in a list.
images = multiValDataFrame(df=images, table_id='image_id', multival_id='character_id')

In [None]:
# Preview the first ten rows.
images.head(10)

In [None]:
# Left-join kashira_images onto images by image_id, then report the row count.
images = images.merge(kashira_images, how='left', on='image_id')
len(images)

In [None]:
# One row per image again, with kashira_id values gathered in a list.
images = multiValDataFrame(df=images, table_id='image_id', multival_id='kashira_id')
len(images)

In [None]:
# Left-join performances_images onto images by image_id, then report the row count.
images = images.merge(performances_images, how='left', on='image_id')
len(images)

In [None]:
# One row per image again, with performance_id values gathered in a list.
images = multiValDataFrame(df=images, table_id='image_id', multival_id='performance_id')
len(images)

In [None]:
# Left-join performers_images onto images by image_id, then report the row count.
images = images.merge(performers_images, how='left', on='image_id')
len(images)

In [None]:
# One row per image again, with performer_id values gathered in a list.
images = multiValDataFrame(df=images, table_id='image_id', multival_id='performer_id')
len(images)

In [None]:
# Left-join plays_images onto images by image_id, then report the row count.
images = images.merge(plays_images, how='left', on='image_id')
len(images)

In [None]:
# One row per image again, with play_id values gathered in a list.
images = multiValDataFrame(df=images, table_id='image_id', multival_id='play_id')
len(images)

In [None]:
# Left-join productions_images onto images by image_id, then report the row count.
images = images.merge(productions_images, how='left', on='image_id')
len(images)

In [None]:
# One row per image again, with production_id values gathered in a list.
images = multiValDataFrame(df=images, table_id='image_id', multival_id='production_id')
len(images)

In [None]:
# Left-join scenes_images onto images by image_id, then report the row count.
images = images.merge(scenes_images, how='left', on='image_id')
len(images)

In [None]:
# One row per image again, with scene_id values gathered in a list.
images = multiValDataFrame(df=images, table_id='image_id', multival_id='scene_id')
len(images)

In [None]:
# Left-join tags_images onto images by image_id, then report the row count.
images = images.merge(tags_images, how='left', on='image_id')
len(images)

In [None]:
# One row per image again; the tag values are collected from the
# 'subject_id' column.
images = multiValDataFrame(df=images, table_id='image_id', multival_id='subject_id')
len(images)

In [None]:
# Expose the collected 'subject_id' lists under the name 'tag_id'.
# Rebinding the result (rather than inplace=True) avoids mutating a frame
# that earlier cells have already displayed.
images = images.rename(columns={'subject_id': 'tag_id'})

In [None]:
# Keep only the export columns, in their final order (a missing column
# raises KeyError).
images = images.loc[:, [
    'image_id',
    'media_type',
    'character_id',
    'tag_id',
    'kashira_id',
    'performance_id',
    'performer_id',
    'play_id',
    'production_id',
    'scene_id',
    'container_number',
    'container_type',
    'creator',
    'item_id',
    'colser_id',
    'notes_on_item',
    'objid',
    'sequence_number',
    'series_number',
    'slidepage_folder_no',
]]

In [None]:
# Preview the first ten rows of the reordered frame.
images.head(10)

In [None]:
# Replace every remaining missing value with an empty string, then display.
images = images.fillna(value='')
images

In [None]:
# Number of distinct image ids in the final frame.
len(pd.unique(images['image_id']))

In [None]:
# Write the frame to CSV without the index column; missing values are
# written as empty strings.
csvname = 'images_exp.csv'
images.fillna(value='').to_csv(path_or_buf=csvname, index=False, encoding='utf8')

In [None]:
# Write the frame as a JSON array of row objects, keeping non-ASCII
# characters unescaped.
jsonname = 'images_exp.json'
images.to_json(path_or_buf=jsonname, force_ascii=False, orient='records')

In [None]:
# Final number of rows exported.
len(images)