In [1]:
import pandas as pd

In [2]:
# Load every input CSV as a DataFrame and cast all columns to str,
# so later steps never hit float/NaN type errors.
# The targets on the left line up one-to-one with the paths on the right.
(
    images,
    characters_images,
    kashira_images,
    performances_images,
    performers_images,
    plays_images,
    productions_images,
    scenes_images,
    tags_images,
) = (
    pd.read_csv(csv_path).astype(str)
    for csv_path in (
        'in/imagesonline.csv',
        'in/characters_images.csv',
        'in/kashira_images.csv',
        'in/performances_images.csv',
        'in/performers_images.csv',
        'in/plays_images.csv',
        'in/productions_images.csv',
        'in/scenes_images.csv',
        'in/tags_images.csv',
    )
)

In [3]:
# Drop the 'online' column. `axis` is passed by keyword because pandas 2.0
# no longer accepts it positionally (drop('online', 1) raises TypeError there).
images = images.drop('online', axis=1)

In [4]:
# Swap each listed name for a numeric-string id (presumably creator ids - confirm).
# No column is specified, so matching cell values are replaced anywhere in the frame.
images = images.replace(
    to_replace={
        'Barbara C. Adachi': '1',
        'Fukuda Fumio': '2',
        'Harri Peccinotti': '3',
        'M. Arai': '4',
        'Toyotake Komatsudayū II': '5',
        'Unknown. Photo: Columbia University Libraries': '6',
    }
)


In [5]:
# Optional function for IPy ProgressBar 
# (since multiValDataFrame can take awhile on large frames)

def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while updating a notebook progress widget.

    This is a generator: nothing is displayed or consumed until it is iterated.

    Parameters
    ----------
    sequence : iterable
        Items to pass through unchanged.
    every : int, optional
        Refresh the widget on the first item and then on every ``every``-th
        item. When omitted and the total is known, it is 1 for totals up to
        200 and ``int(size / 200)`` otherwise.
    size : int, optional
        Total number of items. When omitted, ``len(sequence)`` is tried.
    name : str
        Label shown in front of the counter.

    Raises
    ------
    AssertionError
        If the total is unknown (no ``size`` and ``len`` is unsupported) and
        ``every`` was not supplied.
    """
    # Imported lazily so that the widget stack is only required when used.
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Determine the total; fall back to "unknown length" mode when len() fails.
    unsized = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unsized = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    if unsized:
        # No total available: show a full bar in the 'info' style.
        bar = IntProgress(min=0, max=1, value=1)
        bar.bar_style = 'info'
    else:
        bar = IntProgress(min=0, max=size, value=0)
    caption = HTML()
    display(VBox(children=[caption, bar]))

    count = 0
    try:
        for count, item in enumerate(sequence, 1):
            if count == 1 or count % every == 0:
                if unsized:
                    caption.value = '{name}: {index} / ?'.format(
                        name=name, index=count)
                else:
                    bar.value = count
                    caption.value = u'{name}: {index} / {size}'.format(
                        name=name, index=count, size=size)
            yield item
    except:
        # Bare except on purpose: any interruption turns the bar red, then
        # the exception is re-raised untouched.
        bar.bar_style = 'danger'
        raise
    else:
        bar.bar_style = 'success'
        bar.value = count
        caption.value = "{name}: {index}".format(
            name=name, index=str(count or '?'))

In [6]:
# multiValDataFrame takes a dataframe df, a column name df_index and a column name multival_key.
# It merges records that share the same df_index value and replaces multival_key with a list of all the multival_key values.
# E.g. for a data frame plays with multiple character_id values per play_id, it returns a data frame with one record
# per play_id, and one list of character_ids per play.

def multiValDataFrame(df, df_index, multival_key):
    """Collapse rows sharing a ``df_index`` value into one row per value.

    One output row is produced for each distinct value of ``df[df_index]``,
    in order of first appearance. Every column takes its value from the
    first row of the group, except ``multival_key``, which becomes a Python
    list of that column's values from all rows of the group (original order,
    duplicates kept).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with possibly several rows per ``df_index`` value.
    df_index : str
        Name of the column identifying a record.
    multival_key : str
        Name of the column whose values are gathered into a list.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, in the same order, as ``df``.
        An empty ``df`` gives an empty frame with those columns.

    Notes
    -----
    Rows are accumulated in a list and the frame is built once at the end:
    ``DataFrame.append`` no longer exists in pandas 2.0, and growing a frame
    row by row is quadratic. Grouping is done in a single ``groupby`` pass
    instead of re-filtering the whole frame for every id. ``groupby`` skips
    rows whose ``df_index`` is NaN.
    """
    columns = df.columns
    # sort=False keeps groups in order of first appearance, like unique().
    grouped = df.groupby(df_index, sort=False)

    records = []
    # size is passed explicitly because a GroupBy is iterated, not len()-ed.
    for _, group in log_progress(grouped, size=grouped.ngroups):
        # Take scalar values column by column to avoid cross-column upcasting.
        record = {col: group[col].values[0] for col in columns}
        record[multival_key] = list(group[multival_key])
        records.append(record)

    return pd.DataFrame(records, columns=columns)

In [7]:
# Left-join the character links onto images, then collapse back to one row
# per image_id with character_id gathered into a list.
images = multiValDataFrame(
    images.merge(characters_images, on='image_id', how='left'),
    'image_id',
    'character_id'
)

Widget Javascript not detected.  It may not be installed or enabled properly.


In [8]:
# Left-join the kashira links onto images, then collapse back to one row
# per image_id with kashira_id gathered into a list.
images = multiValDataFrame(
    images.merge(kashira_images, on='image_id', how='left'),
    'image_id',
    'kashira_id'
)

Widget Javascript not detected.  It may not be installed or enabled properly.


In [9]:
# Left-join the performance links onto images, then collapse back to one row
# per image_id with performance_id gathered into a list.
images = multiValDataFrame(
    images.merge(performances_images, on='image_id', how='left'),
    'image_id',
    'performance_id'
)

Widget Javascript not detected.  It may not be installed or enabled properly.


In [10]:
# Left-join the performer links onto images, then collapse back to one row
# per image_id with performer_id gathered into a list.
images = multiValDataFrame(
    images.merge(performers_images, on='image_id', how='left'),
    'image_id',
    'performer_id'
)

Widget Javascript not detected.  It may not be installed or enabled properly.


In [11]:
# Left-join the play links onto images, then collapse back to one row
# per image_id with play_id gathered into a list.
images = multiValDataFrame(
    images.merge(plays_images, on='image_id', how='left'),
    'image_id',
    'play_id'
)

Widget Javascript not detected.  It may not be installed or enabled properly.


In [12]:
# Left-join the production links onto images, then collapse back to one row
# per image_id with production_id gathered into a list.
images = multiValDataFrame(
    images.merge(productions_images, on='image_id', how='left'),
    'image_id',
    'production_id'
)

Widget Javascript not detected.  It may not be installed or enabled properly.


In [13]:
# Left-join the scene links onto images, then collapse back to one row
# per image_id with scene_id gathered into a list.
images = multiValDataFrame(
    images.merge(scenes_images, on='image_id', how='left'),
    'image_id',
    'scene_id'
)

Widget Javascript not detected.  It may not be installed or enabled properly.


In [14]:
# Left-join the tag links onto images, then collapse back to one row
# per image_id with subject_id gathered into a list.
images = multiValDataFrame(
    images.merge(tags_images, on='image_id', how='left'),
    'image_id',
    'subject_id'
)

Widget Javascript not detected.  It may not be installed or enabled properly.


In [15]:
# Mild cleaning + rearranging: shorten some column names, then keep only the
# columns listed below, in that order.
images = images.rename(columns={
    'subject_id': 'tag_id',
    'image_id': 'id',
    'sequence_number': 'sequence',
    'series_number': 'series',
    'notes_on_item': 'notes',
    'slidepage_folder_no': 'slidepage_folder',
    'container_number': 'container',
})
images = images[[
    'id',
    'media_type',
    'character_id',
    'tag_id',
    'kashira_id',
    'performance_id',
    'performer_id',
    'play_id',
    'production_id',
    'scene_id',
    'container',
    'container_type',
    'creator',
    'item_id',
    'colser_id',
    'notes',
    'objid',
    'sequence',
    'series',
    'slidepage_folder',
]]


In [16]:
# for i in images['container']:
#     if i != "nan":
#         i = int(i)
   
# containers = images['container']    
# containers_mapped = map(lambda x: int(float(x)) if x != "nan", containers)
# images.head()

In [21]:
# Write the final frame to CSV (UTF-8, without the row index).
images.to_csv(
    path_or_buf='images_exp.csv',
    index=False,
    encoding='utf8'
)

In [22]:
# Write the final frame to JSON as a list of row objects, keeping
# non-ASCII characters unescaped.
images.to_json(
    path_or_buf='images_exp.json',
    force_ascii=False,
    orient='records'
)

In [20]:
# Preview the first five rows of the result.
images.head(n=5)

Unnamed: 0,id,media_type,character_id,tag_id,kashira_id,performance_id,performer_id,play_id,production_id,scene_id,container,container_type,creator,item_id,colser_id,notes,objid,sequence,series,slidepage_folder
0,8281,slide,"[452, 453, 454, 455, 456, 2312]","[43, 73]","[33, 3, 35, 106, 77]",[310],[nan],[26],[86],[128],46.0,Slide Binder,1,1,2,2,ldpd_bun_slide_452_2_0001_0001,1,2.0,452.0
1,8282,slide,"[452, 453, 454, 455, 456, 2312]","[43, 73]","[33, 3, 35, 106, 77]",[310],[nan],[26],[86],[128],46.0,Slide Binder,1,2,2,6,ldpd_bun_slide_452_2_0002_0002,2,2.0,452.0
2,8283,slide,"[452, 453, 454, 455, 456, 2312]","[43, 73]","[33, 3, 35, 106, 77]",[310],[nan],[26],[86],[128],46.0,Slide Binder,1,3,2,7,ldpd_bun_slide_452_2_0003_0003,3,2.0,452.0
3,8284,slide,"[453, 454, 455, 452, 2312]","[43, 73]","[3, 35, 33, 77]",[310],[nan],[26],[86],[128],46.0,Slide Binder,1,4,2,0,ldpd_bun_slide_452_2_0004_0004,4,2.0,452.0
4,8285,slide,"[452, 454, 2312]",[43],"[33, 35, 77]",[310],[nan],[26],[86],[128],46.0,Slide Binder,1,5,2,4,ldpd_bun_slide_452_2_0005_0005,5,2.0,452.0
