

# Bunraku Full (On-/Off-line) Collection 
__Data Transformation SQL CSVs -> JSON__

<hr/>

### Data files to generate:

- __authors: 123__
- __characters: 2,107__ — 17,006 images
- __images: 21,268__
- __kashira: 129__ — 17,019 images
- __performances: 931__ — 18,533 images
- __performers: 184__ — 14,893 images
- __plays: 178__ — 18,791 images
- __productions: 293__ — 18,750 images
- __pscenes: 2,609__ — 18,000 images
- __spucks: 16,625__
- __tags: 76__ — 8,941 images



# Setup
<hr/>

In [1]:
import pandas as pd

# Display options used when inspecting frames in the notebook.
# NOTE: this value is overridden by the 'display.max_colwidth' call two lines
# below, so the effective column width is 50.
pd.set_option('max_colwidth',1000)
# pandas documents this option as an int or None (None = no limit); the
# string 'none' is not one of the documented values.
pd.set_option('max_seq_items', None)
pd.set_option('display.max_colwidth',50)

# Prefix prepended to every CSV name by df(); relative to the working directory.
path_to_csvs = "in/"

# Methods
<hr/>

In [2]:
def df(csv_name):
    """Read ``path_to_csvs + csv_name + '.csv'`` and return it without duplicate rows.

    Every column is read with ``dtype=object`` so pandas does no numeric
    type inference on the values.
    """
    csv_path = ''.join([path_to_csvs, csv_name, '.csv'])
    loaded = pd.read_csv(csv_path, dtype=object)
    return loaded.drop_duplicates()

def remove_offline(df):
    """Inner-join *df* with the global ``imagesonline`` frame on ``image_id``.

    Rows of *df* whose ``image_id`` has no match in ``imagesonline`` are
    dropped by the inner join.
    """
    # NOTE(review): no cell in this notebook binds a global named
    # `imagesonline` (imagesonline.csv is loaded into `images`), so this
    # raises NameError if called as-is — confirm which frame is intended.
    return pd.merge(left=df, right=imagesonline, how='inner', on='image_id')

def merge_id_array(base_df, other_df, main_id, multi_id):
    """Attach to *base_df* a column of lists of *multi_id* values from *other_df*.

    *other_df* is grouped by *main_id* and the *multi_id* values of each
    group are collected into a Python list. The result is left-merged onto
    *base_df* using the columns the two frames have in common, so rows of
    *base_df* without a match get NaN instead of a list.
    """
    values_per_key = other_df.groupby(main_id)[multi_id]
    id_lists = values_per_key.apply(list).reset_index()
    return base_df.merge(id_lists, how='left')

def re_id(df,key):
    """Rename column *key* of *df* to ``'id'`` in place; returns None.

    A *key* that is not a column of *df* is ignored by ``rename`` and leaves
    the frame unchanged.
    """
    column_map = {key: 'id'}
    df.rename(columns=column_map, inplace=True)
    
def csv_out(df,filename):
    """Write *df* to *filename* as UTF-8 CSV, omitting the index column."""
    df.to_csv(filename, index=False, encoding='utf8')
    
def json_out(df, filename):
    """Write *df* to *filename* as a JSON array with one object per row.

    ``force_ascii=False`` writes non-ASCII characters as-is rather than as
    ``\\uXXXX`` escapes.
    """
    df.to_json(filename, force_ascii=False, orient="records")

# Import
<hr/>

In [3]:
# Tables whose variable name matches the CSV file name.
authors = df("authors")
characters = df("characters")
creators = df("creators")
kashira = df("kashira")
performances = df("performances")
performers = df("performers")
plays = df("plays")
productions = df("productions")
scenes = df("scenes")
spucks = df("spucks")
tags = df("tags")

# Tables loaded under a different name than their CSV file.
images = df("imagesonline")
pscenes = df("scenes_productions")
shamisenplayers = df("sceneshamisens")
musicians = df("scenekotokokyus")
narrators = df("scenetayus")

# Link tables, later folded into list-valued columns by merge_id_array().
authors_plays = df("authors_plays")
characters_images = df("characters_images")
characters_plays = df("characters_plays")
kashira_images = df("kashira_images")
kashira_plays = df("kashira_plays")
performances_images = df("performances_images")
performers_images = df("performers_images")
plays_images = df("plays_images")
productions_images = df("productions_images")
scenes_images = df("scenes_images")
tags_images = df("tags_images")

### Authors

In [4]:
# Give each author row the list of play ids linked to it in authors_plays.
authors = merge_id_array(
    base_df=authors,
    other_df=authors_plays,
    main_id='author_id',
    multi_id='play_id',
)

### Characters

In [5]:
# Drop the 'character_code' column. The axis is passed by keyword because
# pandas 2.0 no longer accepts it as a positional argument to drop().
characters = characters.drop('character_code', axis=1)

# Add list-valued 'play_id' and 'image_id' columns keyed on character_id.
characters = merge_id_array(characters, characters_plays, 'character_id', 'play_id')
characters = merge_id_array(characters, characters_images, 'character_id', 'image_id')

### Images

In [6]:
# Each link table contributes one list-valued id column, keyed on image_id.
for _link_table, _id_column in (
    (characters_images, 'character_id'),
    (kashira_images, 'kashira_id'),
    (performances_images, 'performance_id'),
    (performers_images, 'performer_id'),
    (plays_images, 'play_id'),
    (productions_images, 'production_id'),
    (scenes_images, 'pscene_id'),
    (tags_images, 'tag_id'),
):
    images = merge_id_array(images, _link_table, 'image_id', _id_column)
# Remove the loop names so the notebook namespace is left as before.
del _link_table, _id_column

# Keep only these columns, in this order, for the export.
images = images[[
    'image_id', 'media_type',
    'character_id', 'tag_id', 'kashira_id', 'performance_id',
    'performer_id', 'play_id', 'production_id', 'pscene_id',
    'container', 'container_type', 'creator', 'item_id', 'colser_id',
    'notes', 'objid', 'sequence', 'series', 'slidepage_folder',
]]


### Kashira (puppets)

In [7]:
# Attach the image-id list first, then the play-id list, both keyed on kashira_id.
kashira = merge_id_array(
    merge_id_array(kashira, kashira_images, 'kashira_id', 'image_id'),
    kashira_plays,
    'kashira_id',
    'play_id'
)

# Keep only these columns, in this order, for the export.
kashira = kashira[[
    'kashira_id',
    'label_eng',
    'label_ka',
    'category',
    'image_id',
    'play_id',
    'sort_ja',
]]


### (P)Scenes (at performance level)

In [8]:
# Add labels and scene_order from the scenes table.
scenes = scenes[['scene_id','label_eng','label_ja','label_ka','scene_order']]
# Drop the two note columns. The axis is passed by keyword because pandas 2.0
# no longer accepts it as a positional argument to drop().
pscenes = pscenes.drop(['spuck_note', 'tayu_shamisen_note'], axis=1)
# Left merge on whatever columns the two frames share
# (presumably scene_id — verify against the CSV headers).
pscenes = pscenes.merge(scenes, how='left')

# Add one list-valued id column per link table, keyed on pscene_id.
pscenes = merge_id_array(pscenes, spucks, 'pscene_id', 'spuck_id')
pscenes = merge_id_array(pscenes, narrators, 'pscene_id', 'narrator_id')
pscenes = merge_id_array(pscenes, musicians, 'pscene_id', 'musician_id')
pscenes = merge_id_array(pscenes, shamisenplayers, 'pscene_id', 'shamisen_id')
pscenes = merge_id_array(pscenes, scenes_images, 'pscene_id', 'image_id')

### Performances

In [9]:
# Drop the 'data_id' and 'code' columns. The axis is passed by keyword because
# pandas 2.0 no longer accepts it as a positional argument to drop().
performances = performances.drop(['data_id', 'code'], axis=1)

# List-valued image and pscene ids are keyed on performance_id; character ids
# are looked up through the performance's play_id.
performances = merge_id_array(performances, performances_images, 'performance_id', 'image_id')
performances = merge_id_array(performances, pscenes, 'performance_id', 'pscene_id')
performances = merge_id_array(performances,characters_plays,'play_id','character_id')

### Plays

In [10]:
# Author, character and image id lists come from the link tables.
plays = merge_id_array(base_df=plays, other_df=authors_plays, main_id='play_id', multi_id='author_id')
plays = merge_id_array(base_df=plays, other_df=characters_plays, main_id='play_id', multi_id='character_id')
plays = merge_id_array(base_df=plays, other_df=plays_images, main_id='play_id', multi_id='image_id')
# Production and performance id lists are read off the performances frame.
plays = merge_id_array(base_df=plays, other_df=performances, main_id='play_id', multi_id='production_id')
plays = merge_id_array(base_df=plays, other_df=performances, main_id='play_id', multi_id='performance_id')

### Productions

In [11]:
# Drop the 'performance_num' column. The axis is passed by keyword because
# pandas 2.0 no longer accepts it as a positional argument to drop().
productions = productions.drop('performance_num', axis=1)

# Image ids come from the link table; performance and play ids are read off
# the performances frame, all keyed on production_id.
productions = merge_id_array(productions,productions_images,'production_id','image_id')
productions = merge_id_array(productions,performances,'production_id','performance_id')
productions = merge_id_array(productions,performances,'production_id','play_id')

# Keep only these columns, in this order, for the export.
productions = productions[['production_id','dates','place','label_eng','image_id','performance_id','play_id']]

### Tags

In [12]:
# Delete newline characters from the notes text (replaced by nothing, not a space).
tags['notes'] = tags['notes'].replace(to_replace=r'\n', value='', regex=True)
# Give each tag row the list of image ids linked to it in tags_images.
tags = merge_id_array(
    base_df=tags,
    other_df=tags_images,
    main_id='tag_id',
    multi_id='image_id',
)

### Performers

In [13]:
# Keep only these columns, in this order, before the role columns are added.
performers = performers[[
    'performer_id',
    'name_proper',
    'alt_name',
    'name_ka',
    'alt_name_ka',
    'specialty',
    'dates',
    'notes',
]]

In [14]:
def _ids_by_performance(scene_df, member_col, perf_alias, member_alias='performer_id'):
    """Flatten per-scene id lists into unique (performance, member) rows.

    ``scene_df[member_col]`` holds, per scene row, a list of ids built by
    merge_id_array() or NaN; rows with NaN in either column are skipped.
    The lists of ALL scenes of a performance are expanded. The previous
    ``groupby(...).apply(lambda x: pd.DataFrame(x.values[0]))`` recipe only
    expanded the first scene's list of each performance and dropped the rest.

    Returns a frame with columns ``[perf_alias, member_alias]``.
    """
    present = scene_df[[member_col, 'performance_id']].dropna(how='any')
    rows = [
        (perf_id, member_id)
        for perf_id, member_ids in zip(present['performance_id'], present[member_col])
        for member_id in member_ids
    ]
    pairs = pd.DataFrame(rows, columns=[perf_alias, member_alias])
    # Stable sort keeps rows ordered by performance id, as the former groupby did.
    return pairs.sort_values(perf_alias, kind='mergesort').drop_duplicates()


performers = merge_id_array(performers,performers_images,'performer_id','image_id')

# One list-valued column of performance ids per role a performer can have.
performer_as_musician = _ids_by_performance(pscenes, 'musician_id', 'musician_perf_id')
performers = merge_id_array(performers,performer_as_musician,'performer_id','musician_perf_id')

performer_as_narrator = _ids_by_performance(pscenes, 'narrator_id', 'narrator_perf_id')
performers = merge_id_array(performers,performer_as_narrator,'performer_id','narrator_perf_id')

performer_as_shamisen = _ids_by_performance(pscenes, 'shamisen_id', 'shamisen_perf_id')
performers = merge_id_array(performers,performer_as_shamisen,'performer_id','shamisen_perf_id')

# Puppeteers are reached through spucks: map each spuck to its performance,
# then read the puppeteer and kashira off the spuck row.
spucks_performances = _ids_by_performance(pscenes, 'spuck_id', 'puppeteer_perf_id', member_alias='spuck_id')

xtra_spucks = spucks[['spuck_id','pscene_id','puppeteer_id','kashira_id']].rename(columns={'puppeteer_id':'performer_id'})
# The axis is passed by keyword because pandas 2.0 no longer accepts it as a
# positional argument to drop().
xtra_spucks = xtra_spucks.merge(spucks_performances, on='spuck_id', how='left').drop(['spuck_id', 'pscene_id'], axis=1)

performer_as_puppeteer = xtra_spucks[['performer_id','puppeteer_perf_id']].dropna(how='any').drop_duplicates()
performers = merge_id_array(performers,performer_as_puppeteer,'performer_id','puppeteer_perf_id')

performer_puppets = xtra_spucks[['performer_id','kashira_id']].dropna(how='any').drop_duplicates()
performers = merge_id_array(performers,performer_puppets,'performer_id','kashira_id')

# Clean-up IDs and export to CSV and JSON

<hr/>

In [15]:
# (output name, frame, primary-key column) for every exported table.
# The key column is the SINGULAR '<entity>_id' used throughout the notebook.
# The previous code passed '<plural>_id' (e.g. 'authors_id'), which is not a
# column of any frame except kashira, so the rename to 'id' silently did nothing.
exports = [
    ("authors", authors, "author_id"),
    ("characters", characters, "character_id"),
    # NOTE(review): the columns of creators.csv are not visible in this
    # notebook; 'creator_id' is assumed — confirm. A missing key is a no-op.
    ("creators", creators, "creator_id"),
    ("images", images, "image_id"),
    ("kashira", kashira, "kashira_id"),
    ("performances", performances, "performance_id"),
    ("performers", performers, "performer_id"),
    ("plays", plays, "play_id"),
    ("productions", productions, "production_id"),
    ("pscenes", pscenes, "pscene_id"),
    ("spucks", spucks, "spuck_id"),
    ("tags", tags, "tag_id"),
]
# Kept for compatibility: the list of exported table names.
dataframes = [entry[0] for entry in exports]

# The loop variables deliberately avoid the name `df`, which would overwrite
# the df() loader and break re-running the import cell.
for out_name, frame, id_column in exports:
    re_id(frame, id_column)                # fix ids
    csv_out(frame, out_name + ".csv")      # output as csv files
    json_out(frame, out_name + ".json")    # output as json files