In [1]:
import pandas as pd

In [2]:
# Load every input table from its CSV export.
# dtype=object turns off numeric type inference, so id columns are kept exactly as
# read instead of being converted to floats when a column has missing values.

def _read_text_csv(path):
    """Read one CSV file with every column kept as dtype=object."""
    return pd.read_csv(path, dtype=object)

performances = _read_text_csv('in/performances.csv')
performances_images = _read_text_csv('in/performances_images.csv')
performance_scenes = _read_text_csv('in/scenesproductions.csv')
plays_characters = _read_text_csv('in/characters_plays.csv')
productions = _read_text_csv('in/productions.csv')
plays = _read_text_csv('in/plays.csv')
imagesonline = _read_text_csv('in/imagesonline.csv')

In [3]:
# mild clean-up/renaming
# NOTE: axis is passed by keyword (axis=1) in every drop() call; passing it
# positionally, as drop('col', 1), is rejected by pandas >= 2.0.

# performances: remove unused columns and give the key an unambiguous name
performances = (
    performances
    .drop(['data_id', 'code'], axis=1)
    .rename(columns={'id': 'performance_id'})
)

performances_images = performances_images.rename(columns={'visual_id': 'image_id'})

# performance_scenes: strip newline characters from every text cell, rename the
# keys, then drop the columns that are not carried into the export
performance_scenes = performance_scenes.replace({r'\n': ''}, regex=True)
performance_scenes = performance_scenes.rename(
    columns={'id': 'performance_scene_id', 'perf_id': 'performance_id'}
)
performance_scenes = performance_scenes.drop(
    ['play_id', 'prod_id', 'scene_id', 'spuck_note', 'tayu_shamisen_note'],
    axis=1,
)

# plays / productions: keep only the key and label columns needed for the final label
plays = plays[['id', 'label_eng']]
plays = plays.rename(columns={'id': 'play_id', 'label_eng': 'play_label'})

productions = productions[['id', 'label', 'seq_chron']]
productions = productions.rename(columns={'id': 'production_id', 'label': 'prod_label'})

imagesonline = imagesonline.rename(columns={'id': 'image_id'})

In [4]:
# preview the first rows of the performance/image link table after renaming
performances_images.head(n=5)

Unnamed: 0,performance_id,image_id
0,310,8281
1,310,8282
2,310,8283
3,310,8284
4,310,8285


In [5]:
# keep only images that are online: the inner join on image_id discards every
# performance/image link whose image_id does not appear in imagesonline
imagesonline = imagesonline.loc[:, ['image_id']]
performances_images = performances_images.merge(imagesonline, how='inner', on='image_id')

In [7]:
# multiValDataFrame takes a dataframe df, a column name df_index and a column name multival_key.
# It merges records that share the same df_index value and replaces multival_key with a list of
# all the multival_key values found for that df_index value.
# E.g. for a data frame plays with multiple character_id values per play_id, it returns a data frame
# with one record per play_id, and one list of character_ids per play.

def multiValDataFrame(df, df_index, multival_key):
    """Collapse rows that share a ``df_index`` value into a single row.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame, typically the result of a one-to-many merge.
    df_index : str
        Name of the column identifying a record; the result has one row per
        distinct value of this column, in order of first appearance.
    multival_key : str
        Name of the column whose values are gathered into a list per record.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``df_index`` value, with the same columns (and
        column order) as ``df``. ``multival_key`` holds a list of every value
        found for that record (missing values are kept in the list as-is);
        every other column holds the value from the record's first row.
        An empty ``df`` yields an empty frame with the same columns.

    Notes
    -----
    Rows whose ``df_index`` value is missing (NaN) are left out of the result,
    because ``groupby`` skips missing keys by default.
    """
    cols = list(df.columns)
    records = []
    # sort=False keeps the groups in order of first appearance rather than
    # sorting them by key.
    for _, group in df.groupby(df_index, sort=False):
        # take every column from the first row of the group ...
        record = {col: group[col].values[0] for col in cols}
        # ... then replace the multi-valued column with the full list of values
        record[multival_key] = list(group[multival_key])
        records.append(record)
    # Build the frame once from all records: DataFrame.append no longer exists in
    # pandas >= 2.0, and appending row by row copies the whole frame every time.
    return pd.DataFrame(records, columns=cols)

In [8]:
# attach the online image ids to each performance (left join keeps performances
# without images), then collapse to one row per performance with a list of image_ids
performances = multiValDataFrame(
    performances.merge(performances_images, how='left', on='performance_id'),
    'performance_id',
    'image_id',
)

In [9]:
# attach the scenes of each performance (left join), then collapse to one row per
# performance with a list of performance_scene_ids
performances = multiValDataFrame(
    performances.merge(performance_scenes, how='left', on='performance_id'),
    'performance_id',
    'performance_scene_id',
)

In [10]:
# attach the characters of the performed play via play_id (left join), then collapse
# to one row per performance with a list of character_ids
performances = multiValDataFrame(
    performances.merge(plays_characters, how='left', on='play_id'),
    'performance_id',
    'character_id',
)

In [11]:
# bring in the play label and the production label / seq_chron for each performance;
# the two labels are combined into one performance label in the next cell
performances = (
    performances
    .merge(plays, how='left', on='play_id')
    .merge(productions, how='left', on='production_id')
)
performances.head()

Unnamed: 0,character_id,image_id,performance_id,performance_scene_id,play_id,production_id,play_label,prod_label,seq_chron
0,"[675, 676, 677, 678, 679, 680, 681, 682, 683, ...",[nan],1,"[1438, 1439, 1440, 1441, 1442, 1443, 1444, 144...",63,142,The League of the 47 Ronin,January 1985,150
1,[nan],[nan],2,"[1, 2]",86,1,More Worthy than Revenge,November 1964,1
2,"[538, 539, 540, 541, 542, 543, 544, 545, 546, ...",[nan],3,"[3, 4]",29,1,The Vendetta in Iga,November 1964,1
3,"[1011, 1012, 1013, 1014]",[nan],4,[5],47,1,The Maple Viewing Party,November 1964,1
4,"[175, 176, 177, 178, 179, 180, 181, 182, 183, ...",[nan],5,[6],18,1,The Two Butterflies,November 1964,1


In [12]:
# Build the performance label as "<production label>: <play label>".
# NOTE(review): str.cat is called without na_rep, so a performance missing either
# label presumably ends up with a missing combined label -- confirm that is intended.
performances['play_label'] = performances['prod_label'].str.cat(performances['play_label'], sep=': ')
# axis is passed by keyword: drop('prod_label', 1) is rejected by pandas >= 2.0
performances = (
    performances
    .rename(columns={'play_label': 'label_eng'})
    .drop('prod_label', axis=1)
)

In [13]:
# final tidy-up: shorten the key column names, then keep only the export columns
# in a fixed order (any column not listed here is dropped)
performances = performances.rename(
    columns={'performance_id': 'id', 'performance_scene_id': 'pscene_id'}
)[[
    'id',
    'play_id',
    'production_id',
    'pscene_id',
    'character_id',
    'image_id',
    'label_eng',
    'seq_chron',
]]

In [14]:
# preview the first rows of the final table before exporting
performances.head(n=5)

Unnamed: 0,id,play_id,production_id,pscene_id,character_id,image_id,label_eng,seq_chron
0,1,63,142,"[1438, 1439, 1440, 1441, 1442, 1443, 1444, 144...","[675, 676, 677, 678, 679, 680, 681, 682, 683, ...",[nan],January 1985: The League of the 47 Ronin,150
1,2,86,1,"[1, 2]",[nan],[nan],November 1964: More Worthy than Revenge,1
2,3,29,1,"[3, 4]","[538, 539, 540, 541, 542, 543, 544, 545, 546, ...",[nan],November 1964: The Vendetta in Iga,1
3,4,47,1,[5],"[1011, 1012, 1013, 1014]",[nan],November 1964: The Maple Viewing Party,1
4,5,18,1,[6],"[175, 176, 177, 178, 179, 180, 181, 182, 183, ...",[nan],November 1964: The Two Butterflies,1


In [15]:
# write the final table to CSV; index=False leaves the row index out of the file
performances.to_csv(
    path_or_buf='performances_exp.csv',
    index=False,
    encoding='utf8',
)

In [16]:
# write the final table to JSON as a list of row objects (orient='records');
# force_ascii=False writes non-ASCII characters as-is instead of \u escapes
performances.to_json(
    path_or_buf='performances_exp.json',
    force_ascii=False,
    orient='records',
)