In [1]:
import pandas as pd

In [2]:
# Load the four input tables (performances, their images, their scenes, and
# the characters of each play) as data frames.
# Every column is cast to str so ids stay textual and float/NaN errors are avoided.
performances, performances_images, performance_scenes, plays_characters = [
    pd.read_csv(csv_path).astype(str)
    for csv_path in (
        'in/performances.csv',
        'in/performances_images.csv',
        'in/scenesproductions.csv',
        'in/characters_plays.csv',
    )
]

In [3]:
# mild clean-up/renaming
# NOTE: columns are dropped via the `columns=` keyword. Passing the axis
# positionally (`drop('data_id', 1)`) was deprecated in pandas 1.3 and raises
# a TypeError from pandas 2.0 on.
performances = (
    performances
    .drop(columns=['data_id', 'code'])
    .rename(columns={'id': 'performance_id'})
)

performances_images = performances_images.rename(columns={'visual_id': 'image_id'})

# Remove newline characters embedded in the scene table's cells, give the key
# columns the names used for the merges below, and drop the columns that are
# not needed in the export.
performance_scenes = (
    performance_scenes
    .replace({r'\n': ''}, regex=True)
    .rename(columns={'id': 'performance_scene_id', 'perf_id': 'performance_id'})
    .drop(columns=['play_id', 'prod_id', 'scene_id', 'spuck_note', 'tayu_shamisen_note'])
)

In [4]:
# preview the first five rows after the clean-up
performances.head(n=5)

Unnamed: 0,performance_id,production_id,play_id
0,1,142,63
1,2,1,86
2,3,1,29
3,4,1,47
4,5,1,18


In [5]:
# multiValDataFrame takes a dataframe df, a column name df_index and a column name multival_key.
# It merges records with the same df_index value and replaces multival_key with a list of all the multival_key values.
# E.g., for a data frame of plays with multiple character_id values per play_id, it returns a data frame with one record
# per play_id, and one list of character_ids per play.

def multiValDataFrame(df, df_index, multival_key):
    """Collapse rows that share a ``df_index`` value into a single record.

    For every distinct value in ``df[df_index]`` (in order of first
    appearance) the result holds one row: all columns take the values of the
    group's first row, except ``multival_key``, which becomes a list of every
    value that column has within the group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with possibly several rows per ``df_index`` value.
    df_index : str
        Name of the column identifying a record.
    multival_key : str
        Name of the column whose values are gathered into a list.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``df_index`` value, with the same columns (and
        column order) as ``df``. An empty ``df`` gives an empty frame that
        still carries those columns.

    Notes
    -----
    Rows are collected in a plain list and the frame is built once at the
    end: ``DataFrame.append`` was removed in pandas 2.0, and growing a frame
    row by row copies it on every iteration. Rows whose ``df_index`` is a
    real NaN are left out, since ``groupby`` drops missing keys by default.
    """
    cols = list(df.columns)
    records = []
    # sort=False keeps groups in order of first appearance, like unique() did.
    for _, group in df.groupby(df_index, sort=False):
        # Start from the group's first row, then swap in the gathered values.
        record = group.iloc[0].to_dict()
        record[multival_key] = group[multival_key].tolist()
        records.append(record)
    # Passing columns= pins the column order and keeps the schema when empty.
    return pd.DataFrame(records, columns=cols)

In [6]:
# Left-join the image rows onto performances, then collapse back to one row
# per performance_id with image_id gathered into a list.
performances = multiValDataFrame(
    performances.merge(performances_images, how='left', on='performance_id'),
    'performance_id',
    'image_id',
)

In [7]:
# Left-join the scene rows onto performances, then collapse back to one row
# per performance_id with performance_scene_id gathered into a list.
performances = multiValDataFrame(
    performances.merge(performance_scenes, how='left', on='performance_id'),
    'performance_id',
    'performance_scene_id',
)

In [8]:
# Left-join the characters of each play (matched on play_id), then collapse
# back to one row per performance_id with character_id gathered into a list.
performances = multiValDataFrame(
    performances.merge(plays_characters, how='left', on='play_id'),
    'performance_id',
    'character_id',
)

In [9]:
# keep only the export columns, in their final order
performances = performances.loc[
    :,
    [
        'performance_id',
        'play_id',
        'production_id',
        'performance_scene_id',
        'character_id',
        'image_id',
    ],
]

In [10]:
# preview the first five rows of the merged table
performances.head(n=5)

Unnamed: 0,performance_id,play_id,production_id,performance_scene_id,character_id,image_id
0,1,63,142,"[1438, 1439, 1440, 1441, 1442, 1443, 1444, 144...","[675, 676, 677, 678, 679, 680, 681, 682, 683, ...",[nan]
1,2,86,1,"[1, 2]",[nan],[nan]
2,3,29,1,"[3, 4]","[538, 539, 540, 541, 542, 543, 544, 545, 546, ...",[nan]
3,4,47,1,[5],"[1011, 1012, 1013, 1014]",[nan]
4,5,18,1,[6],"[175, 176, 177, 178, 179, 180, 181, 182, 183, ...",[nan]


In [14]:
# write the merged table to CSV, without the row index
performances.to_csv(
    path_or_buf='performances_exp.csv',
    index=False,
    encoding='utf8',
)

In [12]:
# write the merged table to JSON as a list of records, keeping non-ASCII characters unescaped
performances.to_json(
    path_or_buf='performances_exp.json',
    force_ascii=False,
    orient='records',
)