In [1]:
import pandas as pd

In [2]:
# Load the three input tables (productions, their images, their performances).
# Every frame is cast to str right after reading to avoid float/NaN errors.
productions, productions_images, performances = (
    pd.read_csv(csv_path).astype(str)
    for csv_path in (
        'in/productions.csv',
        'in/productions_images.csv',
        'in/performances.csv',
    )
)

In [3]:
# mild clean-up/renaming
# Columns are dropped by keyword: passing the axis positionally
# (`drop(label, 1)`) is rejected by pandas >= 2.0 with a TypeError.
productions = productions.drop(columns=['performance_num', 'seq_chron'])
productions = productions.rename(columns={'label': 'label_eng', 'id': 'production_id', 'full_date': 'dates'})

performances = performances.drop(columns=['data_id', 'code'])
performances = performances.rename(columns={'id': 'performance_id'})

In [4]:
# preview the first rows of productions (head() shows 5 rows by default)
productions.head()

Unnamed: 0,production_id,label_eng,dates,place
0,1,November 1964,1964/11/n.d.-1964/11/n.d.,not recorded
1,2,February 1968,1968/02/25-1968/03/10,National Theatre of Japan
2,3,October 1968,1968/10/27-1968/11/10,National Theatre of Japan
3,4,May 1969,1969/05/11-1969/05/25,National Theatre of Japan
4,5,September 1969,1969/09/14-1969/09/21,National Theatre of Japan


In [5]:
def multiValDataFrame(df, df_index, multival_key):
    """Collapse rows that share a ``df_index`` value into one record each.

    For every distinct value in column ``df_index`` (in order of first
    appearance) the result holds one row: every column keeps the value from
    the first matching row, except ``multival_key``, which becomes a list of
    that column's values across all matching rows (in row order).

    E.g. for a data frame of plays with multiple character_id values per
    play_id, returns a data frame with one record per play_id and one list
    of character_ids per play.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    df_index : str
        Name of the column whose values identify a record.
    multival_key : str
        Name of the column whose values are gathered into a list.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``df_index`` value, with a fresh 0..n-1 index and
        the columns in the same order as ``df``. An empty input yields an
        empty frame with the same columns.

    Raises
    ------
    KeyError
        If ``df_index`` is not a column of ``df``, or if ``multival_key`` is
        missing from a non-empty ``df``.

    Notes
    -----
    Rows whose ``df_index`` value is missing (NaN) are left out, because
    ``groupby`` drops missing keys by default.
    """
    columns = list(df.columns)
    records = []
    # sort=False keeps the groups in order of first appearance of each key.
    for _, group in df.groupby(df_index, sort=False):
        # Take the first row's value column by column so each column keeps
        # its own dtype (a whole-row lookup would upcast mixed dtypes).
        record = {col: group[col].iloc[0] for col in columns}
        record[multival_key] = list(group[multival_key])
        records.append(record)

    # Build the frame once from all records: DataFrame.append was removed in
    # pandas 2.0, and appending row by row copied the frame on every step.
    return pd.DataFrame(records, columns=columns)

In [7]:
# left-join the image rows onto productions, then collapse back to one row
# per production_id with all of its image_id values gathered into a list
productions = multiValDataFrame(
    productions.merge(productions_images, how='left', on='production_id'),
    'production_id',
    'image_id',
)

In [9]:
# left-join the performance rows onto productions, then collapse back to one row
# per production_id with all of its performance_id values gathered into a list
productions = multiValDataFrame(
    productions.merge(performances, how='left', on='production_id'),
    'production_id',
    'performance_id',
)

In [12]:
# mild clean-up / re-ordering:
# expose production_id as 'id', then keep only the listed columns in this order
productions = (
    productions
    .rename(columns={'production_id': 'id'})
    [['id', 'dates', 'place', 'label_eng', 'image_id', 'performance_id', 'play_id']]
)

In [13]:
# preview the first rows of the merged, re-ordered productions table
productions.head()

Unnamed: 0,id,dates,place,label_eng,image_id,performance_id,play_id
0,1,1964/11/n.d.-1964/11/n.d.,not recorded,November 1964,[nan],"[2, 3, 4, 5, 6, 7, 8]",86
1,2,1968/02/25-1968/03/10,National Theatre of Japan,February 1968,[nan],"[12, 13, 14, 15, 16, 17]",83
2,3,1968/10/27-1968/11/10,National Theatre of Japan,October 1968,[nan],"[9, 10, 11]",30
3,4,1969/05/11-1969/05/25,National Theatre of Japan,May 1969,[nan],"[18, 19, 20]",72
4,5,1969/09/14-1969/09/21,National Theatre of Japan,September 1969,[nan],"[21, 22, 23]",90


In [14]:
# export to csv (UTF-8 encoded, row index not written)
productions.to_csv(
    'productions_exp.csv',
    index=False,
    encoding='utf8',
)

In [15]:
# export to json (one object per row; non-ASCII characters are not escaped)
productions.to_json(
    'productions_exp.json',
    force_ascii=False,
    orient='records',
)