In [1]:
import pandas as pd

In [2]:
# read csvs as data frames
# cast as string to avoid float/NaN errors

characters = pd.read_csv('in/characters.csv').astype(str)
char_plays = pd.read_csv('in/characters_plays.csv').astype(str)
char_imgs = pd.read_csv('in/characters_images.csv').astype(str)

In [3]:
characters.head()

Unnamed: 0,character_id,character_code,label_eng,label_ja,label_ka,authority_control,sort_ja
0,1,Akai-01,Farmer,Oyaji,おやじ,August 1989 Program,おやじ
1,2,Akai-02,Farmer's wife,Nyōbō,女房,August 1989 Program,にょうぼう
2,3,Akai-03,"Magotaro, a horse","Magotarō, Uma",孫太郎（馬）,August 1989 Program,まごたろう（うま）
3,4,Akai-04,Local governor,Odaikan,お代官,August 1989 Program,おだいかん
4,5,Akai-05,Local governor's assistant,Odaikan no kobun,お代官のこぶん,August 1989 Program,おだいかんのこぶん


In [4]:
# multiValDataFrame takes a dataframe df, column name df_index and column name multival_key
# it merges records with the same df_index and updates multival_index to an array of all the multival_index values.
# E.g for a data frame plays with multiple character_id values per play_id, returns a data frame with one record 
# per play_id, and one array of character_ids per play.

def multiValDataFrame(df, df_index, multival_key):
    new_df = pd.DataFrame({})
    cols = df.columns
    id_list = df[df_index].unique()
    for i in id_list:
        temp_row = df.loc[df[df_index] == i ]
        vals = []
        for col in cols:
            vals.append(temp_row[:1][col].values[0])

        temp_dict = dict(zip(cols, vals))
        multi_id_list = list(temp_row[multival_key])
    
        temp_dict[multival_key] = multi_id_list
    
        new_df = new_df.append(temp_dict, ignore_index=True)
    return new_df

In [5]:
# Left join to add play_ids to characters dataframe
characters = pd.merge(characters, char_plays, on='character_id', how='left')

# combine multiple plays per character into one record per character
character = multiValDataFrame(characters, 'character_id', 'play_id')

In [6]:
# Left join to add image_ids to characters dataframe
characters = pd.merge(characters, char_imgs, on='character_id', how='left')

# combine multiple plays per character into one record per character
character = multiValDataFrame(characters, 'character_id', 'image_id')

In [7]:
characters = characters.rename(columns = {'character_id':'id'})

In [8]:
# save final dataframe to csv file
characters.to_csv('characters_exp.csv', encoding='utf8', index=False)

In [9]:
# save final dataframe to json file
characters.to_json('characters_exp.json', orient='records', force_ascii=False)

In [10]:
characters.head()

Unnamed: 0,id,character_code,label_eng,label_ja,label_ka,authority_control,sort_ja,play_id,image_id
0,1,Akai-01,Farmer,Oyaji,おやじ,August 1989 Program,おやじ,162,56170.0
1,2,Akai-02,Farmer's wife,Nyōbō,女房,August 1989 Program,にょうぼう,162,
2,3,Akai-03,"Magotaro, a horse","Magotarō, Uma",孫太郎（馬）,August 1989 Program,まごたろう（うま）,162,
3,4,Akai-04,Local governor,Odaikan,お代官,August 1989 Program,おだいかん,162,
4,5,Akai-05,Local governor's assistant,Odaikan no kobun,お代官のこぶん,August 1989 Program,おだいかんのこぶん,162,
