In [1]:
import pandas as pd

In [3]:
# Load the two input CSV files into data frames.
# Every column is cast to string to avoid float/NaN type errors later on
# (note that astype(str) also turns missing cells into the literal text 'nan').

kashira = pd.read_csv('in/kashira.csv')
kashira = kashira.astype(str)

kashira_images = pd.read_csv('in/kashira_images.csv')
kashira_images = kashira_images.astype(str)

In [4]:
# Light cleanup: switch to the column names used by the rest of the notebook.
kashira = kashira.rename(
    columns={
        'id': 'kashira_id',
        'name': 'label',
        'name_kanji': 'label_ka',
        'sortable_field_japanese': 'sort_ja',
    }
)
kashira_images = kashira_images.rename(
    columns={'visual_id': 'image_id'}
)

In [5]:
# Preview the first five kashira records after renaming.
kashira.head(n=5)

Unnamed: 0,kashira_id,label,label_ka,category,sort_ja
0,1,Amanjaku,あまんじゃく,Special,あまんじゃく
1,2,Ebisu,恵比須,Special,えびす
2,3,Ōshūto,大舅,Male,おおしゅうと
3,4,Ōdanshichi,大団七,Male,おおだんしち
4,5,Ochō no kodomo,お蝶の子供,Children,おちょうのこども


In [6]:
# multiValDataFrame merges the records of a data frame that share the same
# df_index value and replaces the multival_key column with a list of all the
# multival_key values found for that df_index value.
# E.g. for a data frame `plays` with multiple character_id values per play_id,
# it returns a data frame with one record per play_id and one list of
# character_ids per play.

def multiValDataFrame(df, df_index, multival_key):
    """Collapse rows sharing a ``df_index`` value into one row per value.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; may hold several rows per ``df_index`` value.
    df_index : str
        Name of the column that identifies a record (e.g. ``'play_id'``).
    multival_key : str
        Name of the column whose values are gathered into a list
        (e.g. ``'character_id'``).

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``df_index`` value, in order of first
        appearance, with the same columns as ``df``. The ``multival_key``
        cell holds a list of every value of that column for the record;
        every other cell holds the value from the record's first row.
        An empty ``df`` yields an empty frame with the same columns.

    Notes
    -----
    Rows whose ``df_index`` value is missing (NaN/None) are skipped, because
    ``groupby`` drops missing keys by default.
    """
    records = []
    # sort=False keeps the groups in the order their ids first appear in df.
    for _, group in df.groupby(df_index, sort=False):
        # Take the first row's value column by column so that each value
        # keeps its own column dtype.
        record = {col: group[col].iloc[0] for col in df.columns}
        record[multival_key] = list(group[multival_key])
        records.append(record)

    # Build the frame once from all records: DataFrame.append was removed in
    # pandas 2.0, and appending row by row copied the whole frame each time.
    return pd.DataFrame(records, columns=df.columns)

In [7]:
# Attach the image_ids to the kashira records, matching on kashira_id.
# NOTE(review): this comment used to say "left join", but the code performs an
# outer join (how='outer') — confirm which one is intended.
kashira = kashira.merge(kashira_images, how='outer', on='kashira_id')

# Combine the multiple image rows per kashira into one row per kashira,
# with image_id holding the list of all its image ids.
kashira = multiValDataFrame(df=kashira, df_index='kashira_id', multival_key='image_id')

In [8]:
# Preview the first five records after the image_ids have been merged in.
kashira.head(n=5)

Unnamed: 0,category,image_id,kashira_id,label,label_ka,sort_ja
0,Special,[nan],1,Amanjaku,あまんじゃく,あまんじゃく
1,Special,[nan],2,Ebisu,恵比須,えびす
2,Male,"[8281, 8282, 8283, 8284, 8287, 8289, 8291, 829...",3,Ōshūto,大舅,おおしゅうと
3,Male,"[8667, 8669, 8670, 8671, 8672, 8676, 8687, 868...",4,Ōdanshichi,大団七,おおだんしち
4,Children,[nan],5,Ochō no kodomo,お蝶の子供,おちょうのこども


In [9]:
# Light cleanup: put the columns in the order wanted for the export.
kashira = kashira[
    [
        'kashira_id',
        'label',
        'label_ka',
        'category',
        'image_id',
        'sort_ja',
    ]
]

In [10]:
# Write the kashira records to a UTF-8 CSV file, without the index column.
kashira.to_csv(
    'kashira_exp.csv',
    index=False,
    encoding='utf8',
)

In [11]:
# Write the kashira records to JSON as a list of objects (one per row),
# keeping non-ASCII characters such as kanji unescaped.
kashira.to_json(
    'kashira_exp.json',
    force_ascii=False,
    orient='records',
)