In [1]:
import pandas as pd

In [2]:
# read csvs as data frames
# cast as string to avoid float/NaN errors

authors = pd.read_csv('in/authors.csv').astype(str)
play_auth = pd.read_csv('in/authors_plays.csv').astype(str)

In [3]:
authors.head()

Unnamed: 0,author_id,label,label_ka,dates,reference,sort_ja
0,1,Asada Icchō,浅田一鳥,fl. 1741-1767,LC Authorities,あさだいっちょう
1,2,Ashikawa Teruha,芦川照葉,,LC Authorities,あしかわてるは
2,4,Chikamatsu Hanji,近松半二,d. 1786 or 7,LC Authorities,ちかまつはんじ
3,5,Chikamatsu Kosuiken,近松湖水軒,,"繪本太功記 / 近松やなぎ, 近松湖水軒, 千葉軒合作, 東京 : 金櫻堂, 1890.12...",ちかまつこすいけん
4,6,Chikamatsu Monzaemon,近松門左衛門,,LC Authorities,ちかまつもんざえもん


In [4]:
# multiValDataFrame takes a dataframe df, column name df_index and column name multival_key
# it merges records with the same df_index and updates multival_index to an array of all the multival_index values.
# E.g for a data frame plays with multiple character_id values per play_id, returns a data frame with one record 
# per play_id, and one array of character_ids per play.

def multiValDataFrame(df, df_index, multival_key):
    new_df = pd.DataFrame({})
    cols = df.columns
    id_list = df[df_index].unique()
    for i in id_list:
        temp_row = df.loc[df[df_index] == i ]
        vals = []
        for col in cols:
            vals.append(temp_row[:1][col].values[0])

        temp_dict = dict(zip(cols, vals))
        multi_id_list = list(temp_row[multival_key])
    
        temp_dict[multival_key] = multi_id_list
    
        new_df = new_df.append(temp_dict, ignore_index=True)
    return new_df

In [5]:
# left join to add play_ids to author dataframe
authors = pd.merge(authors, play_auth, on='author_id', how='left')

# combine multiple plays per author into one record per author
authors = multiValDataFrame(authors, 'author_id', 'play_id')

In [6]:
authors.head()

Unnamed: 0,author_id,dates,label,label_ka,play_id,reference,sort_ja
0,1,fl. 1741-1767,Asada Icchō,浅田一鳥,"[19, 72, 105, 122]",LC Authorities,あさだいっちょう
1,2,,Ashikawa Teruha,芦川照葉,[173],LC Authorities,あしかわてるは
2,4,d. 1786 or 7,Chikamatsu Hanji,近松半二,"[11, 21, 25, 27, 29, 33, 52, 79, 80, 90, 101, ...",LC Authorities,ちかまつはんじ
3,5,,Chikamatsu Kosuiken,近松湖水軒,[16],"繪本太功記 / 近松やなぎ, 近松湖水軒, 千葉軒合作, 東京 : 金櫻堂, 1890.12...",ちかまつこすいけん
4,6,,Chikamatsu Monzaemon,近松門左衛門,"[1, 2, 3, 4, 7, 8, 17, 24, 28, 34, 40, 41, 54,...",LC Authorities,ちかまつもんざえもん


In [7]:
# save final dataframe to csv file
authors.to_csv('authors_exp.csv', encoding='utf8', index=False)

In [8]:
# save final dataframe to json file
authors.to_json('authors_exp.json', orient="records", force_ascii=False)