In [1]:
import pandas as pd

In [2]:
# Load every source CSV into its own data frame.
# dtype=object turns off pandas' type inference so values stay as read
# (original note: avoids float/NaN errors).

(
    performers,
    performers_images,
    imagesonline,
    shamisenplayers,
    musicians,
    narrators,
    spucks,
    pscenes,
) = (
    pd.read_csv(csv_path, dtype=object)
    for csv_path in (
        'in/performers.csv',
        'in/performers_images.csv',
        'in/imagesonline.csv',
        'in/sceneshamisens.csv',    # -> shamisenplayers
        'in/scenekotokokyus.csv',   # -> musicians
        'in/scenetayus.csv',        # -> narrators
        'in/spucks.csv',
        'in/pscenes.csv',
    )
)

In [3]:
# Give the performer columns their working names, then keep only the fields used below.
performers = performers.rename(
    columns={
        'id': 'performer_id',
        'name_kanji': 'name_ka',
        'alt_name_kanji': 'alt_name_ka',
    }
)[[
    'performer_id', 'name_proper', 'alt_name', 'name_ka',
    'alt_name_ka', 'specialty', 'dates', 'notes',
]]
performers.head()

Unnamed: 0,performer_id,name_proper,alt_name,name_ka,alt_name_ka,specialty,dates,notes
0,1,Takemoto Datejidayū (see also Takemoto Datetay...,Takemoto Datetayū V,竹本伊達路大夫,竹本伊達大夫 (五),Tayu,'1950-1988/04',"Bunraku meikan (1980, 1985, 1988, 1990, 1994)"
1,2,Takemoto Tsudayū IV,,竹本津大夫 (四),,Tayu,'1950-',[Living National Treasure]; Bunraku meikan (19...
2,3,Tsuruzawa Kanotarō,,鶴澤叶太郎,,Shamisen,'1913-',"Bunraku meikan (1980, 1985, 1988, 1990, 1994)"
3,4,Takezawa Danshichi (see also Takezawa Danjirō IV),Takezawa Danjirō IV,竹澤団七,竹澤団二郎 (四),Shamisen,'1981/04-',"Bunraku meikan (1980, 1985, 1988, 1990, 1994)"
4,5,Toyotake Matsukadayū,,豊竹松香大夫,,Tayu,'1959-',"Bunraku meikan (1980, 1985, 1988, 1990, 1994)"


In [4]:
# Keep only performer/image links whose image is listed in imagesonline.
# The inner join on image_id discards every link to an image missing from that list.
imagesonline = imagesonline.rename(columns={'id': 'image_id'})[['image_id']]
performers_images = (
    performers_images
    .rename(columns={'visual_id': 'image_id'})
    .merge(imagesonline, on='image_id', how='inner')
)

In [5]:
# Build the pscene -> performance join table: one (pscene_id, perf_id) pair per row.
pscenes = pscenes[['id', 'perf_id']].rename(columns={'id': 'pscene_id'})

Unnamed: 0,pscene_id,perf_id
0,1,2
1,2,2
2,3,3
3,4,3
4,5,4


In [9]:
# Remove repeated (pscene_id, perf_id) rows, keeping the first occurrence of each.
# The result is rebound instead of using inplace=True, so the frame object built
# by the previous cell is not mutated behind the reader's back.
pscenes = pscenes.drop_duplicates(keep='first')
pscenes.head()

Unnamed: 0,pscene_id,perf_id
0,1,2
1,2,2
2,3,3
3,4,3
4,5,4


In [10]:
# multiValDataFrame takes a data frame df, a column name df_index and a column name multival_key.
# It merges records that share the same df_index value and replaces multival_key with a list of all
# the multival_key values found for that df_index value.
# E.g. for a data frame of plays with multiple character_id values per play_id, it returns a data frame
# with one record per play_id and one list of character_ids per play.

def multiValDataFrame(df, df_index, multival_key):
    """Collapse df to one row per df_index value, gathering multival_key into a list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may hold several rows per ``df_index`` value.
    df_index : str
        Name of the column identifying a record (e.g. ``'play_id'``).
    multival_key : str
        Name of the column whose values are collected per record
        (e.g. ``'character_id'``).

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``df_index`` value, in order of first appearance.
        Every column except ``multival_key`` carries the value from the first
        row of its group; ``multival_key`` holds a list of all the group's
        values, in row order. Duplicates and missing values are kept, so a
        record without any match yields ``[nan]``. An empty input returns an
        empty frame with the same columns.

    Notes
    -----
    Rows whose ``df_index`` is missing (NaN) are skipped by ``groupby``.
    """
    records = []
    # sort=False keeps groups in order of first appearance instead of sorting the keys.
    for _, group in df.groupby(df_index, sort=False):
        record = group.iloc[0].to_dict()
        record[multival_key] = list(group[multival_key])
        records.append(record)
    # Build the frame once from all records: DataFrame.append was removed in
    # pandas 2.0, and appending row by row copied the whole frame every time.
    return pd.DataFrame(records, columns=df.columns)

In [11]:
# Attach the image links to each performer (left join keeps performers without images),
# then fold the one-row-per-image result back to one row per performer with a list of image_ids.
performers = multiValDataFrame(
    performers.merge(performers_images, on='performer_id', how='left'),
    'performer_id',
    'image_id',
)

In [12]:
# Translate each shamisen credit from its scene production to the performance it
# belongs to, then collect those performance ids into one list per performer.
shamisenplayers = (
    shamisenplayers[['sceneproduction_id', 'shamisen_id']]
    .rename(columns={'shamisen_id': 'performer_id', 'sceneproduction_id': 'pscene_id'})
    .merge(pscenes, on='pscene_id', how='left')
    # Keyword form: drop() no longer accepts the axis as a positional argument in pandas 2.0.
    .drop(columns='pscene_id')
    .rename(columns={'perf_id': 'shamisen_perf_id'})
)
# Left join so performers without shamisen credits are kept (their list becomes [nan]).
performers = pd.merge(performers, shamisenplayers, on='performer_id', how='left')
performers = multiValDataFrame(performers, 'performer_id', 'shamisen_perf_id')

In [13]:
# Translate each koto/kokyu credit from its scene production to the performance it
# belongs to, then collect those performance ids into one list per performer.
musicians = (
    musicians[['sceneproduction_id', 'kotokokyu_id']]
    .rename(columns={'kotokokyu_id': 'performer_id', 'sceneproduction_id': 'pscene_id'})
    .merge(pscenes, on='pscene_id', how='left')
    # Keyword form: drop() no longer accepts the axis as a positional argument in pandas 2.0.
    .drop(columns='pscene_id')
    .rename(columns={'perf_id': 'musician_perf_id'})
)
# Left join so performers without musician credits are kept (their list becomes [nan]).
performers = pd.merge(performers, musicians, on='performer_id', how='left')
performers = multiValDataFrame(performers, 'performer_id', 'musician_perf_id')

In [14]:
# Translate each tayu (narrator) credit from its scene production to the performance
# it belongs to, then collect those performance ids into one list per performer.
narrators = (
    narrators[['sceneproduction_id', 'tayu_id']]
    .rename(columns={'tayu_id': 'performer_id', 'sceneproduction_id': 'pscene_id'})
    .merge(pscenes, on='pscene_id', how='left')
    # Keyword form: drop() no longer accepts the axis as a positional argument in pandas 2.0.
    .drop(columns='pscene_id')
    .rename(columns={'perf_id': 'narrator_perf_id'})
)
# Left join so performers without narrator credits are kept (their list becomes [nan]).
performers = pd.merge(performers, narrators, on='performer_id', how='left')
performers = multiValDataFrame(performers, 'performer_id', 'narrator_perf_id')

In [15]:
# Translate each puppeteer credit from its scene production to the performance it
# belongs to, then collect those performance ids into one list per performer.
spucks = (
    spucks[['pscene_id', 'puppeteer_id']]
    .rename(columns={'puppeteer_id': 'performer_id'})
    .merge(pscenes, on='pscene_id', how='left')
    # Keyword form: drop() no longer accepts the axis as a positional argument in pandas 2.0.
    .drop(columns='pscene_id')
    .rename(columns={'perf_id': 'puppeteer_perf_id'})
)
# Left join so performers without puppeteer credits are kept (their list becomes [nan]).
performers = pd.merge(performers, spucks, on='performer_id', how='left')
performers = multiValDataFrame(performers, 'performer_id', 'puppeteer_perf_id')

In [16]:
# Row count of the aggregated table (one row per performer_id after multiValDataFrame).
performers.shape[0]

184

In [17]:
# Switch to the export column names and put the columns in their final order:
# identity fields first, then the list-valued image and performance-id columns.
performers = performers.rename(
    columns={'name_proper': 'name', 'performer_id': 'id'}
)[[
    'id', 'name', 'name_ka', 'alt_name', 'alt_name_ka',
    'dates', 'notes', 'specialty',
    'image_id',
    'musician_perf_id', 'narrator_perf_id', 'puppeteer_perf_id', 'shamisen_perf_id',
]]


In [18]:
# Preview the first five rows of the final table.
performers.head(n=5)

Unnamed: 0,id,name,name_ka,alt_name,alt_name_ka,dates,notes,specialty,image_id,musician_perf_id,narrator_perf_id,puppeteer_perf_id,shamisen_perf_id
0,1,Takemoto Datejidayū (see also Takemoto Datetay...,竹本伊達路大夫,Takemoto Datetayū V,竹本伊達大夫 (五),'1950-1988/04',"Bunraku meikan (1980, 1985, 1988, 1990, 1994)",Tayu,"[8469, 8470, 9055, 9056, 45321, 52421, 52552, ...",[nan],"[17, 10, 19, 21, 24, 24, 25, 27, 28, 31, 32, 3...",[nan],[nan]
1,2,Takemoto Tsudayū IV,竹本津大夫 (四),,,'1950-',[Living National Treasure]; Bunraku meikan (19...,Tayu,"[8588, 9054, 9055, 9056, 9616, 9617, 9618, 961...",[nan],"[16, 10, 18, 24, 25, 27, 27, 28, 28, 32, 37, 3...",[nan],[nan]
2,3,Tsuruzawa Kanotarō,鶴澤叶太郎,,,'1913-',"Bunraku meikan (1980, 1985, 1988, 1990, 1994)",Shamisen,"[45321, 45449, 52634, 52751, 52755, 52819, 528...",[nan],[nan],[nan],"[14, 11, 18, 24, 25, 27, 28, 33, 35, 37, 40, 4..."
3,4,Takezawa Danshichi (see also Takezawa Danjirō IV),竹澤団七,Takezawa Danjirō IV,竹澤団二郎 (四),'1981/04-',"Bunraku meikan (1980, 1985, 1988, 1990, 1994)",Shamisen,"[12477, 12478, 12489]",[nan],[nan],[nan],"[227, 385, 389, 394, 399, 403, 407, 409, 415, ..."
4,5,Toyotake Matsukadayū,豊竹松香大夫,,,'1959-',"Bunraku meikan (1980, 1985, 1988, 1990, 1994)",Tayu,"[8337, 9715, 11874, 11875, 45449, 45681, 45691...",[nan],"[12, 14, 11, 18, 19, 22, 24, 24, 25, 27, 28, 3...",[nan],[270]


In [None]:
# Optional export of the aggregated performers table (disabled; uncomment to write the files).
# performers.to_csv('performers_exp.csv', encoding='utf8', index=False)
# performers.to_json('performers_exp.json', orient="records", force_ascii=False)