In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the three input tables.
# dtype=object stops pandas from inferring numeric types, so ids stay as the
# raw text from the file instead of turning into floats when a column has gaps.
tags, tags_imgs, imagesonline = (
    pd.read_csv(csv_path, dtype=object)
    for csv_path in ('in/tags.csv', 'in/tags_images.csv', 'in/imagesonline.csv')
)

In [3]:
# mild cleanup of the loaded frames
# NOTE(review): with regex=True and a non-string replacement value, this appears
# to set the WHOLE `notes` cell to NaN whenever it contains a newline (it does
# not just strip the newline character) -- confirm that discarding those notes
# is intended.
tags['notes'] = tags['notes'].replace({r'\n': np.nan}, regex=True)
# turn cells that are exactly the string 'nan' into real missing values
tags = tags.replace({'nan': np.nan})
# rename the key column so it matches the image_id column used for the join below
imagesonline = imagesonline.rename(columns = {'id':'image_id'})

In [4]:
# preview the first rows of tags after cleanup
tags.head()

Unnamed: 0,tag_id,label,label_ka,description,notes,sort_ja
0,1,Asahi Theater,朝日座,,,あさひざ
1,2,Butai geta [high wooden clogs],舞台下駄,High wooden clogs worn on stage by head puppet...,,ぶたいげた
2,3,Dō [body],胴,"Body, torso, or framework of puppets.",,どう
3,4,Dressing room,楽屋,,,がくや
4,5,Geza [room for offstage musicians],下座あるいは囃子部屋,Small room over stage-right entrance occupied ...,,げざ


In [5]:
# Clean out non-online images from the tag/image join table.
# Only the key column of imagesonline is needed for the filter.
imagesonline = imagesonline[['image_id']]
# print(...) with a single argument produces the same output on Python 2 and
# Python 3; the bare print statement is a SyntaxError on Python 3.
print('online: ' + str(len(imagesonline.index)) + ' / in join: ' + str(len(tags_imgs.index)) + '\n')
# The inner join keeps only join rows whose image_id appears in imagesonline.
tags_imgs = pd.merge(tags_imgs, imagesonline, on='image_id', how='inner')
print('online in join: ' + str(len(tags_imgs.index)))

online: 14636 / in join: 14918

online in join: 8959


In [6]:
# preview the join table after restricting it to online images
tags_imgs.head()

Unnamed: 0,tag_id,image_id
0,43,8281
1,73,8281
2,43,8282
3,73,8282
4,43,8283


In [7]:
def multiValDataFrame(df, df_index, multival_key):
    """Collapse rows that share a key into one row holding a list of values.

    E.g. for a data frame of plays with multiple character_id values per
    play_id, returns a data frame with one record per play_id and one list of
    character_ids per play.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame with possibly several rows per ``df_index`` value.
    df_index : str
        Name of the column whose distinct values define the output rows.
    multival_key : str
        Name of the column whose values are gathered into a list per key.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``df_index`` value, in order of first appearance,
        with the same columns (and column order) as ``df``. ``multival_key``
        holds the list of all values for that key, in row order; every other
        column holds the value from the first row of that key.

    Notes
    -----
    Rows whose ``df_index`` value is missing (NaN) are skipped, because
    ``groupby`` excludes missing keys by default.
    """
    records = []
    # sort=False keeps the groups in order of first appearance, which is the
    # order Series.unique() would give.
    for _, group in df.groupby(df_index, sort=False):
        # Take the first row column by column so each value keeps its own
        # dtype (pulling the row out as a Series could upcast mixed types).
        record = {col: group[col].iloc[0] for col in df.columns}
        record[multival_key] = list(group[multival_key])
        records.append(record)
    # Build the frame once from all records: appending row by row copies the
    # whole frame on every iteration, and DataFrame.append no longer exists in
    # pandas 2.x.
    return pd.DataFrame(records, columns=df.columns)

In [8]:
# Attach image_ids to the tags frame with an inner join (tags that have no
# row in tags_imgs are dropped, since we don't want tags without images),
# then fold the multiple images per tag into one record per tag.
tags = multiValDataFrame(
    tags.merge(tags_imgs, how='inner', on='tag_id'),
    'tag_id',
    'image_id',
)

In [9]:
# Rename to the export column names and keep only the exported columns, in
# this order (any other column, e.g. sort_ja, is dropped here).
tags = tags.rename(columns={'tag_id': 'id', 'label': 'label_eng'})[
    ['id', 'label_eng', 'label_ka', 'description', 'notes', 'image_id']
]
tags.head()

Unnamed: 0,id,label_eng,label_ka,description,notes,image_id
0,1,Asahi Theater,朝日座,,,"[44990, 45057, 45216, 46025, 56319, 56320, 563..."
1,2,Butai geta [high wooden clogs],舞台下駄,High wooden clogs worn on stage by head puppet...,,"[44981, 44986, 45010, 45017, 45024, 45050, 450..."
2,3,Dō [body],胴,"Body, torso, or framework of puppets.",,"[44978, 45071, 45235, 45244, 45246, 46033, 461..."
3,4,Dressing room,楽屋,,,"[44990, 44992, 44993, 44997, 44998, 45000, 450..."
4,5,Geza [room for offstage musicians],下座あるいは囃子部屋,Small room over stage-right entrance occupied ...,,"[11630, 11633, 44982, 44997, 45004, 45005, 452..."


In [10]:
# number of tags left after the join (one row per tag)
len(tags.index)

61

In [11]:
# export to csv (UTF-8, without the row index)
# NOTE(review): image_id holds Python lists at this point, so each cell is
# presumably written as the list's string form rather than as separate
# fields -- confirm that the consumer of this file expects that.
tags.to_csv('tags_exp.csv', encoding='utf8', index=False)

In [12]:
# export to json
# orient='records' writes a list with one object per row;
# force_ascii=False leaves non-ASCII text (the Japanese labels) unescaped.
tags.to_json('tags_exp.json', orient='records', force_ascii=False)