In [1]:
import os
import pandas as pd
from pandas import read_csv


In [44]:
# Set the directory path and file name
base_dir = r'E:\CS777-Big Data Analytics'

# Harvard 
harvObj_dir = os.path.join(base_dir, 'Harvard', 'chinese ink')

# Metropolitan 
metObj_dir = os.path.join(base_dir, 'Metropolitan')

# metpainting_file = os.path.join(metObj_dir, 'painting' + '.txt')

# Smithsonian
smisoObj_dir = os.path.join(base_dir, 'SmithSonian')


# meta_file = os.path.join(metObj_dir, 'metadata' + '.csv')

try:   
    # Attempt to read the Harvard images and metadata file into a DataFrame
    harv_images = [image_nm for image_nm in os.listdir(harvObj_dir)
                    if image_nm.endswith("jpg")]
    print('Opened Harvard image files:', len(harv_images))

    harv_metadata_file = [file for file in os.listdir(harvObj_dir) if file.endswith("csv")][0]
    harv_df = read_csv(os.path.join(harvObj_dir, 
                                    harv_metadata_file), delimiter=",")
    print('Opened Harvard metadata file: ', harv_df.shape)

    # Attempt to read the Metropolitan images and metadata file into a DataFrame
    met_images_dir = os.path.join(metObj_dir, 'met_images')
    met_images = [image_nm for image_nm in os.listdir(met_images_dir)
                    if image_nm.endswith("jpg")]
    print('Opened Metropolitan image files:', len(met_images))

    met_df = read_csv(os.path.join(metObj_dir, 'painting.csv'), delimiter=",")
    print('Opened Metropolitan metadata file: ', met_df.shape)

    # Attempt to read the Smithsonian images and metadata file into a DataFrame
    smiso_images_dir = os.path.join(smisoObj_dir, 'images')
    smiso_images = [image_nm for image_nm in os.listdir(smiso_images_dir)
                    if image_nm.endswith("jpg")]
    print('Opened Smithsonian image files:', len(smiso_images))

    smiso_df = read_csv(os.path.join(smisoObj_dir, 
                                    'smithsonian_content.csv'), delimiter=",")
    print('Opened Smithsonian metadata file: ', smiso_df.shape)

# # Load 'metadata.csv'
# metadata_df = pd.read_csv(meta_file)
# print('Opened data file:', meta_file)
    
except Exception as e:
    # Print an error message if opening the file fails
    print(e)
    print('Failed to open data file')


Opened Harvard image files: 93
Opened Harvard metadata file:  (93, 62)
Opened Metropolitan image files: 990
Opened Metropolitan metadata file:  (1965, 54)
Opened Smithsonian image files: 567
Opened Smithsonian metadata file:  (609, 15)


Harvard

In [45]:
harv_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 62 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   copyright                     1 non-null      object 
 1   contextualtextcount           93 non-null     int64  
 2   creditline                    93 non-null     object 
 3   accesslevel                   93 non-null     int64  
 4   dateoflastpageview            93 non-null     object 
 5   classificationid              93 non-null     int64  
 6   division                      93 non-null     object 
 7   markscount                    93 non-null     int64  
 8   publicationcount              93 non-null     int64  
 9   totaluniquepageviews          93 non-null     int64  
 10  contact                       93 non-null     object 
 11  colorcount                    93 non-null     int64  
 12  rank                          93 non-null     int64  
 13  id     

In [46]:
harv_cols = ["id", "title", "period"]
harv_df["caption"] = harv_df[["title", "period"]].apply(lambda x: f"{x.title} completed in {x.period}", axis=1)
harv_df["file_name"] = harv_df.id.apply(lambda x: f"{x}.jpg")

harv_metadata_df = harv_df[["file_name", "caption"]]

harv_metadata_df.head()

Unnamed: 0,file_name,caption
0,20971.jpg,Ink Landscape completed in nan
1,211205.jpg,Copy of a Chinese Landscape completed in Edo p...
2,189839.jpg,"Figure Painting completed in Qing dynasty, 164..."
3,55219.jpg,"Album of Paintings completed in Qing dynasty, ..."
4,340373.jpg,"Chinese Landscape completed in Edo period, 161..."


Metropolitan

In [47]:
pd.set_option('display.max_columns', None)

met_df.head(5)

Unnamed: 0,Object Number,Is Highlight,Is Timeline Work,Is Public Domain,Object ID,Gallery Number,Department,AccessionYear,Object Name,Title,Culture,Period,Dynasty,Reign,Portfolio,Constituent ID,Artist Role,Artist Prefix,Artist Display Name,Artist Display Bio,Artist Suffix,Artist Alpha Sort,Artist Nationality,Artist Begin Date,Artist End Date,Artist Gender,Artist ULAN URL,Artist Wikidata URL,Object Date,Object Begin Date,Object End Date,Medium,Dimensions,Credit Line,Geography Type,City,State,County,Country,Region,Subregion,Locale,Locus,Excavation,River,Classification,Rights and Reproduction,Link Resource,Object Wikidata URL,Metadata Date,Repository,Tags,Tags AAT URL,Tags Wikidata URL
0,09.3,False,False,True,35968,,Asian Art,1909,Pictorial map,Forts Zeelandia and Provintia and the City of ...,China,,,,,3750,Artist,,Unidentified artist,"Chinese, active 19th century",,Unidentified artist,,,,,,,19th century,1800,1899,Wall hanging; ink and color on deerskin,Image: 59 1/4 × 80 3/4 in. (150.5 × 205.1 cm)\...,"Gift of J. Pierpont Morgan, 1909",,,,,,,,,,,,Paintings,,http://www.metmuseum.org/art/collection/search...,https://www.wikidata.org/wiki/Q79003782,,"Metropolitan Museum of Art, New York, NY",Maps|Houses|Cities|Boats|Ships,http://vocab.getty.edu/page/aat/300028094|http...,https://www.wikidata.org/wiki/Q4006|https://ww...
1,12.37.135,False,False,False,35969,,Asian Art,1912,Hanging scroll,,China,Qing dynasty (1644–1911),,,,1214,Artist,,Jin Zunnian,"Chinese, active early 18th century",,Jin Zunnian,Chinese,1700,1800,,,,dated 1732,1732,1732,Hanging scroll; ink and color on silk,67 x 38 in. (170.2 x 96.5 cm),"Rogers Fund, 1912",,,,,,,,,,,,Paintings,,http://www.metmuseum.org/art/collection/search...,,,"Metropolitan Museum of Art, New York, NY",,,
2,13.100.22,False,False,True,35970,,Asian Art,1913,Hanging scroll,Song of the Lute,China,late Ming dynasty (1368–1644),,,,1206,Artist,,Ding Yunpeng,"Chinese, 1547–ca. 1628",,Ding Yunpeng,Chinese,1547,1628,,http://vocab.getty.edu/page/ulan/500125747,https://www.wikidata.org/wiki/Q376079,1585,1585,1585,Hanging scroll; ink and color on paper,Image: 55 5/8 x 18 1/8 in. (141.3 x 46 cm),"John Stewart Kennedy Fund, 1913",,,,,,,,,,,,Paintings,,http://www.metmuseum.org/art/collection/search...,https://www.wikidata.org/wiki/Q79003726,,"Metropolitan Museum of Art, New York, NY",Mountains|Trees|Boats,http://vocab.getty.edu/page/aat/300008795|http...,https://www.wikidata.org/wiki/Q8502|https://ww...
3,13.100.25,False,False,True,35971,,Asian Art,1913,Hanging scroll,Landscape after Li Cheng,China,Qing dynasty (1644–1911),,,,116363750,Artist|Artist,After|,Wang Hui|Unidentified artist,"Chinese, 1632–1717|",|,Wang Hui|Unidentified artist,Chinese|,1632 |,1717 |,|,http://vocab.getty.edu/page/ulan/500327902|,https://www.wikidata.org/wiki/Q716222|,"18th century or later, spurious date of 1680",1700,1911,Hanging scroll; ink on silk,14 1/2 x 11 1/8 in. (36.8 x 28.3 cm),"John Stewart Kennedy Fund, 1913",,,,,,,,,,,,Paintings,,http://www.metmuseum.org/art/collection/search...,https://www.wikidata.org/wiki/Q79003601,,"Metropolitan Museum of Art, New York, NY",Mountains|Landscapes,http://vocab.getty.edu/page/aat/300008795|http...,https://www.wikidata.org/wiki/Q8502|https://ww...
4,13.100.40,False,False,True,35972,,Asian Art,1913,Folding fan mounted as an album leaf,Gathering Water Chestnuts,China,Qing dynasty (1644–1911),,,,116363750,Artist|Artist,After|,Wang Hui|Unidentified artist,"Chinese, 1632–1717|",|,Wang Hui|Unidentified artist,Chinese|,1632 |,1717 |,|,http://vocab.getty.edu/page/ulan/500327902|,https://www.wikidata.org/wiki/Q716222|,"18th century or later, spurious date of 1706",1700,1911,Folding fan mounted as an album leaf; ink and ...,6 3/4 x 20 in. (17.1 x 50.8 cm),"John Stewart Kennedy Fund, 1913",,,,,,,,,,,,Paintings,,http://www.metmuseum.org/art/collection/search...,https://www.wikidata.org/wiki/Q79003460,,"Metropolitan Museum of Art, New York, NY",Boats|Houses|Landscapes,http://vocab.getty.edu/page/aat/300178749|http...,https://www.wikidata.org/wiki/Q35872|https://w...


In [48]:
filtered_met_df = met_df[met_df['Classification'].isin(['Painting', 'Paintings']) & 
                                      (met_df['Culture'] == 'China')]

filtered_met_df['file_name'] = 'image_' + filtered_met_df['Object ID'].astype(str) + '.jpg'

filtered_met_df = filtered_met_df[['file_name'] + [col for col in met_df.columns]]

filtered_met_df['file_name']

0        image_35968.jpg
1        image_35969.jpg
2        image_35970.jpg
3        image_35971.jpg
4        image_35972.jpg
              ...       
1960    image_854002.jpg
1961    image_854220.jpg
1962    image_855071.jpg
1963    image_857205.jpg
1964    image_860963.jpg
Name: file_name, Length: 1965, dtype: object

In [49]:
filtered_met_df['Title'].head(10)

0    Forts Zeelandia and Provintia and the City of ...
1                                                  NaN
2                                     Song of the Lute
3                             Landscape after Li Cheng
4                            Gathering Water Chestnuts
5                   Landscape in the Style of Huichong
6                                            Landscape
7                 Landscape in the Style of Yan Wengui
8                 Landscape in the manner of Dong Yuan
9                                            Landscape
Name: Title, dtype: object

In [50]:
cleaned_met_df = filtered_met_df.copy()

# Title
cleaned_met_df['Title'] = cleaned_met_df['Title'].str.split('|').str[0].fillna('')

# Period painting came from
cleaned_met_df['Period'] = cleaned_met_df['Period'].apply(lambda x: 'in the ' + 
                                                                x if not pd.isna(x) else '')
# Tags
temp = cleaned_met_df['Tags'].apply(lambda x: x if not pd.isna(x) else '').str.split('|')
temp = temp.apply(lambda x: [tag for tag in x])
temp = temp.apply(lambda x: ' and '.join(x) if len(x) > 1 else (x[0] if x else ''))
cleaned_met_df['Tags'] = temp

# Object Type
cleaned_met_df['Object Name'] = cleaned_met_df['Object Name'].apply(lambda x: 'in a ' + 
                                                                    x if not pd.isna(x) else '')

cleaned_met_df.head(5)

Unnamed: 0,file_name,Object Number,Is Highlight,Is Timeline Work,Is Public Domain,Object ID,Gallery Number,Department,AccessionYear,Object Name,Title,Culture,Period,Dynasty,Reign,Portfolio,Constituent ID,Artist Role,Artist Prefix,Artist Display Name,Artist Display Bio,Artist Suffix,Artist Alpha Sort,Artist Nationality,Artist Begin Date,Artist End Date,Artist Gender,Artist ULAN URL,Artist Wikidata URL,Object Date,Object Begin Date,Object End Date,Medium,Dimensions,Credit Line,Geography Type,City,State,County,Country,Region,Subregion,Locale,Locus,Excavation,River,Classification,Rights and Reproduction,Link Resource,Object Wikidata URL,Metadata Date,Repository,Tags,Tags AAT URL,Tags Wikidata URL
0,image_35968.jpg,09.3,False,False,True,35968,,Asian Art,1909,in a Pictorial map,Forts Zeelandia and Provintia and the City of ...,China,,,,,3750,Artist,,Unidentified artist,"Chinese, active 19th century",,Unidentified artist,,,,,,,19th century,1800,1899,Wall hanging; ink and color on deerskin,Image: 59 1/4 × 80 3/4 in. (150.5 × 205.1 cm)\...,"Gift of J. Pierpont Morgan, 1909",,,,,,,,,,,,Paintings,,http://www.metmuseum.org/art/collection/search...,https://www.wikidata.org/wiki/Q79003782,,"Metropolitan Museum of Art, New York, NY",Maps and Houses and Cities and Boats and Ships,http://vocab.getty.edu/page/aat/300028094|http...,https://www.wikidata.org/wiki/Q4006|https://ww...
1,image_35969.jpg,12.37.135,False,False,False,35969,,Asian Art,1912,in a Hanging scroll,,China,in the Qing dynasty (1644–1911),,,,1214,Artist,,Jin Zunnian,"Chinese, active early 18th century",,Jin Zunnian,Chinese,1700,1800,,,,dated 1732,1732,1732,Hanging scroll; ink and color on silk,67 x 38 in. (170.2 x 96.5 cm),"Rogers Fund, 1912",,,,,,,,,,,,Paintings,,http://www.metmuseum.org/art/collection/search...,,,"Metropolitan Museum of Art, New York, NY",,,
2,image_35970.jpg,13.100.22,False,False,True,35970,,Asian Art,1913,in a Hanging scroll,Song of the Lute,China,in the late Ming dynasty (1368–1644),,,,1206,Artist,,Ding Yunpeng,"Chinese, 1547–ca. 1628",,Ding Yunpeng,Chinese,1547,1628,,http://vocab.getty.edu/page/ulan/500125747,https://www.wikidata.org/wiki/Q376079,1585,1585,1585,Hanging scroll; ink and color on paper,Image: 55 5/8 x 18 1/8 in. (141.3 x 46 cm),"John Stewart Kennedy Fund, 1913",,,,,,,,,,,,Paintings,,http://www.metmuseum.org/art/collection/search...,https://www.wikidata.org/wiki/Q79003726,,"Metropolitan Museum of Art, New York, NY",Mountains and Trees and Boats,http://vocab.getty.edu/page/aat/300008795|http...,https://www.wikidata.org/wiki/Q8502|https://ww...
3,image_35971.jpg,13.100.25,False,False,True,35971,,Asian Art,1913,in a Hanging scroll,Landscape after Li Cheng,China,in the Qing dynasty (1644–1911),,,,116363750,Artist|Artist,After|,Wang Hui|Unidentified artist,"Chinese, 1632–1717|",|,Wang Hui|Unidentified artist,Chinese|,1632 |,1717 |,|,http://vocab.getty.edu/page/ulan/500327902|,https://www.wikidata.org/wiki/Q716222|,"18th century or later, spurious date of 1680",1700,1911,Hanging scroll; ink on silk,14 1/2 x 11 1/8 in. (36.8 x 28.3 cm),"John Stewart Kennedy Fund, 1913",,,,,,,,,,,,Paintings,,http://www.metmuseum.org/art/collection/search...,https://www.wikidata.org/wiki/Q79003601,,"Metropolitan Museum of Art, New York, NY",Mountains and Landscapes,http://vocab.getty.edu/page/aat/300008795|http...,https://www.wikidata.org/wiki/Q8502|https://ww...
4,image_35972.jpg,13.100.40,False,False,True,35972,,Asian Art,1913,in a Folding fan mounted as an album leaf,Gathering Water Chestnuts,China,in the Qing dynasty (1644–1911),,,,116363750,Artist|Artist,After|,Wang Hui|Unidentified artist,"Chinese, 1632–1717|",|,Wang Hui|Unidentified artist,Chinese|,1632 |,1717 |,|,http://vocab.getty.edu/page/ulan/500327902|,https://www.wikidata.org/wiki/Q716222|,"18th century or later, spurious date of 1706",1700,1911,Folding fan mounted as an album leaf; ink and ...,6 3/4 x 20 in. (17.1 x 50.8 cm),"John Stewart Kennedy Fund, 1913",,,,,,,,,,,,Paintings,,http://www.metmuseum.org/art/collection/search...,https://www.wikidata.org/wiki/Q79003460,,"Metropolitan Museum of Art, New York, NY",Boats and Houses and Landscapes,http://vocab.getty.edu/page/aat/300178749|http...,https://www.wikidata.org/wiki/Q35872|https://w...


In [51]:
cleaned_met_df[cleaned_met_df['Title'] == '']['Title']


1        
25       
28       
35       
49       
       ..
1933     
1941     
1950     
1955     
1964     
Name: Title, Length: 613, dtype: object

In [52]:
cleaned_met_df['caption'] = cleaned_met_df['Tags'].where(cleaned_met_df['Tags'] != '', cleaned_met_df['Title'])


In [53]:
cleaned_met_df[(cleaned_met_df['caption'] == '')].count()


file_name                  343
Object Number              343
Is Highlight               343
Is Timeline Work           343
Is Public Domain           343
Object ID                  343
Gallery Number               0
Department                 343
AccessionYear              343
Object Name                343
Title                      343
Culture                    343
Period                     343
Dynasty                      0
Reign                        0
Portfolio                    0
Constituent ID             330
Artist Role                330
Artist Prefix              330
Artist Display Name        330
Artist Display Bio         330
Artist Suffix              330
Artist Alpha Sort          330
Artist Nationality         330
Artist Begin Date          330
Artist End Date            330
Artist Gender               38
Artist ULAN URL            202
Artist Wikidata URL        203
Object Date                246
Object Begin Date          343
Object End Date            343
Medium  

In [54]:
met_metadata_df = pd.DataFrame({
    'file_name': cleaned_met_df['file_name'],
    'caption': cleaned_met_df['caption'] 
    # + ' by ' + cleaned_paintings['Artist Display Name'] 
    # + cleaned_paintings['Object Name'] 
    # + cleaned_paintings['Period']
})

print(met_metadata_df.head(5))

         file_name                                         caption
0  image_35968.jpg  Maps and Houses and Cities and Boats and Ships
1  image_35969.jpg                                                
2  image_35970.jpg                   Mountains and Trees and Boats
3  image_35971.jpg                        Mountains and Landscapes
4  image_35972.jpg                 Boats and Houses and Landscapes


Smithsonian

In [55]:
smiso_df['id'] = smiso_df['id'].apply(lambda x: x + '.jpg' if not pd.isna(x) else '')

smiso_df['title'] = smiso_df['title'].apply(lambda x: x if not pd.isna(x) else '')

smiso_metadata_df = pd.DataFrame({
    'file_name': smiso_df['id'],
    'caption': smiso_df['title']
})

smiso_metadata_df.head(5)

Unnamed: 0,file_name,caption
0,ld1-1643390182193-1643390192549-1.jpg,Birds in Wintry Trees
1,ld1-1643390182193-1643390192552-0.jpg,Listening to the Pines in a Riverside Pavilion
2,ld1-1643390182193-1643390193858-0.jpg,Wind in the Courtyard Pines
3,ld1-1643390182193-1643390193584-0.jpg,Two Fighting Water Buffaloes
4,ld1-1643390182193-1643390191987-0.jpg,Portrait of Wang Huan


Match painting IDs in metadata with raw files in painting folder

In [64]:
# Filter out the indexes in metadata files that are not in the painting folders
harv_final_metadata = harv_metadata_df[harv_metadata_df['file_name'].isin(harv_images)]
metro_final_metadata = met_metadata_df[met_metadata_df['file_name'].isin(met_images)]
smiso_final_metadata = smiso_metadata_df[smiso_metadata_df['file_name'].isin(smiso_images)]

In [65]:
# Append the new DataFrame to metadata_df
combined_metadata_df = pd.concat([harv_final_metadata, metro_final_metadata, smiso_final_metadata], ignore_index=True)
combined_metadata_df.shape

(1650, 2)

In [66]:
# need manual labelling
combined_metadata_df[combined_metadata_df['caption']== '']

Unnamed: 0,file_name,caption
94,image_35969.jpg,
142,image_36032.jpg,
150,image_36045.jpg,
168,image_36067.jpg,
180,image_36092.jpg,
206,image_36144.jpg,
325,image_40068.jpg,
357,image_40334.jpg,
367,image_40425.jpg,
463,image_44710.jpg,


In [67]:
# Set the directory path and file name
new_metadata_file = os.path.join(base_dir, 'metadata' + '.csv')

# Exporting to CSV
combined_metadata_df.to_csv(new_metadata_file, index=False)

print("DataFrame exported to new_dataframe.csv")

DataFrame exported to new_dataframe.csv
