In [2]:
import pandas as pd
import os

In [3]:
df = pd.read_pickle(os.path.join('.', 'data_frame.pickle'))

In [4]:
df.artist

id
1035           Blake, Robert
1036           Blake, Robert
1037           Blake, Robert
1038           Blake, Robert
1039          Blake, William
                 ...        
122960    P-Orridge, Genesis
122961    P-Orridge, Genesis
121181          Hatoum, Mona
112306         Creed, Martin
127035     Brunias, Agostino
Name: artist, Length: 69201, dtype: object

In [5]:
artists = df['artist']
pd.unique(artists)
len(pd.unique(artists))

3336

In [6]:
s = df['artist'] == 'Bacon, Francis'
s.value_counts()

artist
False    69151
True        50
Name: count, dtype: int64

In [7]:
# Other way to do the above:
artist_counts = df['artist'].value_counts()
artist_counts['Bacon, Francis']

50

### Indexing done the right way...

## We will:
1. Practice using "loc" and "iloc"
2. Find the biggest artwork in the collection by size
3. Learn how to deal with common problems in data analysis

In [8]:
# Demo
df.loc[1035, 'artist']
df.iloc[0, 0]
df.iloc[0, :]
df.iloc[0:2, 0:2]

Unnamed: 0_level_0,artist,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1035,"Blake, Robert",A Figure Bowing before a Seated Old Man with h...
1036,"Blake, Robert","Two Drawings of Frightened Figures, Probably f..."


In [10]:
# Try multiplication
df['height'] * df['width']

TypeError: can't multiply sequence by non-int of type 'float'

In [14]:
df['width'].sort_values().head()

id
20822            (1):
105337    (diameter):
98671         (each):
76420         (each):
91391        (image):
Name: width, dtype: object

In [15]:
df['width'].sort_values().tail()

id
121283    NaN
117863    NaN
120549    NaN
122900    NaN
112306    NaN
Name: width, dtype: object

In [16]:
# Try to convert
pd.to_numeric(df['width'])

ValueError: Unable to parse string "(upper):" at position 1839

In [17]:
# Force NaNs
pd.to_numeric(df['width'], errors='coerce')

id
1035      394.0
1036      311.0
1037      343.0
1038      318.0
1039      243.0
          ...  
122960    305.0
122961    305.0
121181     45.0
112306      NaN
127035    508.0
Name: width, Length: 69201, dtype: float64

In [18]:
df.loc[:, 'width'] = pd.to_numeric(df['width'], errors = 'coerce')

In [19]:
pd.to_numeric(df['height'], errors = 'coerce')
df.loc[:, 'height'] = pd.to_numeric(df['height'], errors = 'coerce')

In [20]:
df['height'] * df['width']

id
1035      165086.0
1036       66243.0
1037      160181.0
1038      125292.0
1039       81405.0
            ...   
122960     93025.0
122961     93025.0
121181    108450.0
112306         NaN
127035    335280.0
Length: 69201, dtype: object

In [21]:
df['units'].value_counts()

units
mm    65860
Name: count, dtype: int64

In [22]:
# Assign - create new columns with size
area = df['height'] * df['width']
df = df.assign(area = area)

In [23]:
df['area'].max()

132462000.0

In [24]:
df['area'].idxmax()

98367

In [25]:
df.loc[df['area'].idxmax(), :]

artist                               Therrien, Robert
title                No Title (Table and Four Chairs)
medium             Aluminium, steel, wood and plastic
year                                           2003.0
acquisitionYear                                2008.0
width                                          8920.0
height                                        14850.0
units                                              mm
area                                      132462000.0
Name: 98367, dtype: object

In [None]:
"""KEYS_TO_USE = ['id', 'all_artists', 'title', 'medium', 
               'acquisitionYear', 'height', 
               'width', 'units']"""


In [None]:
""" This function takes the file path and keys 
and then forms and returns the tuple

"""
"""def get_record_from_file(file_path, keys_to_use):
    with open(file_path) as artwork_file:
        content = json.load(artwork_file)
        
    record = []
    for field in keys_to_use:
        record.append(content[field])

    return tuple(record)"""

In [None]:
""" This function traverses the directories with JSON files.
For first file in each directory call function for processing 
single file and go to the next directory.
"""

"""def read_artworks_from_json(keys_to_use):
    JSON_ROOT = os.path.join('.', 'artworks')
    
    artworks = []
    for root, _, files in os.walk(JSON_ROOT):
        for f in files:
            if f.endswith('json'):
                record = get_record_from_file(os.path.join(root, f),
                                             keys_to_use)
                artworks.append(record)
            break
            
    df = pd.DataFrame.from_records(artworks,
                                columns = keys_to_use,
                                index = 'id')
    
    return df"""

In [None]:
"""df = read_artworks_from_json(KEYS_TO_USE)
df"""