In [1]:
import pandas as pd

In [7]:
# Load the small artwork sample used for the first examples.
data = pd.read_csv('data/artwork_sample.csv')
# Only the last expression of a cell is rendered, so this preview is not shown;
# wrap it in display(...) to see it alongside the dtypes.
data.head(3)
# Inspect the dtype pandas inferred for each column.
data.dtypes

id                      int64
accession_number       object
artist                 object
artistRole             object
artistId                int64
title                  object
dateText               object
medium                 object
creditLine             object
year                  float64
acquisitionYear         int64
dimensions             object
width                   int64
height                  int64
depth                 float64
units                  object
inscription           float64
thumbnailCopyright    float64
thumbnailUrl           object
url                    object
dtype: object

If some rows of a column contain values that cannot be parsed as numbers, `read_csv` has to fall back to the `object` dtype (i.e. strings) for that column.

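Check the data types to see if they make sense. If not, e.g. if a year column is cast as `object`, then the column likely contains non-numeric or otherwise incorrect values. (Missing values alone do not produce `object`: the sample's `year` column has missing entries and is still `float64`.)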

In [11]:
# Cast the acquisition year from int64 to float64 and store it back on the dataframe.
data['acquisitionYear'] = data['acquisitionYear'].astype(float)

In [12]:
# Confirm the cast took effect.
data['acquisitionYear'].dtype

dtype('float64')

In [14]:
# now working with full dataset
# low_memory=False parses the file in one pass instead of in chunks, which
# avoids mixed-type inference within a column.
fulldf = pd.read_csv('data/artwork_data.csv', low_memory=False)
# Not rendered: only the last expression of a cell is displayed.
fulldf.head(3)
# Unlike in the sample, year, width and height come back as object columns.
fulldf.dtypes

id                      int64
accession_number       object
artist                 object
artistRole             object
artistId                int64
title                  object
dateText               object
medium                 object
creditLine             object
year                   object
acquisitionYear       float64
dimensions             object
width                  object
height                 object
depth                 float64
units                  object
inscription            object
thumbnailCopyright     object
thumbnailUrl           object
url                    object
dtype: object

In [15]:
# Expected to fail: the height column contains the non-numeric string 'mm'.
fulldf['height'].astype(float)

ValueError: could not convert string to float: 'mm'

In [16]:
# Raises as well, but the message reports the position of the offending value.
pd.to_numeric(fulldf['height'])

ValueError: Unable to parse string "mm" at position 41339

In [17]:
# errors='coerce' replaces anything that cannot be parsed with NaN instead of raising.
pd.to_numeric(fulldf['height'], errors='coerce')

0         419.0
1         213.0
2         467.0
3         394.0
4         335.0
          ...  
69196     305.0
69197     305.0
69198    2410.0
69199       NaN
69200     660.0
Name: height, Length: 69201, dtype: float64

In [18]:
# Still object: pd.to_numeric returned a new Series, and that result was never
# assigned back to the dataframe.
fulldf['height'].dtype

dtype('O')

In [20]:
# Store the coerced values back on the dataframe so the conversion persists.
fulldf['height'] = pd.to_numeric(fulldf['height'], errors='coerce')
fulldf['height'].dtype

dtype('float64')

# Aggregating data

In [22]:
# Year column of the sample; missing years show up as NaN.
data['year']

0       NaN
1       NaN
2    1785.0
3       NaN
4    1826.0
5    1826.0
6    1826.0
7    1826.0
8    1826.0
9    1826.0
Name: year, dtype: float64

In [23]:
# Summary statistics are available per column; NaN values are skipped.
data['year'].min()

1785.0

In [24]:
# Latest non-missing year in the sample.
data['year'].max()

1826.0

In [25]:
# Total of the non-missing years (shown to demonstrate .sum(), not a meaningful figure).
data['year'].sum()

12741.0

In [26]:
# Mean of the non-missing years.
data['year'].mean()

1820.142857142857

In [28]:
# Sample standard deviation (pandas normalises by N-1 by default).
data['year'].std()

15.496543393378316

In [31]:
# another way to get summary stats: one row per statistic, one column per field
# NOTE(review): 'mean' and 'std' are undefined for the object columns; recent
# pandas releases may raise here instead of returning NaN — confirm, or select
# the numeric columns first.
data.agg(['min','max','mean','std'])

Unnamed: 0,id,accession_number,artist,artistRole,artistId,title,dateText,medium,creditLine,year,acquisitionYear,dimensions,width,height,depth,units,inscription,thumbnailCopyright,thumbnailUrl,url
min,1035.0,A00001,"Blake, Robert",artist,38.0,A Figure Bowing before a Seated Old Man with h...,"1826–7, reprinted 1892",Graphite on paper,Presented by Mrs John Richmond 1922,1785.0,1919.0,image: 240 x 338 mm,240.0,213.0,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-a-fi...
max,1044.0,A00010,"Blake, William",artist,39.0,"Two Drawings of Frightened Figures, Probably f...",date not known,"Watercolour, ink, chalk and graphite on paper....",Purchased with the assistance of a special gra...,1826.0,1922.0,support: 394 x 419 mm,394.0,467.0,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-two-...
mean,1039.5,,,,38.6,,,,,1820.142857,1920.2,,282.1,351.5,,,,,,
std,3.02765,,,,0.516398,,,,,15.496543,1.549193,,55.562677,66.818577,,,,,,


In [33]:
# probably better to do it this way though:
# describe() reports only the numeric columns and adds count and quartiles.
data.describe()

Unnamed: 0,id,artistId,year,acquisitionYear,width,height,depth,inscription,thumbnailCopyright
count,10.0,10.0,7.0,10.0,10.0,10.0,0.0,0.0,0.0
mean,1039.5,38.6,1820.142857,1920.2,282.1,351.5,,,
std,3.02765,0.516398,15.496543,1.549193,55.562677,66.818577,,,
min,1035.0,38.0,1785.0,1919.0,240.0,213.0,,,
25%,1037.25,38.0,1826.0,1919.0,242.25,335.0,,,
50%,1039.5,39.0,1826.0,1919.0,244.5,339.0,,,
75%,1041.75,39.0,1826.0,1922.0,316.25,380.5,,,
max,1044.0,39.0,1826.0,1922.0,394.0,467.0,,,


# Normalising numeric data

Normalisation means rescaling the values in a column onto a common scale, e.g. zero mean and unit standard deviation (standardisation) or the range 0 to 1 (min-max scaling).

In [34]:
# Preview the first five heights before rescaling them.
data['height'].head()

0    419
1    213
2    467
3    394
4    335
Name: height, dtype: int64

In [36]:
# Summary of the raw heights: the mean and std here are used for standardisation.
data['height'].describe()

count     10.000000
mean     351.500000
std       66.818577
min      213.000000
25%      335.000000
50%      339.000000
75%      380.500000
max      467.000000
Name: height, dtype: float64

In [37]:
# Short alias for the height column, used in the rescaling cells.
height = data['height']

In [39]:
# Standardisation (z-score): centre on the mean, then divide by the sample
# standard deviation.
norm = height.sub(height.mean()).div(height.std())
norm

0    1.010198
1   -2.072777
2    1.728561
3    0.636051
4   -0.246937
5   -0.202040
6   -0.261903
7   -0.172108
8   -0.246937
9   -0.172108
Name: height, dtype: float64

In [40]:
# Min-max scaling: rescale all values to lie between 0 and 1.
minmax = height.sub(height.min()).div(height.max() - height.min())
minmax

0    0.811024
1    0.000000
2    1.000000
3    0.712598
4    0.480315
5    0.492126
6    0.476378
7    0.500000
8    0.480315
9    0.500000
Name: height, dtype: float64

In [42]:
# Keep the standardised values by storing them as a new column.
data['standardised_height'] = norm
# Bracket notation is required here: dot notation
# (data.standardised_height = norm) can't create new columns.

# Transforming data
e.g. changing height values from mm to cm

In [44]:
# Identity transform: returns the heights unchanged.
data['height'].transform(lambda value: value)

0    419
1    213
2    467
3    394
4    335
5    338
6    334
7    340
8    335
9    340
Name: height, dtype: int64

In [47]:
# Convert mm to cm. .transform applies the lambda across the whole Series and
# returns a new Series; the dataframe itself is not modified.
data['height'].transform(lambda mm: mm / 10)

0    41.9
1    21.3
2    46.7
3    39.4
4    33.5
5    33.8
6    33.4
7    34.0
8    33.5
9    34.0
Name: height, dtype: float64

In [48]:
# .transform is more powerful if combined with .groupby
data.groupby('artist') # .groupby returns a DataFrameGroupBy object, not a dataframe

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002072F23E5C8>

In [49]:
# For every row, count the distinct values in each column within that row's
# artist group; the result keeps the original row index.
data.groupby('artist').transform('nunique')

Unnamed: 0,id,accession_number,artistRole,artistId,title,dateText,medium,creditLine,year,acquisitionYear,dimensions,width,height,depth,units,inscription,thumbnailCopyright,thumbnailUrl,url,standardised_height
0,4,4,1,1,4,2,3,1,1,1,4,4,4,0,1,0,0,4,4,4
1,4,4,1,1,4,2,3,1,1,1,4,4,4,0,1,0,0,4,4,4
2,4,4,1,1,4,2,3,1,1,1,4,4,4,0,1,0,0,4,4,4
3,4,4,1,1,4,2,3,1,1,1,4,4,4,0,1,0,0,4,4,4
4,6,6,1,1,6,1,1,1,1,1,6,5,4,0,1,0,0,6,6,4
5,6,6,1,1,6,1,1,1,1,1,6,5,4,0,1,0,0,6,6,4
6,6,6,1,1,6,1,1,1,1,1,6,5,4,0,1,0,0,6,6,4
7,6,6,1,1,6,1,1,1,1,1,6,5,4,0,1,0,0,6,6,4
8,6,6,1,1,6,1,1,1,1,1,6,5,4,0,1,0,0,6,6,4
9,6,6,1,1,6,1,1,1,1,1,6,5,4,0,1,0,0,6,6,4


In [50]:
# Selecting one column from the grouped frame gives a SeriesGroupBy object.
data.groupby('artist')['height']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000002072F250A88>

In [51]:
# Broadcast each artist's mean height back onto every row of that artist.
data.groupby('artist')['height'].transform('mean')

0    373.25
1    373.25
2    373.25
3    373.25
4    337.00
5    337.00
6    337.00
7    337.00
8    337.00
9    337.00
Name: height, dtype: float64

In [52]:
# Show which artist each row belongs to, to read the group means above.
data['artist']

0     Blake, Robert
1     Blake, Robert
2     Blake, Robert
3     Blake, Robert
4    Blake, William
5    Blake, William
6    Blake, William
7    Blake, William
8    Blake, William
9    Blake, William
Name: artist, dtype: object

The mean height of artwork by Robert Blake is 373.25mm, while for William Blake it is 337.00mm.

# Filtering data
(working with fulldf here)

In [53]:
# Keep only the columns named in `items`.
fulldf.filter(items=['id','artist'])

Unnamed: 0,id,artist
0,1035,"Blake, Robert"
1,1036,"Blake, Robert"
2,1037,"Blake, Robert"
3,1038,"Blake, Robert"
4,1039,"Blake, William"
...,...,...
69196,122960,"P-Orridge, Genesis"
69197,122961,"P-Orridge, Genesis"
69198,121181,"Hatoum, Mona"
69199,112306,"Creed, Martin"


In [54]:
# Keep columns whose name contains the substring 'year' (case-sensitive).
fulldf.filter(like='year')

Unnamed: 0,year
0,
1,
2,1785
3,
4,1826
...,...
69196,1975
69197,1976
69198,1996
69199,2000


But the original dataset also has an `acquisitionYear` column. It wasn't returned because the `like` parameter is case sensitive, so `'year'` does not match the capital `Y` in `acquisitionYear`.

In [55]:
# to also return acquisitionYear, use regex:
# the inline flag (?i) makes the pattern case-insensitive.
fulldf.filter(regex='(?i)year')

Unnamed: 0,year,acquisitionYear
0,,1922.0
1,,1922.0
2,1785,1922.0
3,,1922.0
4,1826,1919.0
...,...,...
69196,1975,2013.0
69197,1976,2013.0
69198,1996,2013.0
69199,2000,2013.0


In [57]:
# .filter also works on rows (axis=0), matching against the index labels.
# '^100.$' means: '100' followed by exactly one more character.
fulldf.filter(axis=0, regex='^100.$') # return rows 1000-1009

Unnamed: 0,id,accession_number,artist,artistRole,artistId,title,dateText,medium,creditLine,year,acquisitionYear,dimensions,width,height,depth,units,inscription,thumbnailCopyright,thumbnailUrl,url
1000,14704,A01004,"Turner, Joseph Mallord William",artist,558,Hedging and Ditching,1812,Etching and engraving on paper,Presented by A. Acland Allen through the Art F...,1812,1925.0,image: 184 x 259 mm,184,259.0,,mm,,,http://www.tate.org.uk/art/images/work/A/A01/A...,http://www.tate.org.uk/art/artworks/turner-hed...
1001,14705,A01005,"Turner, Joseph Mallord William",artist,558,"Hedging and Ditching, engraved by J.C. Easling",1812,Etching and mezzotint on paper,Presented by A. Acland Allen through the Art F...,1812,1925.0,image: 186 x 262 mm,186,262.0,,mm,,,http://www.tate.org.uk/art/images/work/A/A01/A...,http://www.tate.org.uk/art/artworks/turner-hed...
1002,14706,A01006,"Turner, Joseph Mallord William",artist,558,River Wye,1812,Print on paper,Presented by A. Acland Allen through the Art F...,1812,1925.0,image: 183 x 264 mm,183,264.0,,mm,,,http://www.tate.org.uk/art/images/work/A/A01/A...,http://www.tate.org.uk/art/artworks/turner-riv...
1003,14707,A01007,"Turner, Joseph Mallord William",artist,558,"River Wye, engraved by W. Annis",1812,Etching and mezzotint on paper,Presented by A. Acland Allen through the Art F...,1812,1925.0,image: 184 x 265 mm,184,265.0,,mm,,,http://www.tate.org.uk/art/images/work/A/A01/A...,http://www.tate.org.uk/art/artworks/turner-riv...
1004,14708,A01008,"Turner, Joseph Mallord William",artist,558,Chain of Alps from Grenoble to Chamberi,1812,Etching and engraving on paper,Presented by A. Acland Allen through the Art F...,1812,1925.0,image: 178 x 259 mm,178,259.0,,mm,,,http://www.tate.org.uk/art/images/work/A/A01/A...,http://www.tate.org.uk/art/artworks/turner-cha...
1005,14709,A01009,"Turner, Joseph Mallord William",artist,558,"Chain of Alps from Grenoble to Chamberi, engra...",1812,Etching and mezzotint on paper,Presented by A. Acland Allen through the Art F...,1812,1925.0,image: 178 x 261 mm,178,261.0,,mm,,,http://www.tate.org.uk/art/images/work/A/A01/A...,http://www.tate.org.uk/art/artworks/turner-cha...
1006,14710,A01010,"Turner, Joseph Mallord William",artist,558,Mer de Glace,1812,Etching and engraving on paper,Presented by A. Acland Allen through the Art F...,1812,1925.0,image: 178 x 253 mm,178,253.0,,mm,,,http://www.tate.org.uk/art/images/work/A/A01/A...,http://www.tate.org.uk/art/artworks/turner-mer...
1007,14711,A01011,"Turner, Joseph Mallord William",artist,558,Mer de Glace,1812,Etching and mezzotint on paper,Presented by A. Acland Allen through the Art F...,1812,1925.0,image: 179 x 257 mm,179,257.0,,mm,,,http://www.tate.org.uk/art/images/work/A/A01/A...,http://www.tate.org.uk/art/artworks/turner-mer...
1008,14712,A01012,"Turner, Joseph Mallord William",artist,558,"Rivaux Abbey, Yorkshire",1812,Etching and engraving on paper,Presented by A. Acland Allen through the Art F...,1812,1925.0,image: 182 x 265 mm,182,265.0,,mm,,,http://www.tate.org.uk/art/images/work/A/A01/A...,http://www.tate.org.uk/art/artworks/turner-riv...
1009,14713,A01013,"Turner, Joseph Mallord William",artist,558,"Rivaux Abbey, engraved by Henry Dawe",1812,Etching and mezzotint on paper,Presented by A. Acland Allen through the Art F...,1812,1925.0,image: 183 x 266 mm,183,266.0,,mm,,,http://www.tate.org.uk/art/images/work/A/A01/A...,http://www.tate.org.uk/art/artworks/turner-riv...


# Review
Useful methods covered

#### Investigating
* df.dtypes
* df.year = df.year.astype(float)
* pd.to_numeric(df.height,errors='coerce')
#### Aggregating
* df.year.min()
* df.agg(['min', 'max'])
* df.describe()
#### Normalising
* (col - col.mean())/col.std()
* (col - col.min())/(col.max()-col.min())
#### Transforming
* df.height.transform(lambda x: x / 10)
* df.groupby('artist')['height'].transform('mean')
#### Filtering
* df.filter(items=['id', 'artist'])
* df.filter(regex='(?i)year')
* df.filter(axis=0, regex='^100.$')

#### Don't forget to assign values back to dataframe as a new column!