In [1]:
import pandas as pd

In [3]:
# Read the artwork CSV into `df` and preview its first rows.
# low_memory=False asks pandas to parse the file in one pass rather than in
# chunks (see the pandas read_csv docs), which avoids mixed-type inference.
df = pd.read_csv(
    '../cleaning-data-python-data-playbook/artwork_data.csv',
    low_memory=False,
)
df.head()

Unnamed: 0,id,accession_number,artist,artistRole,artistId,title,dateText,medium,creditLine,year,acquisitionYear,dimensions,width,height,depth,units,inscription,thumbnailCopyright,thumbnailUrl,url
0,1035,A00001,"Blake, Robert",artist,38,A Figure Bowing before a Seated Old Man with h...,date not known,"Watercolour, ink, chalk and graphite on paper....",Presented by Mrs John Richmond 1922,,1922.0,support: 394 x 419 mm,394,419,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-a-fi...
1,1036,A00002,"Blake, Robert",artist,38,"Two Drawings of Frightened Figures, Probably f...",date not known,Graphite on paper,Presented by Mrs John Richmond 1922,,1922.0,support: 311 x 213 mm,311,213,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-two-...
2,1037,A00003,"Blake, Robert",artist,38,The Preaching of Warning. Verso: An Old Man En...,?c.1785,Graphite on paper. Verso: graphite on paper,Presented by Mrs John Richmond 1922,1785.0,1922.0,support: 343 x 467 mm,343,467,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-the-...
3,1038,A00004,"Blake, Robert",artist,38,Six Drawings of Figures with Outstretched Arms,date not known,Graphite on paper,Presented by Mrs John Richmond 1922,,1922.0,support: 318 x 394 mm,318,394,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-six-...
4,1039,A00005,"Blake, William",artist,39,The Circle of the Lustful: Francesca da Rimini...,"1826–7, reprinted 1892",Line engraving on paper,Purchased with the assistance of a special gra...,1826.0,1919.0,image: 243 x 335 mm,243,335,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-the-...


### Use the DataFrame `replace` method

In [8]:
# Look for bad data. For instance, the 'dateText' column contains rows holding
# the placeholder 'date not known'; those should be turned into NaN.
# Count missing vs. present values first: nothing resolves to NaN yet.
df['dateText'].isna().value_counts()

False    69201
Name: dateText, dtype: int64

In [10]:
from numpy import nan

In [15]:
# Use the DataFrame replace method with a nested dictionary: the outer key
# names the column, the inner dictionary maps each old value in that column
# to its replacement.
# The result is assigned back to `df` rather than passing inplace=True, so the
# previous DataFrame object is not mutated and the call stays chainable.
df = df.replace({'dateText': {'date not known': nan}})

In [16]:
# Count NaN (True) vs. non-NaN (False) entries in the 'dateText' column.
df['dateText'].isna().value_counts()

False    63227
True      5974
Name: dateText, dtype: int64

### Use `.loc` to update the column directly

In [52]:
# Reload the CSV so that `df` starts again from the raw file contents,
# then preview the first rows.
df = pd.read_csv(
    '../cleaning-data-python-data-playbook/artwork_data.csv',
    low_memory=False,
)
df.head()

Unnamed: 0,id,accession_number,artist,artistRole,artistId,title,dateText,medium,creditLine,year,acquisitionYear,dimensions,width,height,depth,units,inscription,thumbnailCopyright,thumbnailUrl,url
0,1035,A00001,"Blake, Robert",artist,38,A Figure Bowing before a Seated Old Man with h...,date not known,"Watercolour, ink, chalk and graphite on paper....",Presented by Mrs John Richmond 1922,,1922.0,support: 394 x 419 mm,394,419,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-a-fi...
1,1036,A00002,"Blake, Robert",artist,38,"Two Drawings of Frightened Figures, Probably f...",date not known,Graphite on paper,Presented by Mrs John Richmond 1922,,1922.0,support: 311 x 213 mm,311,213,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-two-...
2,1037,A00003,"Blake, Robert",artist,38,The Preaching of Warning. Verso: An Old Man En...,?c.1785,Graphite on paper. Verso: graphite on paper,Presented by Mrs John Richmond 1922,1785.0,1922.0,support: 343 x 467 mm,343,467,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-the-...
3,1038,A00004,"Blake, Robert",artist,38,Six Drawings of Figures with Outstretched Arms,date not known,Graphite on paper,Presented by Mrs John Richmond 1922,,1922.0,support: 318 x 394 mm,318,394,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-six-...
4,1039,A00005,"Blake, William",artist,39,The Circle of the Lustful: Francesca da Rimini...,"1826–7, reprinted 1892",Line engraving on paper,Purchased with the assistance of a special gra...,1826.0,1919.0,image: 243 x 335 mm,243,335,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-the-...


In [53]:
# Build a boolean mask for the rows that match the criteria, then set the
# value of the target column directly for just those rows.
date_unknown = df['dateText'].eq('date not known')
df.loc[date_unknown, ['dateText']] = nan

In [54]:
# Count NaN (True) vs. non-NaN (False) entries in 'dateText' to check the result.
df['dateText'].isna().value_counts()

False    63227
True      5974
Name: dateText, dtype: int64

### Combine multiple criteria using the `notnull` method

In [56]:
# Find entries in the 'year' column that hold a value but are not plain numbers.
# The notnull() guard matters: astype(str) renders missing values as text,
# which would otherwise match the non-digit pattern too.
year_has_value = df['year'].notnull()
year_has_non_digit = df['year'].astype(str).str.contains('[^0-9]', regex=True)
bad_years = df.loc[year_has_value & year_has_non_digit, ['year']]
bad_years['year'].unique()

array(['no date', 'c.1997-9'], dtype=object)

In [57]:
# Replace bad years with NaN: rows where 'year' has a value whose text form
# contains at least one non-digit character.
year_has_value = df['year'].notnull()
year_has_non_digit = df['year'].astype(str).str.contains('[^0-9]', regex=True)
df.loc[year_has_value & year_has_non_digit, ['year']] = nan

In [58]:
# Check again for bad data: select any 'year' entries that still hold a value
# containing a non-digit character. An empty result means none remain.
year_has_value = df['year'].notnull()
year_has_non_digit = df['year'].astype(str).str.contains('[^0-9]', regex=True)
df.loc[year_has_value & year_has_non_digit, ['year']]


Unnamed: 0,year
