# What we will do here:

1. Find the first ever artwork by acquisition date
2. First ever art by each artist
3. Handle Missing Values (empty or NaN) - fill with most common values or eliminate
4. Transformation of data
5. Dropping Groups (we will do this on title column values)
6. Filtering

In [1]:
# Iterating Over Groups


# for name, group in df.groupby('artist'):

In [2]:
import pandas as pd
import os

In [3]:
df = pd.read_pickle(os.path.join('.', 'data_frame.pickle'))

In [5]:
# Iterate

small_df = df.iloc[49980:50019, :].copy()

In [6]:
grouped = small_df.groupby('artist')

In [7]:
type(grouped)

pandas.core.groupby.generic.DataFrameGroupBy

In [8]:
for name, group_df in grouped:
    print(name)
    print(group_df)
    break

Frost, Sir Terry
                artist            title               medium    year  \
id                                                                     
4704  Frost, Sir Terry        Blue Moon  Lithograph on paper  1952.0   
4705  Frost, Sir Terry      Boat Shapes     Linocut on paper  1952.0   
4706  Frost, Sir Terry      Boat Shapes     Linocut on paper  1954.0   
4707  Frost, Sir Terry      Boat Shapes     Linocut on paper  1954.0   
4708  Frost, Sir Terry            Leeds    Drypoint on paper  1956.0   
4709  Frost, Sir Terry  Camping, Anduze     Etching on paper  1979.0   
4710  Frost, Sir Terry     Umea, Sweden     Etching on paper  1979.0   
4711  Frost, Sir Terry    Self-Portrait     Etching on paper  1980.0   

      acquisitionYear width height units  
id                                        
4704           1983.0   355    273    mm  
4705           1983.0   132    143    mm  
4706           1983.0   131    155    mm  
4707           1983.0   193    267    mm  
4708

In [9]:
# Aggregate
# Mins
for name, group_df in small_df.groupby('artist'):
    min_year = group_df['acquisitionYear'].min()
    print('{}: {}'.format(name, min_year))

Frost, Sir Terry: 1983.0
Phillips, Esq Tom: 1983.0
Wols: 1983.0


In [10]:
# Transform
# Equivalent of editing by hand:
# Make a case when there is no data to infer
# small_df.loc[[11838, 16441], 'medium'] = np.nan

def fill_values(series):
    values_counted = series.value_counts()
    if values_counted.empty:
        return series
    most_frequent = values_counted.index[0]
    new_medium = series.fillna(most_frequent)
    return new_medium

In [11]:
def transform_df(source_df):
    group_dfs = []
    for name, group_df in source_df.groupby('artist'):
        filled_df = group_df.copy()
        filled_df.loc[:, 'medium'] = fill_values(group_df['medium'])
        group_dfs.append(filled_df)
        
    new_df = pd.concat(group_dfs)
    return new_df

In [12]:
# Now check the result

filled_df = transform_df(small_df)

In [13]:
# BUILT-INS
# Transform

grouped_mediums = small_df.groupby('artist')['medium']
small_df.loc[:, 'medium'] = grouped_mediums.transform(fill_values)

In [15]:
import numpy as np

In [18]:
# Min
df.groupby('artist').agg(np.min)

  df.groupby('artist').agg(np.min)


TypeError: agg function failed [how->min,dtype->object]

In [19]:
df.groupby('artist').min()

TypeError: agg function failed [how->min,dtype->object]

In [20]:
# Filter
grouped_titles = df.groupby('title')
title_counts = grouped_titles.size().sort_values(ascending = False)

In [21]:
condition = lambda x: len(x.index) > 1
dup_title_df = grouped_titles.filter(condition)
dup_title_df.sort_values('title', inplace = True)