In [1]:
from fastai.vision.utils import untar_data, URLs
# The PASCAL 2007 dataset can have more than one label per image.
# `path` is the dataset location returned by untar_data; it is joined with
# 'train.csv' further down to load the labels.
# NOTE(review): presumably untar_data downloads/extracts the archive on first
# use and returns a local Path — confirm against the fastai docs.
path = untar_data(URLs.PASCAL_2007)

In [2]:
import pandas as pd

# Labels for each image come from a CSV file rather than from the file-name /
# folder-structure conventions used by the earlier single-label datasets
# TODO : Is it possible to manage multiple labels per image with file/folder?
# Answer : Likely not worth it, since an explicit CSV list is easier to check
# than a super fancy naming convention (experience from managing Pokemon assets lol)
df = pd.read_csv(path/'train.csv')
# inspect the CSV file by reading it into a Pandas DataFrame
# the output shows that the categories in each image are stored as a single
# space-separated string
df.head()

Unnamed: 0,fname,labels,is_valid
0,000005.jpg,chair,True
1,000007.jpg,car,True
2,000009.jpg,horse person,True
3,000012.jpg,car,False
4,000016.jpg,bicycle,True


In [21]:
# PANDAS and DATAFRAMES
# Pandas : a Python library for editing and analyzing tabular and time series data
# DataFrame : a table of rows and columns, the main data structure in Pandas

# A trailing ':' may be left off (in numpy, pytorch, pandas, etc)
# so the shorter variant below selects the very same first row
# first_row = df.iloc[0]
first_row = df.iloc[0, :]
first_column = df.iloc[:, 0]
print(f'--[first_row]--\n{first_row}\n--[first_column]--\n{first_column}')

# a column can also be selected by its name
fname_column = df['fname']
#print(f'--[fname_column]--\n{fname_column}')


--[first_row]--
fname       000005.jpg
labels           chair
is_valid          True
Name: 0, dtype: object
--[first_column]--
0       000005.jpg
1       000007.jpg
2       000009.jpg
3       000012.jpg
4       000016.jpg
           ...    
5006    009954.jpg
5007    009955.jpg
5008    009958.jpg
5009    009959.jpg
5010    009961.jpg
Name: fname, Length: 5011, dtype: object


In [20]:
# Start from an empty DataFrame and grow it one column at a time
df1 = pd.DataFrame()
df1['a'] = [1, 2, 3, 4]
df1['b'] = [10, 20, 30, 40]
# Arithmetic between two columns is element-wise and produces a new Series
sum_column = df1['a'].add(df1['b'])
print(f'--[a_column]--\n{df1["a"]}\n--[b_column]--\n{df1["b"]}\n--[sum_column]--\n{sum_column}')

--[a_column]--
0    1
1    2
2    3
3    4
Name: a, dtype: int64
--[b_column]--
0    10
1    20
2    30
3    40
Name: b, dtype: int64
--[sum_column]--
0    11
1    22
2    33
3    44
dtype: int64


In [25]:
from fastai.vision.data import DataBlock

# A DataBlock created with no arguments, to see what it produces by default
dblock = DataBlock()
# Build the train / valid datasets straight from the DataFrame
dsets = dblock.datasets(df)

# Each item is a tuple holding the same DataFrame row TWICE ... once for the
# image and once for the label ... because the DataBlock assumes we have :
# - input
# - target
# and it has not yet been told how to extract either one from a row.
# NOTE(review): the rows shown differ from run to run, so the default
# train/valid split is presumably random — confirm against the fastai docs.
print(f'--[train]--\n{dsets.train[0]}\n--[valid]--\n{dsets.valid[0]}')
#dsets.train[0]
#dsets.valid[0]


--[train]--
(fname                  009819.jpg
labels      diningtable chair cat
is_valid                     True
Name: 4938, dtype: object, fname                  009819.jpg
labels      diningtable chair cat
is_valid                     True
Name: 4938, dtype: object)
--[valid]--
(fname       008843.jpg
labels             car
is_valid          True
Name: 4435, dtype: object, fname       008843.jpg
labels             car
is_valid          True
Name: 4435, dtype: object)


In [46]:
# We will need to capture explicitly from the DataFrame :
# - 'fname' the image file name
def get_fname(row):
    """Return the image file name held in the 'fname' field of `row`."""
    fname = row['fname']
    return fname
# - 'labels' the labels of the image, as one space-separated string
def get_labels(row):
    """Return the 'labels' field of `row` unchanged.

    In train.csv this field is one space-separated string
    (e.g. 'horse person'); no splitting is done here.
    """
    labels = row['labels']
    return labels

# Tell the DataBlock how to pull the input (x) and the target (y) out of a row
dblock = DataBlock(
    get_x=get_fname,
    get_y=get_labels,
)
dsets = dblock.datasets(df)

# each item is now an (fname, labels) tuple rather than the same row twice
print(f'--[train]--\n{dsets.train[0]}\n--[valid]--\n{dsets.valid[0]}')

--[train]--
('003865.jpg', 'diningtable person chair bottle')
--[valid]--
('000772.jpg', 'dog')


In [47]:
# Disabled alternative to the get_fname / get_labels cell : the same DataBlock
# built with inline lambdas instead of named functions.
# It is wrapped in a bare string literal so none of it runs; as the last
# expression of the cell, the string itself is what gets echoed as output.
'''
#dblock = DataBlock(get_x=lambda r:r['fname'], get_y=lambda r:r['labels'])
#dsets = dblock.datasets(df)
#dsets.train[0]
'''

"\n#dblock = DataBlock(get_x=lambda r:r['fname'], get_y=lambda r:r['labels'])\n#dsets = dblock.datasets(df)\n#dsets.train[0]\n"