<a href="https://colab.research.google.com/github/pabloinsente/CovNet_Human_Drawings/blob/master/code/CovNet_VGG_19_preprocessing_all.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing for age classification from features extracted from drawings with VGG-19 CovNet

In [0]:
# install seaborn for pairplot
!pip install -q seaborn
# install rarfile to read feature file
!pip install rarfile
# update pandas
!pip install --upgrade pandas

In [27]:
import csv
import pathlib

import numpy as np
import pandas as pd
import rarfile

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

1.13.1


In [0]:
# Clone the data into Colab
! git clone https://github.com/pabloinsente/CovNet_Human_Drawings
# Run this just once per sesion

# Data Wrangling

## Reading dataframe from CovNet for Neural Net

In [28]:
# Read the csv of feature vectors out of the compressed rar file and load it
# into a dataframe (the csv has no header row).
rar_path = "CovNet_Human_Drawings/data/vectors_features/vgg19_vectors_drawings_block5_pool_all.rar"
csv_file_name = "vgg19_vectors_drawings_block5_pool_all.csv"
# The context managers close both the archive and the extracted member once
# the csv has been parsed, instead of leaving the handles open.
with rarfile.RarFile(rar_path) as rar_archive:
    with rar_archive.open(csv_file_name) as rar_file:
        raw_dataset = pd.read_csv(rar_file, sep=",", header=None)
print(raw_dataset.shape)
print(type(raw_dataset))

(258, 25088)
<class 'pandas.core.frame.DataFrame'>


In [29]:
# Create one header per feature column: vector_0, vector_1, ...
n_vectors = raw_dataset.shape[1]
# Build the list directly with a comprehension; the previous version used a
# comprehension only for its append() side effect and threw away a list of None.
col_list = ["vector_{}".format(vector) for vector in range(n_vectors)]
print(col_list[0:5])
print(len(col_list))

['vector_0', 'vector_1', 'vector_2', 'vector_3', 'vector_4']
25088


In [30]:
# Label the columns of the raw dataframe with the generated headers,
# then preview its top-left corner (10 rows x 6 columns)
raw_dataset.columns = col_list
print(raw_dataset.iloc[:10, :6])

   vector_0  vector_1  vector_2  vector_3  vector_4  vector_5
0       0.0       0.0       0.0       0.0  0.000000       0.0
1       0.0       0.0       0.0       0.0  0.000000       0.0
2       0.0       0.0       0.0       0.0  0.000000       0.0
3       0.0       0.0       0.0       0.0  0.000000       0.0
4       0.0       0.0       0.0       0.0  0.000000       0.0
5       0.0       0.0       0.0       0.0  0.979608       0.0
6       0.0       0.0       0.0       0.0  0.000000       0.0
7       0.0       0.0       0.0       0.0  0.000000       0.0
8       0.0       0.0       0.0       0.0  0.000000       0.0
9       0.0       0.0       0.0       0.0  0.000000       0.0


In [31]:
# Collect the drawing filenames found in the drawings directory
from os import listdir
from os.path import isfile, join

path = 'CovNet_Human_Drawings/data/human_drawings_all/'
# Keep regular files only (anything that is not a file is skipped).
# NOTE(review): these names are later paired with the feature rows by position,
# so listdir() has to return them in the order the features were extracted in
# -- TODO confirm against the feature-extraction notebook.
filenames = []
for entry in listdir(path):
    if isfile(join(path, entry)):
        filenames.append(entry)
len(filenames) # This should yield 258 

258

In [32]:
# Split each filename on the underscore and keep the first element (row id for merging).
# Built directly as a comprehension: no append() side effect inside a throwaway
# comprehension, and the builtin name `id` is no longer reused as a loop variable.
filenames_crop = [filename.split('_')[0] for filename in filenames]
n_id = len(filenames)
print(filenames_crop[0:5])
print(len(filenames_crop)) # this should yield 258

['DAM031', 'DAM037', 'DAM003', 'DAM020', 'DAM071']
258


In [33]:
# Some filenames contain a lowercase "a". To merge this dataframe with the
# metadata the ids must be uppercase, e.g. DAMa025 needs to become DAMA025
filenames_up = list(map(str.upper, filenames_crop))
print(filenames_up[:5])
print(len(filenames_up)) # this should yield 258

['DAM031', 'DAM037', 'DAM003', 'DAM020', 'DAM071']
258


In [34]:
# Wrap the row ids in a single-column dataframe named 'id'
id_list = pd.DataFrame({'id': filenames_up})
print(len(id_list)) # This should yield 258 
print(type(id_list))
print(id_list.head())

258
<class 'pandas.core.frame.DataFrame'>
       id
0  DAM031
1  DAM037
2  DAM003
3  DAM020
4  DAM071


In [35]:
# Attach the drawing ids to the raw vectors, pairing rows by position.
# Guard the pairing first: with unequal lengths concat would not fail, it would
# silently pad the shorter frame with NaN rows.
assert len(id_list) == len(raw_dataset), "number of ids and number of feature rows differ"
df_vectors = pd.concat([id_list, raw_dataset], axis=1)
print(df_vectors.shape) # this should yield (258, 25089)
print(df_vectors.iloc[0:10, 0:6])


(258, 25089)
       id  vector_0  vector_1  vector_2  vector_3  vector_4
0  DAM031       0.0       0.0       0.0       0.0  0.000000
1  DAM037       0.0       0.0       0.0       0.0  0.000000
2  DAM003       0.0       0.0       0.0       0.0  0.000000
3  DAM020       0.0       0.0       0.0       0.0  0.000000
4  DAM071       0.0       0.0       0.0       0.0  0.000000
5  DAM076       0.0       0.0       0.0       0.0  0.979608
6  DAM016       0.0       0.0       0.0       0.0  0.000000
7  DAM028       0.0       0.0       0.0       0.0  0.000000
8  DAM077       0.0       0.0       0.0       0.0  0.000000
9  DAM071       0.0       0.0       0.0       0.0  0.000000


In [36]:
# Load the participants' metadata csv into a pandas dataframe
meta_path = "CovNet_Human_Drawings/data/metadata_participants/Study 1 DAM masterdata053117.csv"
df_metadata = pd.read_csv(meta_path, sep=",")
print(df_metadata.shape)
print(df_metadata.iloc[:10, :6])


(107, 397)
        id  drop IS_medium_order IS_shape_order    IS_site  age_yr
0   DAM001     0             SPF            ABC        PSL    5.70
1   DAM002     0             FSP            ACB        PSL    5.42
2   DAM003     0             FSP            BAC        PSL    4.53
3   DAM004     0             SPF            BCA        PSL    5.37
4   DAM005     0             PFS            CBA        PSL    4.19
5   DAM006     0             FSP            CAB        PSL    4.30
6   DAM007     0             SPF            CAB        PSL    4.20
7   DAM008     0             PFS            BCA        PSL    4.40
8   DAM009     0             FSP            CBA        PSL    4.02
9  DAM00T1     1             NaN            NaN  LehmanLab    7.90


In [37]:
# Keep only the metadata columns used downstream: id, age_yr and adult
label_columns = ['id', 'age_yr', 'adult']
df_metadata_age = df_metadata[label_columns]
print(df_metadata_age.shape)
print(df_metadata_age.head(10))

(107, 3)
        id  age_yr  adult
0   DAM001    5.70      0
1   DAM002    5.42      0
2   DAM003    4.53      0
3   DAM004    5.37      0
4   DAM005    4.19      0
5   DAM006    4.30      0
6   DAM007    4.20      0
7   DAM008    4.40      0
8   DAM009    4.02      0
9  DAM00T1    7.90      0


In [38]:
# Count the missing values per column
# (age_yr has missing values; they are imputed in the data-cleaning section)
df_metadata_age.isnull().sum()

id        0
age_yr    5
adult     0
dtype: int64

In [39]:
# Merge metadata and vectors by id (pd.merge defaults to an inner join, so only
# ids present in both frames are kept)
df_predict_age = pd.merge(df_metadata_age, df_vectors, on='id')
# Every drawing must end up in the result exactly once: an inner join would
# otherwise silently drop drawings whose id is missing from the metadata, or
# duplicate them when an id is repeated in the metadata.
assert len(df_predict_age) == len(df_vectors), "merge changed the number of drawings"
print(df_predict_age.shape) # this should yield (258, 25091): id, age_yr, adult + 25088 vectors
print(df_predict_age.iloc[0:10, 0:6])

(258, 25091)
       id  age_yr  adult  vector_0  vector_1  vector_2
0  DAM001    5.70      0       0.0       0.0       0.0
1  DAM001    5.70      0       0.0       0.0       0.0
2  DAM001    5.70      0       0.0       0.0       0.0
3  DAM002    5.42      0       0.0       0.0       0.0
4  DAM002    5.42      0       0.0       0.0       0.0
5  DAM002    5.42      0       0.0       0.0       0.0
6  DAM003    4.53      0       0.0       0.0       0.0
7  DAM003    4.53      0       0.0       0.0       0.0
8  DAM003    4.53      0       0.0       0.0       0.0
9  DAM004    5.37      0       0.0       0.0       0.0


In [0]:
# Write the merged dataframe to a comma-separated file, without the index column
merged_csv_name = "predict_age_from_vectors_block5_pool_all.csv"
df_predict_age.to_csv(merged_csv_name, sep=',', index=False)

## Cleaning the data 

In [40]:
# Count the missing values per column and show the first two columns (id, age_yr)
NA = df_predict_age.isnull().sum()
print(NA.iloc[:2]) # we have 11 rows with missing values for age

id         0
age_yr    11
dtype: int64


In [41]:
from fancyimpute import IterativeImputer
# IterativeImputer: a strategy for imputing missing values by modelling each
# feature with missing values as a function of the other features, in a
# round-robin fashion.

# The imputer cannot take the id label as input, so that column is left out first
df_predict_age_na = df_predict_age.drop(columns='id')

# Run the imputation on the remaining columns
imputer = IterativeImputer()
df_predict_age_imp = imputer.fit_transform(df_predict_age_na)

Using TensorFlow backend.


In [42]:
# The imputer handed back a numpy array; wrap it in a pandas dataframe again
# (the columns are numbered at this point, not named)
df_predict_age_pn = pd.DataFrame(data=df_predict_age_imp)
print(type(df_predict_age_pn))
print(df_predict_age_pn.iloc[:10, :5])

<class 'pandas.core.frame.DataFrame'>
      0    1    2    3    4
0  5.70  0.0  0.0  0.0  0.0
1  5.70  0.0  0.0  0.0  0.0
2  5.70  0.0  0.0  0.0  0.0
3  5.42  0.0  0.0  0.0  0.0
4  5.42  0.0  0.0  0.0  0.0
5  5.42  0.0  0.0  0.0  0.0
6  4.53  0.0  0.0  0.0  0.0
7  4.53  0.0  0.0  0.0  0.0
8  4.53  0.0  0.0  0.0  0.0
9  5.37  0.0  0.0  0.0  0.0


In [58]:
# Re-attach the ids to the imputed values.
# df_predict_age_pn was built row-for-row from df_predict_age (minus its 'id'
# column), so the ids are taken from that same frame, in that same order.
# Sorting the standalone id_list and pairing it by position only lines up when
# the merged frame happens to be in sorted-id order; otherwise ages and vectors
# end up under the wrong id.
# (the variable name id_list_sort is kept for compatibility)
id_list_sort = df_predict_age[['id']].reset_index(drop=True)

# positional pairing: both sides must have the same number of rows
assert len(id_list_sort) == len(df_predict_age_pn), "ids and imputed rows differ in length"
df_predict_age_fix = pd.concat([id_list_sort, df_predict_age_pn], axis=1)

print(df_predict_age_fix.shape) # this should yield (258, 25091)
print(df_predict_age_fix.iloc[0:9,0:7])

(258, 25091)
       id     0    1    2    3    4    5
0  DAM001  5.70  0.0  0.0  0.0  0.0  0.0
1  DAM001  5.70  0.0  0.0  0.0  0.0  0.0
2  DAM001  5.70  0.0  0.0  0.0  0.0  0.0
3  DAM002  5.42  0.0  0.0  0.0  0.0  0.0
4  DAM002  5.42  0.0  0.0  0.0  0.0  0.0
5  DAM002  5.42  0.0  0.0  0.0  0.0  0.0
6  DAM003  4.53  0.0  0.0  0.0  0.0  0.0
7  DAM003  4.53  0.0  0.0  0.0  0.0  0.0
8  DAM003  4.53  0.0  0.0  0.0  0.0  0.0


In [59]:
# Recover the original column names so they can be put back on the new dataframe
col_names = df_predict_age.columns.tolist()
print(col_names[:5])

['id', 'age_yr', 'adult', 'vector_0', 'vector_1']


In [60]:
# Restore the column headers on the dataframe built after imputation
df_predict_age_fix.columns = col_names
print(df_predict_age_fix.iloc[:9, :5])

       id  age_yr  adult  vector_0  vector_1
0  DAM001    5.70    0.0       0.0       0.0
1  DAM001    5.70    0.0       0.0       0.0
2  DAM001    5.70    0.0       0.0       0.0
3  DAM002    5.42    0.0       0.0       0.0
4  DAM002    5.42    0.0       0.0       0.0
5  DAM002    5.42    0.0       0.0       0.0
6  DAM003    4.53    0.0       0.0       0.0
7  DAM003    4.53    0.0       0.0       0.0
8  DAM003    4.53    0.0       0.0       0.0


In [61]:
# Compare the missing-value counts before and after imputation

# before: the merged dataframe
print("BEFORE imputation")
NA = df_predict_age.isnull().sum()
print(NA.iloc[:2], "\n") # we have 11 rows with missing values for age

# after: the dataframe rebuilt from the imputed values
print("AFTER imputation")
NA_imp = df_predict_age_fix.isnull().sum()
print(NA_imp.iloc[:2]) # Now we have 0 NA values in our new dataframe

BEFORE imputation
id         0
age_yr    11
dtype: int64 

AFTER imputation
id        0
age_yr    0
dtype: int64


In [0]:
# Write the imputed dataframe to a comma-separated file for later use, without the index column
imputed_csv_name = "predict_age_from_vectors_block5_pool_all_imp.csv"
df_predict_age_fix.to_csv(imputed_csv_name, sep=',', index=False)

# Preprocessing data for Neural Network

## Split labels from features

In [0]:
# Pull the id and the two label columns out of the dataframe. pop() removes
# each column in place, so afterwards df_predict_age_fix holds only the
# feature vectors (and re-running this cell raises KeyError).
df_id, df_age, df_adult = (
    df_predict_age_fix.pop(label) for label in ('id', 'age_yr', 'adult')
)

In [63]:
# Preview the first three entries of the id, age and adult series
for label_series in (df_id, df_age, df_adult):
    print(label_series.iloc[:3])

0    DAM001
1    DAM001
2    DAM001
Name: id, dtype: object
0    5.7
1    5.7
2    5.7
Name: age_yr, dtype: float64
0    0.0
1    0.0
2    0.0
Name: adult, dtype: float64


In [64]:
# Convert both label series to float64 numpy arrays
df_age_array = df_age.to_numpy(dtype='float64')
df_adult_array = df_adult.to_numpy(dtype='float64')
# show the types first, then the first nine values of each array
for label_array in (df_age_array, df_adult_array):
    print(type(label_array))
for label_array in (df_age_array, df_adult_array):
    print(label_array[:9])

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[5.7  5.7  5.7  5.42 5.42 5.42 4.53 4.53 4.53]
[0. 0. 0. 0. 0. 0. 0. 0. 0.]


## Standardize

In [66]:
# Standardize the features to mean = 0, sd = 1

# features as a float64 numpy array
df_net_array = df_predict_age_fix.to_numpy(dtype='float64')

# fit the scaler, then apply it to the same array
# NOTE(review): the scaler is fit on all rows; if a train/test split happens
# downstream, it should be fit on the training rows only -- TODO confirm
scaler = StandardScaler()
scaler.fit(df_net_array)
df_net_array_res = scaler.transform(df_net_array)

# preview the transformed data with 3 decimals
np.set_printoptions(precision=3)
print(df_net_array_res[:9, :9])

[[-0.097  0.     0.    -0.097 -0.235 -0.11  -0.064 -0.088  0.   ]
 [-0.097  0.     0.    -0.097 -0.235 -0.11  -0.064 -0.088  0.   ]
 [-0.097  0.     0.    -0.097 -0.235 -0.11  -0.064 -0.088  0.   ]
 [-0.097  0.     0.    -0.097 -0.111 -0.11  -0.064 -0.088  0.   ]
 [-0.097  0.     0.    -0.097  0.931 -0.11  -0.064 -0.088  0.   ]
 [-0.097  0.     0.    -0.097 -0.235 -0.11  -0.064 -0.088  0.   ]
 [-0.097  0.     0.    -0.097 -0.235 -0.11  -0.064 -0.088  0.   ]
 [-0.097  0.     0.    -0.097 -0.155 -0.11  -0.064 -0.088  0.   ]
 [-0.097  0.     0.    -0.097 -0.235 -0.11  -0.064 -0.088  0.   ]]


## Normalize

In [67]:
# Normalize the standardized features to length = 1

scaler = Normalizer()
scaler.fit(df_net_array_res)
df_net_array_res_norm = scaler.transform(df_net_array_res)

# preview the transformed data with 3 decimals
np.set_printoptions(precision=3)
print(df_net_array_res_norm[:9, :9])

[[-0.001  0.     0.    -0.001 -0.003 -0.001 -0.001 -0.001  0.   ]
 [-0.001  0.     0.    -0.001 -0.003 -0.001 -0.001 -0.001  0.   ]
 [-0.001  0.     0.    -0.001 -0.003 -0.001 -0.001 -0.001  0.   ]
 [-0.001  0.     0.    -0.001 -0.001 -0.001 -0.001 -0.001  0.   ]
 [-0.001  0.     0.    -0.001  0.011 -0.001 -0.001 -0.001  0.   ]
 [-0.001  0.     0.    -0.001 -0.002 -0.001 -0.001 -0.001  0.   ]
 [-0.001  0.     0.    -0.001 -0.002 -0.001 -0.001 -0.001  0.   ]
 [-0.002  0.     0.    -0.002 -0.003 -0.002 -0.001 -0.001  0.   ]
 [-0.001  0.     0.    -0.001 -0.002 -0.001 -0.001 -0.001  0.   ]]


## Compare raw, standardized, and normalized max and min values

In [68]:
# Largest value of the raw, standardized and normalized arrays
print("Max values")
print(df_net_array.max())
print(df_net_array_res.max())
print(df_net_array_res_norm.max(), "\n")
# Smallest value of the same three arrays
print("Min values")
print(df_net_array.min())
print(df_net_array_res.min())
print(df_net_array_res_norm.min(), "\n")
# Shape of the final feature matrix
print("Shape")
print(df_net_array_res_norm.shape)

Max values
265.2994079589844
16.03121954188141
0.35538105580639745 

Min values
0.0
-7.156054591501433
-0.05109213937293871 

Shape
(258, 25088)


## Save data

### Save features dataframe

In [0]:
# Write the standardized and normalized feature matrix as comma-separated text
np.savetxt(fname="x_drawings_features_max_pool_5.csv", X=df_net_array_res_norm, delimiter=",")

### Save labels dataframes

In [0]:
# Write the two label arrays as comma-separated text:
# first the labels for age prediction, then the labels for age classification
label_outputs = (
    ("y_age_years_labels.csv", df_age_array),
    ("y_age_adult_labels.csv", df_adult_array),
)
for labels_file, labels in label_outputs:
    np.savetxt(labels_file, labels, delimiter=",")

# Resources

- For imputation of missing values 
https://github.com/iskandr/fancyimpute



