<a href="https://colab.research.google.com/github/pabloinsente/CovNet_Human_Drawings/blob/master/CovNet_VGG_19_Age_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Age prediction from features extracted from drawings with VGG-19 CovNet

In [4]:
# install seaborn for pairplot
!pip install -q seaborn
# install rarfile to read feature file
!pip install rarfile

Collecting rarfile
[?25l  Downloading https://files.pythonhosted.org/packages/de/a4/8b4abc72310da6fa53b6de8de1019e0516885d05369d6c91cba23476abe5/rarfile-3.0.tar.gz (110kB)
[K    100% |████████████████████████████████| 112kB 2.7MB/s 
[?25hBuilding wheels for collected packages: rarfile
  Building wheel for rarfile (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/dc/84/da/8aff50941f548db5384b076d5a6a6afea0cd12672e0326edc4
Successfully built rarfile
Installing collected packages: rarfile
Successfully installed rarfile-3.0


In [2]:
from __future__ import absolute_import, division, print_function

import pathlib

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import rarfile
import csv


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

1.13.1


In [3]:
# Clone the data into Colab
! git clone https://github.com/pabloinsente/CovNet_Human_Drawings
# Run this just once per sesion

Cloning into 'CovNet_Human_Drawings'...
remote: Enumerating objects: 498, done.[K
remote: Counting objects: 100% (498/498), done.[K
remote: Compressing objects: 100% (434/434), done.[K
remote: Total 498 (delta 93), reused 438 (delta 60), pack-reused 0[K
Receiving objects: 100% (498/498), 43.07 MiB | 44.20 MiB/s, done.
Resolving deltas: 100% (93/93), done.


## Data wrangling - Dataframe for Neural Net

In [20]:
# Read csv files from compressed rar file and convert into a dataframe
rar_path = rarfile.RarFile("CovNet_Human_Drawings/data/vectors_features/vgg19_vectors_drawings_block5_pool_all.rar")
csv_file_name = "vgg19_vectors_drawings_block5_pool_all.csv"
rar_file = rarfile.RarFile.open(rar_path, csv_file_name)
raw_dataset = pd.read_csv(rar_file, sep=",", header=None)
print(raw_dataset.shape)
print(type(raw_dataset))

(258, 25088)
<class 'pandas.core.frame.DataFrame'>


In [49]:
# create headers for vectors
n_vectors = raw_dataset.shape[1]
col_list =[]
[col_list.append("vector_{}".format(vector)) for vector in range(n_vectors)]
print(col_list[0:5])
print(len(col_list))

['vector_0', 'vector_1', 'vector_2', 'vector_3', 'vector_4']
25088


In [38]:
# add headers to vectors in raw dataframe
raw_dataset.columns = col_list
print(raw_dataset.head())

   vector_0  vector_1  vector_2  vector_3  vector_4  vector_5  vector_6  \
0       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
1       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
2       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
3       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
4       0.0       0.0       0.0       0.0       0.0       0.0       0.0   

   vector_7  vector_8  vector_9      ...       vector_25078  vector_25079  \
0       0.0       0.0       0.0      ...                0.0           0.0   
1       0.0       0.0       0.0      ...                0.0           0.0   
2       0.0       0.0       0.0      ...                0.0           0.0   
3       0.0       0.0       0.0      ...                0.0           0.0   
4       0.0       0.0       0.0      ...                0.0           0.0   

   vector_25080  vector_25081  vector_25082  vector_25083  vector_25084  \
0          

In [18]:
# Get the drwawings filenames from directory 
from os import listdir
from os.path import isfile, join

path = 'CovNet_Human_Drawings/data/human_drawings_all/'
filenames = [f for f in listdir(path) if isfile(join(path, f))]
len(filenames) # This should yield 258 

258

In [98]:
# Split the strings by the underscore and return the first element (row id for merging)
filenames_crop = []
n_id = len(filenames)
[filenames_crop.append(filenames[id].split('_')[0]) for id in range(n_id)]
print(filenames_crop[0:5])
print(len(filenames_crop)) # this should yield 258

['DAM058', 'DAM041', 'DAM078', 'DAM027', 'DAM027']
258


In [99]:
# There is "a" in the filenames. To merge this dataframe with the metadata, 
# we need to change that to uppercasem e.g. DAMa025 needs to change to DAMA025
filenames_up = [x.upper() for x in filenames_crop]
print(filenames_up[0:5])
print(len(filenames_up)) # this should yield 258

['DAM058', 'DAM041', 'DAM078', 'DAM027', 'DAM027']
258


In [54]:
# Get rows ids as dataframe
id_list = pd.DataFrame(filenames_up, columns=['id'])
print(len(id_list)) # This should yield 258 
print(type(id_list))
print(id_list.head())

258
<class 'pandas.core.frame.DataFrame'>
       id
0  DAM058
1  DAM041
2  DAM078
3  DAM027
4  DAM027


In [55]:
# merge vector's id and raw vectors
df_vectors = pd.concat([id_list, raw_dataset], axis=1)
print(df_vectors.shape) # this should yield (258, 25089)
print(df_vectors.head())

(258, 25089)
       id  vector_0  vector_1  vector_2  vector_3  vector_4  vector_5  \
0  DAM058       0.0       0.0       0.0       0.0       0.0       0.0   
1  DAM041       0.0       0.0       0.0       0.0       0.0       0.0   
2  DAM078       0.0       0.0       0.0       0.0       0.0       0.0   
3  DAM027       0.0       0.0       0.0       0.0       0.0       0.0   
4  DAM027       0.0       0.0       0.0       0.0       0.0       0.0   

   vector_6  vector_7  vector_8      ...       vector_25078  vector_25079  \
0       0.0       0.0       0.0      ...                0.0           0.0   
1       0.0       0.0       0.0      ...                0.0           0.0   
2       0.0       0.0       0.0      ...                0.0           0.0   
3       0.0       0.0       0.0      ...                0.0           0.0   
4       0.0       0.0       0.0      ...                0.0           0.0   

   vector_25080  vector_25081  vector_25082  vector_25083  vector_25084  \
0         

In [57]:
# read metadata as pandas dataframe
meta_patah= "CovNet_Human_Drawings/data/metadata_participants/Study 1 DAM masterdata053117.csv"
df_metadata = pd.read_csv(meta_patah, sep=",")
print(df_metadata.shape)
print(df_metadata.head())

(107, 397)
       id  drop IS_medium_order IS_shape_order IS_site  age_yr  age4g  female  \
0  DAM001     0             SPF            ABC     PSL    5.70      3     0.0   
1  DAM002     0             FSP            ACB     PSL    5.42      3     1.0   
2  DAM003     0             FSP            BAC     PSL    4.53      2     1.0   
3  DAM004     0             SPF            BCA     PSL    5.37      3     0.0   
4  DAM005     0             PFS            CBA     PSL    4.19      2     0.0   

   adult  child        ...         PS_enjoy_electronic  PS_min_traditional  \
0    0.0    1.0        ...                         NaN                 NaN   
1    0.0    1.0        ...                         7.0                  20   
2    0.0    1.0        ...                         NaN                  30   
3    0.0    1.0        ...                         NaN                 NaN   
4    0.0    1.0        ...                        10.0                  15   

   PS_min_electornic             

In [58]:
# subset metadata; get id and age columns
df_metadata_age = df_metadata[['id','age_yr']]
print(df_metadata_age.shape)
print(df_metadata_age.head())

(107, 2)
       id  age_yr
0  DAM001    5.70
1  DAM002    5.42
2  DAM003    4.53
3  DAM004    5.37
4  DAM005    4.19


In [60]:
# merge metadata and vectors by id
df_predict_age = pd.merge(df_metadata_age, df_vectors, on='id')
print(df_predict_age.shape) # this should yield (258, 25090)
print(df_predict_age.head())

(258, 25090)
       id  age_yr  vector_0  vector_1  vector_2  vector_3  vector_4  vector_5  \
0  DAM001    5.70       0.0       0.0       0.0       0.0  0.000000  0.000000   
1  DAM001    5.70       0.0       0.0       0.0       0.0  0.000000  8.308172   
2  DAM001    5.70       0.0       0.0       0.0       0.0  2.121198  0.000000   
3  DAM002    5.42       0.0       0.0       0.0       0.0  0.000000  0.000000   
4  DAM002    5.42       0.0       0.0       0.0       0.0  0.000000  0.000000   

   vector_6  vector_7      ...       vector_25078  vector_25079  vector_25080  \
0       0.0       0.0      ...                0.0           0.0      0.000000   
1       0.0       0.0      ...                0.0           0.0     10.788058   
2       0.0       0.0      ...                0.0           0.0      0.000000   
3       0.0       0.0      ...                0.0           0.0     29.385056   
4       0.0       0.0      ...                0.0           0.0      0.000000   

   vector_250

In [0]:
# Let's export the pandas dataframe to a csv for later
df_predict_age.to_csv("predict_age_from_vectors_block5_pool_all.csv", sep=',')

## Clean the data 

In [70]:
# print NA values
NA = df_predict_age.isna().sum()
print(NA[0:2]) # we have 11 rows with missing values for age

id         0
age_yr    11
dtype: int64


In [0]:
from fancyimpute import IterativeImputer
# IterativeImputer: A strategy for imputing missing values by modeling each 
# feature with missing values as a function of other features in a round-robin fashion.

# IterativeImputer only works with scalar values so we have to get rid of 
# id label first 
df_predict_age_na = df_predict_age_na = df_predict_age.loc[:, df_predict_age.columns != 'id']

# now we can do the imputation 
df_predict_age_imp = IterativeImputer().fit_transform(df_predict_age_na)

In [83]:
# IterativeImputer returns a np array, so we need to change it back to pandas
df_predict_age_pn = pd.DataFrame(df_predict_age_imp)
print(type(df_predict_age_pn))
print(df_predict_age_pn.head())

<class 'pandas.core.frame.DataFrame'>
   0      1      2      3      4         5         6      7      8      9      \
0   5.70    0.0    0.0    0.0    0.0  0.000000  0.000000    0.0    0.0    0.0   
1   5.70    0.0    0.0    0.0    0.0  0.000000  8.308172    0.0    0.0    0.0   
2   5.70    0.0    0.0    0.0    0.0  2.121198  0.000000    0.0    0.0    0.0   
3   5.42    0.0    0.0    0.0    0.0  0.000000  0.000000    0.0    0.0    0.0   
4   5.42    0.0    0.0    0.0    0.0  0.000000  0.000000    0.0    0.0    0.0   

   ...    25079  25080      25081  25082     25083      25084  25085  25086  \
0  ...      0.0    0.0   0.000000    0.0  0.630956   0.000000    0.0    0.0   
1  ...      0.0    0.0  10.788058    0.0  0.000000  22.443872    0.0    0.0   
2  ...      0.0    0.0   0.000000    0.0  0.000000   0.000000    0.0    0.0   
3  ...      0.0    0.0  29.385056    0.0  0.000000   0.000000    0.0    0.0   
4  ...      0.0    0.0   0.000000    0.0  1.291875   0.000000    0.0    0.0   



In [91]:
# merge vector's id and raw vectors
df_predict_age_fix = pd.concat([id_list, df_predict_age_pn], axis=1)
print(df_predict_age_fix.shape) # this should yield (258, 25089)
print(df_predict_age_fix.head())

(258, 25090)
       id     0    1    2    3    4         5         6    7    8  ...    \
0  DAM058  5.70  0.0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.0  ...     
1  DAM041  5.70  0.0  0.0  0.0  0.0  0.000000  8.308172  0.0  0.0  ...     
2  DAM078  5.70  0.0  0.0  0.0  0.0  2.121198  0.000000  0.0  0.0  ...     
3  DAM027  5.42  0.0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.0  ...     
4  DAM027  5.42  0.0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.0  ...     

   25079  25080      25081  25082     25083      25084  25085  25086  25087  \
0    0.0    0.0   0.000000    0.0  0.630956   0.000000    0.0    0.0    0.0   
1    0.0    0.0  10.788058    0.0  0.000000  22.443872    0.0    0.0    0.0   
2    0.0    0.0   0.000000    0.0  0.000000   0.000000    0.0    0.0    0.0   
3    0.0    0.0  29.385056    0.0  0.000000   0.000000    0.0    0.0    0.0   
4    0.0    0.0   0.000000    0.0  1.291875   0.000000    0.0    0.0    0.0   

   25088  
0    0.0  
1    0.0  
2    0.0  
3    0.0  


In [97]:
# let's get the original col names to add that back to the new dataframe
col_names = list(df_predict_age.columns.values)
print(col_names[0:5])

['id', 'age_yr', 'vector_0', 'vector_1', 'vector_2']


In [93]:
# add headers to vectors into dataframe after imputation
df_predict_age_fix.columns = col_names
print(df_predict_age_fix.head())

       id  age_yr  vector_0  vector_1  vector_2  vector_3  vector_4  vector_5  \
0  DAM058    5.70       0.0       0.0       0.0       0.0  0.000000  0.000000   
1  DAM041    5.70       0.0       0.0       0.0       0.0  0.000000  8.308172   
2  DAM078    5.70       0.0       0.0       0.0       0.0  2.121198  0.000000   
3  DAM027    5.42       0.0       0.0       0.0       0.0  0.000000  0.000000   
4  DAM027    5.42       0.0       0.0       0.0       0.0  0.000000  0.000000   

   vector_6  vector_7      ...       vector_25078  vector_25079  vector_25080  \
0       0.0       0.0      ...                0.0           0.0      0.000000   
1       0.0       0.0      ...                0.0           0.0     10.788058   
2       0.0       0.0      ...                0.0           0.0      0.000000   
3       0.0       0.0      ...                0.0           0.0     29.385056   
4       0.0       0.0      ...                0.0           0.0      0.000000   

   vector_25081  vector_25

In [94]:
# print NA values before imputation
NA = df_predict_age.isna().sum()
print(NA[0:2]) # we have 11 rows with missing values for age

# print NA values after imputation
NA_imp = df_predict_age_fix.isna().sum()
print(NA_imp[0:2]) # Now we have 0 NA values in our new dataframe

id         0
age_yr    11
dtype: int64
id        0
age_yr    0
dtype: int64


In [0]:
# Let's export the pandas dataframe to a csv for later
df_predict_age_fix.to_csv("predict_age_from_vectors_block5_pool_all_imp.csv", sep=',')

## Split the data into train and test for training

To avoid re-running the data wrangling process, now we can load the dataset from memory. We can skip everything above this section by running the two cells below

In [0]:
from __future__ import absolute_import, division, print_function

import pathlib

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import rarfile
import csv


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

In [0]:
# Run this only if you haven't clone the data before
# Clone the data into Colab
! git clone https://github.com/pabloinsente/CovNet_Human_Drawings

In [0]:
df_path = "CovNet_Human_Drawings/data/merged_dataframes_prediction/predict_age_from_vectors_block5_pool_all_imp.csv"
df_net = pd.read_csv(df_path, sep=",")
print(df_net.shape)
print(df_net.head())

In [0]:
# We will use 80% of the data for training and 20% for testing
train_dataset = df_net.sample(frac=0.8,random_state=0)
test_dataset = df_net.drop(train_dataset.index)

In [0]:
# We will skip exploring the data because we have ~25,000 uninterpretable features

## Split features from labels


In [0]:
# Separate the target value, or "label", from the features, "agre_yr" 
# This label is the value that we will train the model to predict.
train_labels = train_dataset.pop('age_yr')
test_labels = test_dataset.pop('age_yr')