# Hylas - custom prediction using Sherlock methods

In [1]:
%load_ext autoreload
%autoreload 2

## Generate data sets from extracted CSV product column data

In [2]:
import os
import re
from ast import literal_eval
import pandas
import pyarrow.parquet as pq
import pyarrow.csv as pc
from datetime import datetime
import multiprocessing


def tryeval(val):
  try:
    val=literal_eval(val)
  except (ValueError, SyntaxError):
    pass
  return val


# Declare pool AFTER methods that will be later called from pool.
pool = multiprocessing.Pool(os.cpu_count())


pattern = re.compile("^(article|product)_")

by_key = '/Users/lowecg/mapped/attributes/brand-to-canonical/9/by-key'

print(f'Treating file data with {os.cpu_count()} cores')


start = datetime.now()

df_labels = pandas.DataFrame(columns=['type'])
df_samples = pandas.DataFrame(columns=['values'])

idx = 0

for file_name in os.listdir(by_key):
  if pattern.match(file_name):
    print(f'Processing "{file_name}"')
    
    with open (by_key + '/' + file_name, 'r') as attributes_file:
      data=attributes_file.readlines()
        
      idx += 1      
        
      df_labels.loc[idx, 'type'] = file_name
    
      unique = list(pool.map(str.strip, set(data)))
        
      to_store = str(list(pool.map(tryeval, unique)))
    
      df_samples.loc[idx, 'values'] = to_store
        
  else:
    print('IGNORED: ', file_name)

    
print('Saving to parquet')
    
df_labels.to_parquet(fname='myfile_labels.parquet',engine='auto',compression='snappy')
df_samples.to_parquet(fname='myfile_values.parquet',engine='auto',compression='snappy')

end = datetime.now()
x = end - start
print(f'Write process took {x} seconds.')

Treating file data with 8 cores
Processing "product_brandGender2"
Processing "product_ageGroup1"
Processing "product_sportPurpose3"
Processing "product_sportPurpose4"
Processing "product_sportPurpose2"
Processing "product_brandCategory4_footwear"
Processing "article_brandColourName"
Processing "product_brandCategory1"
Processing "article_id"
Processing "product_brandGender1"
Processing "product_type"
Processing "article_brandSize"
Processing "product_ageGroup2"
Processing "product_brandSeason"
Processing "product_brandSizeGrid"
Processing "product_sportPurpose1"
Processing "product_primaryCategoryIdCode"
Processing "product_id"
Processing "product_brandCategory3"
Processing "product_brandCategory2"
Saving to parquet
Write process took 0:00:03.630735 seconds.


Process ForkPoolWorker-7:
Process ForkPoolWorker-5:
Process ForkPoolWorker-8:
Process ForkPoolWorker-3:
Process ForkPoolWorker-2:
Process ForkPoolWorker-6:
Process ForkPoolWorker-4:
Process ForkPoolWorker-1:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/opt/python@3.7/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/local/opt/python@3.7/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/local/opt/python@3.7/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/local/opt/python@3.7/Framewo

In [3]:
import pandas as pd

print('Creating dataframe task (myfile_samples): ', datetime.now())

myfile_labels = pd.read_parquet('myfile_labels.parquet')
myfile_samples = pd.read_parquet('myfile_values.parquet')

#print('Starting task (myfile_samples): ', datetime.now())

# this operation is now handled by convert_string_lists_to_lists in a later step
#myfile_samples = myfile_samples['values'].apply(literal_eval)

#print('Finished task (myfile_samples): ', datetime.now())

Creating dataframe task (myfile_samples):  2020-12-16 18:23:58.014109


In [4]:
myfile_labels.head(50)

Unnamed: 0,type
1,product_brandGender2
2,product_ageGroup1
3,product_sportPurpose3
4,product_sportPurpose4
5,product_sportPurpose2
6,product_brandCategory4_footwear
7,article_brandColourName
8,product_brandCategory1
9,article_id
10,product_brandGender1


In [5]:
myfile_samples.head(20)

Unnamed: 0,values
1,"['Kids Unisex', 'Infants', 'Girls', 'Boys', 'KU']"
2,"['K', 'M', 'W', 'Unisex', 'U', 'Kids', 'Women'..."
3,"['Training', 'Skateboarding', 'Urban Outdoor',..."
4,"['Field Hockey', 'Training', 'Studio', 'Yoga',..."
5,"['Training', 'Skateboarding', 'Urban Outdoor',..."
6,"['Performance', 'Porsche Design Sport by adida..."
7,"['White/Clear Grey/Real Red', 'Silver Met. / H..."
8,"['Accessories', 'Clothing', 'Shoes']"
9,"[4058025845355, 4061619900118, 4055338882862, ..."
10,"['K', 'M', 'W', 'Unisex', 'U', 'Kids', 'Women'..."


In [6]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf

from sherlock import helpers
from sherlock.features.preprocessing import extract_features, convert_string_lists_to_lists, prepare_feature_extraction
from sherlock.deploy.train_sherlock import train_sherlock
from sherlock.deploy.predict_sherlock import predict_sherlock

## Extract features
It is important that the string-representations of lists are first converted into lists of strings.
The labels should be a list of semantic types.

In [7]:
test_samples_converted, y_test = convert_string_lists_to_lists(myfile_samples, myfile_labels, "values", "type")

100%|██████████| 20/20 [00:03<00:00,  5.19it/s]


In [8]:
test_samples_converted.head()

1              [Kids Unisex, Infants, Girls, Boys, KU]
2               [K, M, W, Unisex, U, Kids, Women, Men]
3    [Training, Skateboarding, Urban Outdoor, Runni...
4    [Field Hockey, Training, Studio, Yoga, Handbal...
5    [Training, Skateboarding, Urban Outdoor, Cycli...
Name: values, dtype: object

### Given that feature extraction can take long, we only take the first 100 samples.

In [9]:
y_test_subset = y_test[:100]

In [10]:
%load_ext line_profiler

In [143]:
from sherlock.features.word_embeddings import initialise_word_embeddings
from sherlock.features.paragraph_vectors import initialise_pretrained_model

initialise_pretrained_model(400)
initialise_word_embeddings()

Initialise Doc2Vec Model, 400 dim, process took 0:00:00.367545 seconds.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:05.681946 seconds.


In [177]:
start = datetime.now()

%lprun -m sherlock.features.preprocessing X_test=extract_features(test_samples_converted.head(n=100)) 



#X_test = extract_features(test_samples_converted.head(n=100))

end = datetime.now()
x = end - start
print(f'Extract Features process took {x} seconds.')

Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.
Extract Features process took 0:00:01.340000 seconds.


In [139]:
# Baseline (before changes)
# Extract Features process took 0:03:11.954844 seconds.

# Tuning iterations
# Extract Features process took 0:00:13.869183 seconds. (cache Word Embeddings)
# Extract Features process took 0:00:06.143361 seconds. (cache Doc2Vec)
# Extract Features process took 0:00:01.308678 seconds. (improved computation for bag of character features)
# Extract Features process took 0:00:01.259591 seconds. (smaller optimisation tweaks - string cat, removal of double compute of some stats)

In [148]:
X_test.head(20)

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,False,False,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.0004,-0.000995,-0.001024,-6.5e-05,0.000573,-1.7e-05,0.001084,0.001129,-0.000982,0.000995
1,False,False,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.001176,-0.000821,-3.9e-05,0.000861,0.001157,-0.000436,0.000214,0.000536,0.001168,-0.000712
2,False,False,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.000822,-0.000257,0.001122,0.000378,1e-05,0.000157,0.000616,0.000633,0.000557,8.9e-05
3,False,False,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.00079,-0.000431,-0.000114,0.000305,0.001168,-0.000777,0.000465,-0.000518,-0.000969,0.000598
4,False,False,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.000563,0.000536,0.000407,0.001204,0.00066,0.00065,-0.000429,0.000567,0.001023,-0.000956
5,False,False,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.000263,-0.00013,0.001121,0.000421,-0.000161,-0.000862,0.000777,-0.000355,0.000746,-0.00032
6,False,False,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.00043,-0.000413,0.000511,-0.001177,7.8e-05,0.00124,-0.00089,0.000228,-0.000249,-0.000838
7,False,False,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.000187,0.000384,-0.000563,-0.000878,-0.000342,-0.001157,-0.001088,0.00015,-0.000759,-0.000853
8,True,True,1.666667,0.222222,1.0,2.0,2.0,5.0,-1.5,-0.707107,...,0.001208,0.000715,0.000309,-0.000837,-0.000835,0.000367,0.000449,-0.000909,0.001057,-0.000681
9,False,False,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.000261,-0.000801,-0.001031,0.000718,0.001085,-0.001109,-0.000121,-0.000304,-0.000248,-0.00095


In [157]:
#X_orig = X_test

In [161]:
X_orig.equals(X_test)

True

In [174]:
import time

filename = "htest_{timestr}.csv".format(timestr = time.strftime("%Y%m%d-%H%M%S"))

X_test.to_csv(filename, index=False)