# Hylas (TYPE 1) - custom prediction using Sherlock methods

In [1]:
%load_ext autoreload
%autoreload 2

## Generate data sets from extracted CSV product column data

In [2]:
import os
import re
from ast import literal_eval
import pandas
import pyarrow.parquet as pq
import pyarrow.csv as pc
from datetime import datetime
import multiprocessing


def tryeval(val):
  """Best-effort conversion of a raw text value into a Python literal.

  Args:
    val: the text to interpret (e.g. ``"42"``, ``"'abc'"``, ``"Kids Unisex"``).

  Returns:
    The value produced by ``ast.literal_eval`` when ``val`` is a valid Python
    literal; otherwise ``val`` itself, unchanged.
  """
  try:
    return literal_eval(val)
  except (ValueError, TypeError, SyntaxError, RecursionError):
    # literal_eval raises TypeError for input such as "{[1]: 2}" (unhashable
    # key) and RecursionError for very deeply nested input. Treat both the same
    # way as any other non-literal text, so that a single odd value cannot
    # abort the whole pool.map call.
    return val


# Only files whose name starts with "article_" or "product_" are processed.
pattern = re.compile("^(article|product)_")

# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
by_key = '/Users/lowecg/mapped/attributes/brand-to-canonical/9/by-key'

print(f'Treating file data with {os.cpu_count()} cores')


start = datetime.now()

# Collect one entry per processed file and build the frames once at the end,
# instead of growing the DataFrames row by row.
type_names = []
value_lists = []

# Create the pool AFTER the methods that will be called from it (tryeval) are
# defined. The context manager shuts the worker processes down when the loop
# finishes or fails, so they are not leaked into the kernel session.
with multiprocessing.Pool(os.cpu_count()) as pool:
  for file_name in os.listdir(by_key):
    if not pattern.match(file_name):
      print('IGNORED: ', file_name)
      continue

    print(f'Processing "{file_name}"')

    # Read everything first so the file is closed before the parallel parse.
    with open(os.path.join(by_key, file_name), 'r') as attributes_file:
      data = attributes_file.readlines()

    type_names.append(file_name)
    # Each line is stripped and parsed; the resulting list is stored as its
    # string representation (one string per file).
    value_lists.append(str(list(pool.map(tryeval, map(str.strip, data)))))

# Rows are numbered from 1 so that labels and samples share the same index.
row_index = pandas.RangeIndex(1, len(type_names) + 1)
df_labels = pandas.DataFrame({'type': type_names}, index=row_index, dtype=object)
df_samples = pandas.DataFrame({'values': value_lists}, index=row_index, dtype=object)


print('Saving to parquet')

# The target path is passed positionally: the keyword was renamed from `fname`
# to `path` in newer pandas releases, so the positional form works with both.
df_labels.to_parquet('myfile_labels.parquet', engine='auto', compression='snappy')
df_samples.to_parquet('myfile_values.parquet', engine='auto', compression='snappy')

end = datetime.now()
x = end - start
print(f'Write process took {x} seconds.')

Treating file data with 8 cores
Processing "product_brandGender2"
Processing "product_ageGroup1"
Processing "product_sportPurpose3"
Processing "product_sportPurpose4"
Processing "product_sportPurpose2"
Processing "product_brandCategory4_footwear"
Processing "article_brandColourName"
Processing "product_brandCategory1"
Processing "article_id"
Processing "product_brandGender1"
Processing "product_type"
Processing "article_brandSize"
Processing "product_ageGroup2"
Processing "product_brandSeason"
Processing "product_brandSizeGrid"
Processing "product_sportPurpose1"
Processing "product_primaryCategoryIdCode"
Processing "product_id"
Processing "product_brandCategory3"
Processing "product_brandCategory2"
Saving to parquet
Write process took 0:00:10.109004 seconds.


In [3]:
import pandas as pd

# Reload the two frames written to parquet by the previous cell.
print('Creating dataframe task (myfile_samples): ', datetime.now())

myfile_labels, myfile_samples = (
    pd.read_parquet(parquet_name)
    for parquet_name in ('myfile_labels.parquet', 'myfile_values.parquet')
)

# The 'values' strings are intentionally left unparsed here: this operation is
# now handled by convert_string_lists_to_lists in a later step. Previous
# in-notebook conversion, kept for reference:
#print('Starting task (myfile_samples): ', datetime.now())
#myfile_samples = myfile_samples['values'].apply(literal_eval)
#print('Finished task (myfile_samples): ', datetime.now())

Creating dataframe task (myfile_samples):  2021-02-09 12:05:44.696978


In [4]:
# Preview the label frame: one row per processed file, with the file name in 'type'.
myfile_labels.head(50)

Unnamed: 0,type
1,product_brandGender2
2,product_ageGroup1
3,product_sportPurpose3
4,product_sportPurpose4
5,product_sportPurpose2
6,product_brandCategory4_footwear
7,article_brandColourName
8,product_brandCategory1
9,article_id
10,product_brandGender1


In [5]:
# Preview the sample frame: each 'values' cell is the string form of one file's list of parsed values.
myfile_samples.head(20)

Unnamed: 0,values
1,"['KU', 'Girls', 'Kids Unisex', 'Boys', 'Boys',..."
2,"['M', 'K', 'W', 'Women', 'W', 'W', 'Women', 'W..."
3,"['Yoga', 'Yoga', 'Skateboarding', 'Training', ..."
4,"['Yoga', 'Winter Sports', 'Studio', 'Tennis', ..."
5,"['Running', 'Training', 'Yoga', 'Training', 'Y..."
6,"['Performance', 'Performance', 'Performance', ..."
7,"['White', 'White', 'White', 'White', 'White', ..."
8,"['Clothing', 'Shoes', 'Clothing', 'Clothing', ..."
9,"[4056562624693, 4056562625324, 4056562625379, ..."
10,"['M', 'M', 'K', 'Men', 'Men', 'M', 'M', 'Men',..."


In [6]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf

from sherlock import helpers
#from sherlock.features.preprocessing import extract_features, convert_string_lists_to_lists, prepare_feature_extraction
from sherlock.functional import extract_features_to_csv
from sherlock.deploy.predict_sherlock import predict_sherlock

## Extract features
It is important that the string-representations of lists are first converted into lists of strings.
The labels should be a list of semantic types.

### Given that feature extraction can take a long time, we only take the first 100 samples.

In [7]:
#y_test_subset = y_test[:100]

In [8]:
%load_ext line_profiler

In [9]:
# ensure embedding initialisation is outside of timing for extract_features
from sherlock.features.word_embeddings import initialise_word_embeddings
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
from sherlock.features.preprocessing import prepare_feature_extraction

# One-off setup performed before the timed extraction cell.
# Per the recorded cell output, this step reports whether the word/paragraph
# embedding files are present (downloading them if needed - TODO confirm).
prepare_feature_extraction()
initialise_word_embeddings()
# 400 is presumably the paragraph-vector dimension (the log reports "400 dim") - confirm.
initialise_pretrained_model(400)
initialise_nltk()

Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:06.461108 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:02.776021 seconds. (filename = ../sherlock/features/par_vec_retrained_400.pkl)
Initialised NLTK, process took 0:00:00.140686 seconds.


[nltk_data] Downloading package punkt to /Users/lowecg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lowecg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Time the feature extraction on its own; the embeddings were initialised in an
# earlier cell so their start-up cost is not included here.
sample_values = myfile_samples["values"]

start = datetime.now()
print(f'processing {sample_values}')

# Features for at most the first 100 samples are written to 'output2.csv'.
extract_features_to_csv('output2.csv', sample_values.head(n=100))

# Alternative invocations kept for reference (line profiling / older API):
#%lprun -m sherlock.features.preprocessing extract_features_to_csv('output2.csv', myfile_samples.head(n=100))
#X_test = extract_features('output.csv', test_samples_converted.head(n=100))

end = datetime.now()
x = end - start
print(f'Extract Features process took {x} seconds.')

processing 1     ['KU', 'Girls', 'Kids Unisex', 'Boys', 'Boys',...
2     ['M', 'K', 'W', 'Women', 'W', 'W', 'Women', 'W...
3     ['Yoga', 'Yoga', 'Skateboarding', 'Training', ...
4     ['Yoga', 'Winter Sports', 'Studio', 'Tennis', ...
5     ['Running', 'Training', 'Yoga', 'Training', 'Y...
6     ['Performance', 'Performance', 'Performance', ...
7     ['White', 'White', 'White', 'White', 'White', ...
8     ['Clothing', 'Shoes', 'Clothing', 'Clothing', ...
9     [4056562624693, 4056562625324, 4056562625379, ...
10    ['M', 'M', 'K', 'Men', 'Men', 'M', 'M', 'Men',...
11    ['product-canonical', 'product-canonical', 'pr...
12    ['XS', 'S', 'M', 'L', 'XL', '2XL', '3XL', '4XL...
13    ['KU', 'Girls', 'Kids Unisex', 'Boys', 'Boys',...
14    ['SS16', 'FW17', 'SS20', 'FW16', 'FW18', 'FW17...
15    ['size-m_bottoms', 'size-shoes', 'size-m_tops'...
16    ['Lifestyle', 'Training', 'Golf', 'Lifestyle',...
17    ['Clothing', 'Clothing', 'Clothing', 'Shoes', ...
18    ['S93310', 'BA7934', 'EK1320', 

In [11]:
# Baseline (before changes)
# Extract Features process took 0:03:11.954844 seconds.

# Tuning iterations
# Extract Features process took 0:00:13.869183 seconds. (cache Word Embeddings)
# Extract Features process took 0:00:06.143361 seconds. (cache Doc2Vec)
# Extract Features process took 0:00:01.308678 seconds. (improved computation for bag of character features)
# Extract Features process took 0:00:01.259591 seconds. (smaller optimisation tweaks - string cat, removal of double compute of some stats)
# Extract Features process took 0:00:00.650005 seconds. (use arrays not pd.Series for stats, series.str.count is also inefficient compared to loops)
# Extract Features process took 0:00:00.320845 seconds. (replace np stats calcs, unique values calc)

In [12]:
# Load the features written to 'output2.csv', parsing every column as float32.
X_test = pd.read_csv('output2.csv', dtype=np.float32)

# NOTE(review): print() yields a truncated plain-text dump of the frame.
print(X_test)

    n_[0]-agg-any  n_[0]-agg-all  n_[0]-agg-mean  n_[0]-agg-var  \
0             0.0            0.0           0.000       0.000000   
1             0.0            0.0           0.000       0.000000   
2             0.0            0.0           0.000       0.000000   
3             0.0            0.0           0.000       0.000000   
4             0.0            0.0           0.000       0.000000   
5             0.0            0.0           0.000       0.000000   
6             0.0            0.0           0.000       0.000000   
7             0.0            0.0           0.000       0.000000   
8             1.0            0.0           2.133       0.871311   
9             0.0            0.0           0.000       0.000000   
10            0.0            0.0           0.000       0.000000   
11            1.0            0.0           0.079       0.072759   
12            0.0            0.0           0.000       0.000000   
13            1.0            0.0           0.155       0.13097

In [13]:
import pandas as pd
import numpy as np

from sherlock.deploy.predict_sherlock import predict_sherlock
from datetime import datetime
from sklearn.metrics import f1_score, classification_report

In [14]:
# Run the model selected by nn_id over the extracted features; per the recorded
# output the result is an array of semantic-type names (presumably one per row of X_test).
predicted_labels = predict_sherlock(X_test, nn_id='retrained_sherlock10')

W0209 12:06:03.123449 4366253568 deprecation.py:506] From /Users/lowecg/source/private-github/sherlock-project/venv/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0209 12:06:03.124521 4366253568 deprecation.py:506] From /Users/lowecg/source/private-github/sherlock-project/venv/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0209 12:06:03.128082 4366253568 deprecation.py:506] From /Users/lowecg/source/private-github/sherlock-project/venv/lib/python3.7/site-pa

In [15]:
# Show the predicted semantic types.
print(predicted_labels)

['gender' 'gender' 'class' 'class' 'class' 'brand' 'company' 'type'
 'address' 'gender' 'address' 'age' 'gender' 'address' 'country'
 'description' 'type' 'address' 'product' 'description']


In [16]:
# Hand-assigned expected semantic type for each processed column file.
labels = {
  'product_brandGender2': 'gender',
  'product_ageGroup1': 'gender',
  'product_sportPurpose3': 'description',
  'product_sportPurpose4': 'description',
  'product_sportPurpose2': 'description',
  'product_brandCategory4_footwear': 'description',
  'article_brandColourName': 'colour',
  'product_brandCategory1': 'category',
  'article_id': 'code',
  'product_brandGender1': 'gender',
  'product_type': 'code',
  'article_brandSize': 'code',
  'product_ageGroup2': 'gender',
  'product_brandSeason': 'code',
  'product_brandSizeGrid': 'code',
  'product_sportPurpose1': 'description',
  'product_primaryCategoryIdCode': 'code',
  'product_id': 'code',
  'product_brandCategory3': 'category',
  'product_brandCategory2': 'category',
}

# Build y_test in the row order of myfile_labels rather than in the order the
# dict happens to be written: the files were discovered with os.listdir, whose
# order is arbitrary, so relying on insertion order could silently pair labels
# with the wrong feature rows. A file without an entry above raises KeyError.
y_test = [labels[file_name] for file_name in myfile_labels['type']]


In [17]:
# F1 of the predictions against the hand-assigned labels, averaged across classes weighted by support.
f1_score(y_test, predicted_labels, average="weighted")

0.27142857142857146