# Hylas - custom prediction using Sherlock methods

In [1]:
%load_ext autoreload
%autoreload 2

## Generate data sets from extracted CSV product column data

In [2]:
import os
import re
from ast import literal_eval
import pandas
import pyarrow.parquet as pq
import pyarrow.csv as pc
from datetime import datetime
import multiprocessing


def tryeval(val):
  """Parse ``val`` as a Python literal, falling back to the raw value.

  Args:
    val: text taken from an attribute file (typically one stripped line).

  Returns:
    The object produced by ``ast.literal_eval`` when ``val`` is a valid
    Python literal (e.g. ``"42"`` -> ``42``, ``"'W'"`` -> ``'W'``); otherwise
    ``val`` itself, unchanged.
  """
  try:
    return literal_eval(val)
  except (ValueError, SyntaxError, TypeError, RecursionError, MemoryError):
    # literal_eval signals malformed input with any of these; e.g. a value
    # such as "{[1]: 2}" parses fine but raises TypeError (unhashable key).
    # Free-text attribute values are expected, so keep them as plain strings
    # instead of letting one odd value abort the whole pool.map call.
    return val


# Only files whose name starts with "article_" or "product_" are treated as
# attribute columns; everything else in the directory is reported and skipped.
pattern = re.compile("^(article|product)_")

by_key = '/Users/lowecg/mapped/attributes/brand-to-canonical/9/by-key'

print(f'Treating file data with {os.cpu_count()} cores')


start = datetime.now()

# One row per attribute file: its name (the label) and the string form of the
# list of distinct values found in it (the sample).
df_labels = pandas.DataFrame(columns=['type'])
df_samples = pandas.DataFrame(columns=['values'])

idx = 0

# Declare pool AFTER methods that will be later called from pool.
# The with-block terminates the worker processes on exit, so re-running this
# cell no longer leaks a fresh set of workers each time. `pool` stays bound
# afterwards but can no longer be used.
with multiprocessing.Pool(os.cpu_count()) as pool:
  for file_name in os.listdir(by_key):
    if not pattern.match(file_name):
      print('IGNORED: ', file_name)
      continue

    print(f'Processing "{file_name}"')

    with open(os.path.join(by_key, file_name), 'r') as attributes_file:
      data = attributes_file.readlines()

    # Row labels start at 1, matching the index written to the parquet files.
    idx += 1

    df_labels.loc[idx, 'type'] = file_name

    # Strip BEFORE de-duplicating: otherwise a final line without a trailing
    # newline survives as a duplicate of an earlier "value\n" line.
    # dict.fromkeys keeps first-seen file order, so the stored list is
    # reproducible between runs (set() ordering is not).
    unique = list(dict.fromkeys(line.strip() for line in data))

    # Literal parsing is the only step worth farming out to the workers.
    to_store = str(pool.map(tryeval, unique))

    df_samples.loc[idx, 'values'] = to_store


print('Saving to parquet')

# The target path is passed positionally: the `fname` keyword was renamed to
# `path` in pandas and is rejected by current releases.
df_labels.to_parquet('myfile_labels.parquet', engine='auto', compression='snappy')
df_samples.to_parquet('myfile_values.parquet', engine='auto', compression='snappy')

end = datetime.now()
x = end - start
print(f'Write process took {x} seconds.')

Treating file data with 8 cores
Processing "product_brandGender2"
Processing "product_ageGroup1"
Processing "product_sportPurpose3"
Processing "product_sportPurpose4"
Processing "product_sportPurpose2"
Processing "product_brandCategory4_footwear"
Processing "article_brandColourName"
Processing "product_brandCategory1"
Processing "article_id"
Processing "product_brandGender1"
Processing "product_type"
Processing "article_brandSize"
Processing "product_ageGroup2"
Processing "product_brandSeason"
Processing "product_brandSizeGrid"
Processing "product_sportPurpose1"
Processing "product_primaryCategoryIdCode"
Processing "product_id"
Processing "product_brandCategory3"
Processing "product_brandCategory2"
Saving to parquet
Write process took 0:00:03.768062 seconds.


In [3]:
import pandas as pd

print('Creating dataframe task (myfile_samples): ', datetime.now())

# Reload the label and sample frames that the previous cell wrote to disk.
myfile_labels, myfile_samples = (
    pd.read_parquet(parquet_file)
    for parquet_file in ('myfile_labels.parquet', 'myfile_values.parquet')
)

# The string -> list conversion of the 'values' column is intentionally NOT
# done here any more; convert_string_lists_to_lists handles it in a later step.
# Former in-cell version, kept for reference:
#myfile_samples = myfile_samples['values'].apply(literal_eval)

Creating dataframe task (myfile_samples):  2021-01-01 15:44:04.174916


In [4]:
myfile_labels.head(50)

Unnamed: 0,type
1,product_brandGender2
2,product_ageGroup1
3,product_sportPurpose3
4,product_sportPurpose4
5,product_sportPurpose2
6,product_brandCategory4_footwear
7,article_brandColourName
8,product_brandCategory1
9,article_id
10,product_brandGender1


In [5]:
myfile_samples.head(20)

Unnamed: 0,values
1,"['KU', 'Infants', 'Boys', 'Girls', 'Kids Unisex']"
2,"['W', 'U', 'Unisex', 'Women', 'Men', 'M', 'Kid..."
3,"['Netball', 'Volleyball', 'Skateboarding', 'Wa..."
4,"['Winter Sports', 'HIIT', 'Training', 'Handbal..."
5,"['Volleyball', 'Skateboarding', 'Running', 'Wa..."
6,"['Y-3', 'adidas neo', 'Classics', 'adidas TERR..."
7,"['Dgh Solid Grey/Gun Metallic', 'Scarlet / Whi..."
8,"['Clothing', 'Accessories', 'Shoes']"
9,"[4059812314870, 4055014279801, 4054706518259, ..."
10,"['W', 'U', 'Unisex', 'Women', 'Men', 'M', 'K',..."


In [6]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf

from sherlock import helpers
from sherlock.features.preprocessing import extract_features, convert_string_lists_to_lists, prepare_feature_extraction
from sherlock.deploy.train_sherlock import train_sherlock
from sherlock.deploy.predict_sherlock import predict_sherlock

## Extract features
It is important that the string-representations of lists are first converted into lists of strings.
The labels should be a list of semantic types.

In [7]:
test_samples_converted, y_test = convert_string_lists_to_lists(myfile_samples, myfile_labels, "values", "type")

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [8]:
test_samples_converted.head()

1              [KU, Infants, Boys, Girls, Kids Unisex]
2               [W, U, Unisex, Women, Men, M, Kids, K]
3    [Netball, Volleyball, Skateboarding, Walking, ...
4    [Winter Sports, HIIT, Training, Handball, Urba...
5    [Volleyball, Skateboarding, Running, Walking, ...
Name: values, dtype: object

### Given that feature extraction can take a long time, we only take the first 100 samples.

In [9]:
y_test_subset = y_test[:100]

In [10]:
%load_ext line_profiler

In [11]:
# ensure embedding initialisation is outside of timing for extract_features
# (the timings reported for the extract_features cell are meant to cover
# feature extraction only, not these one-off start-up costs)
from sherlock.features.word_embeddings import initialise_word_embeddings
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
from sherlock.features.preprocessing import prepare_feature_extraction

# Per the cell output: checks that the GloVe and paragraph-vector files are
# present (downloading them if needed).
prepare_feature_extraction()
# Per the cell output: initialises the word embeddings (~6 s in the recorded run).
initialise_word_embeddings()
# Per the cell output: initialises the 400-dimension Doc2Vec model.
initialise_pretrained_model(400)
# Per the cell output: initialises NLTK (punkt and stopwords packages).
initialise_nltk()

Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:05.834454 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:02.292727 seconds. (filename = ../sherlock/features/par_vec_retrained_400.pkl)
Initialised NLTK, process took 0:00:00.158073 seconds.


[nltk_data] Downloading package punkt to /Users/lowecg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lowecg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
start = datetime.now()

%lprun -m sherlock.features.preprocessing X_test=extract_features('output2.csv', test_samples_converted.head(n=100)) 



#X_test = extract_features('output.csv', test_samples_converted.head(n=100))

end = datetime.now()
x = end - start
print(f'Extract Features process took {x} seconds.')

Extracting Features:  75%|███████▌  | 15/20 [00:00<00:00, 60.66it/s]

Exporting 1578 column features


Extracting Features: 100%|██████████| 20/20 [00:00<00:00, 74.93it/s]

Extract Features process took 0:00:00.320845 seconds.





In [13]:
# Baseline (before changes)
# Extract Features process took 0:03:11.954844 seconds.

# Tuning iterations
# Extract Features process took 0:00:13.869183 seconds. (cache Word Embeddings)
# Extract Features process took 0:00:06.143361 seconds. (cache Doc2Vec)
# Extract Features process took 0:00:01.308678 seconds. (improved computation for bag of character features)
# Extract Features process took 0:00:01.259591 seconds. (smaller optimisation tweaks - string cat, removal of double compute of some stats)
# Extract Features process took 0:00:00.650005 seconds. (use arrays not pd.Series for stats, series.str.count is also inefficient compared to loops)
# Extract Features process took 0:00:00.320845 seconds. (replace np stats calcs, unique values calc)

In [14]:
X_test.head(20)

AttributeError: 'NoneType' object has no attribute 'head'

In [None]:
#X_orig = X_test

In [None]:
# Regression check: the features from the current code must equal a baseline
# frame captured before the change.
# NOTE(review): X_orig is not assigned anywhere in the notebook as saved (the
# only assignment, `X_orig = X_test`, is commented out in the previous cell),
# so on a fresh kernel this raises NameError. Capture the baseline first.
X_orig.equals(X_test)

In [None]:
import time

# Timestamp the export name so repeated runs never overwrite an earlier file.
timestamp = time.strftime("%Y%m%d-%H%M%S")
filename = "htest_{timestr}.csv".format(timestr=timestamp)

# Persist the extracted features without the DataFrame index column.
X_test.to_csv(filename, index=False)