# Hylas (TYPE 2) - custom prediction using Sherlock methods

In [1]:
%load_ext autoreload
%autoreload 2

## Generate data sets from extracted CSV product column data

In [38]:
import os
import re
from ast import literal_eval
import pandas
import pyarrow.parquet as pq
import pyarrow.csv as pc
from datetime import datetime
import multiprocessing
from functools import partial

def tryeval(val):
  """Parse ``val`` as a Python literal, falling back to the raw value.

  Args:
    val: Text of a single cell (e.g. ``'7.8'``, ``'true'``, ``'Black'``).

  Returns:
    The evaluated literal (int, float, tuple, ...) when ``val`` is a valid
    Python literal, otherwise ``val`` unchanged.
  """
  try:
    return literal_eval(val)
  # literal_eval is documented to raise any of these on malformed input;
  # e.g. '{[1]: 2}' parses fine but fails with TypeError (unhashable key).
  # This is a best-effort conversion, so every one of them means "keep text".
  except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
    return val


def row_value_processor_one(row):
    """Identity row processor used for single-column files."""
    # Nothing to split: the whole row is the value.
    return row


def row_value_processor_csv(row):
    """Split a CSV row into its first column and the remaining text.

    Args:
        row: One stripped line of a multi-column file.

    Returns:
        A ``(first, values)`` tuple, where ``values`` is everything after the
        first comma. A row without any comma (for example a blank line)
        yields ``(row, '')`` instead of raising ``ValueError``.
    """
    # partition never fails to unpack, unlike split(',', 1) on comma-less rows.
    first, _, values = row.partition(',')

    return first, values


def row_filter_one(row):
    """Accept every row; single-column files need no filtering."""
    keep_row = True
    return keep_row


def row_filter_csv(row_tuple, filter_value):
    """Keep a processed CSV row only when its first element equals ``filter_value``."""
    first_column = row_tuple[0]
    return first_column == filter_value


def unpack_row_one(row):
    """Identity unpacker: a single-column row already is its value."""
    # Counterpart of unpack_row_csv for rows that were never split.
    return row


def unpack_row_csv(row_tuple):
    """Return the second element of a processed CSV row tuple."""
    values = row_tuple[1]
    return values


# Declare pool AFTER methods that will be later called from pool.
pool = multiprocessing.Pool(os.cpu_count())


# File names made up solely of a GUID (optionally wrapped in braces) are the
# per-column data files; everything else in the directory is ignored.
pattern = re.compile(r"^[{]?[0-9a-fA-F]{8}-([0-9a-fA-F]{4}-){3}[0-9a-fA-F]{12}[}]?$")

# this is actually a directory, and not a file as it might appear from this path
by_column = '/Users/lowecg/source/labs/data/xml-discover/.cache/5e71870f-b7e2-48eb-9ca0-1ee67fe59845_ipp_productinfo_eu_20201030130300.xml'

print(f'Treating file data with {os.cpu_count()} cores')


start = datetime.now()

# Collect one label and one stringified value list per column file, then build
# the frames once at the end instead of growing them row by row.
column_labels = []
column_values = []

ignored_files = []

try:
    for file_name in os.listdir(by_column):
        if not pattern.match(file_name):
            ignored_files.append(file_name)
            continue

        with open(os.path.join(by_column, file_name), 'r') as attributes_file:
            data = attributes_file.readlines()

        # An empty file has no header line to take a label from.
        if not data:
            print(f'Skipping empty file "{file_name}"')
            continue

        # First line is the header: either a single label or "key,label".
        label = str.strip(data[0])
        values = label.split(',')

        if len(values) > 1:
            row_processor = row_value_processor_csv
            # peek at first data row, and use the first column as a filter value
            # (a header-only file has no data row, hence the len(data) > 1 guard)
            filter_value = data[1].split(',')[0] if len(data) > 1 else None

            row_filter = partial(row_filter_csv, filter_value=filter_value)
            unpack_row = unpack_row_csv
            label = values[1]
        else:
            row_processor = row_value_processor_one
            row_filter = row_filter_one
            unpack_row = unpack_row_one

        print(f'Processing "{file_name}" -> "{label}"')

        # strip -> split -> filter -> unpack run lazily in this process;
        # only the literal parsing (tryeval) is fanned out to the pool.
        stripped_rows = map(str.strip, data[1:])
        processed_rows = map(row_processor, stripped_rows)
        kept_rows = filter(row_filter, processed_rows)
        raw_values = map(unpack_row, kept_rows)

        to_store = str(list(pool.map(tryeval, raw_values)))

        column_labels.append(label)
        column_values.append(to_store)
finally:
    # Release the worker processes even if a file fails to parse; otherwise
    # every re-run of this cell leaks another set of workers.
    pool.close()
    pool.join()


# Index starts at 1, matching the row numbering the frames had previously.
row_index = range(1, len(column_labels) + 1)
df_labels = pandas.DataFrame({'type': column_labels}, index=row_index, dtype=object)
df_samples = pandas.DataFrame({'values': column_values}, index=row_index, dtype=object)

print('IGNORED: ', ignored_files)


print('Saving to parquet')

# The target path is passed positionally: the keyword was `fname` in old pandas
# releases and is `path` in current ones, so no keyword works everywhere.
df_labels.to_parquet('myfile_labels.parquet', engine='auto', compression='snappy')
df_samples.to_parquet('myfile_values.parquet', engine='auto', compression='snappy')

end = datetime.now()
x = end - start
print(f'Write process took {x} seconds.')

Treating file data with 8 cores
Processing "a9f8041c-1072-4628-8081-82f931eae137" -> "/products/PRODUCT/attributes/attribute/Access Code"
Processing "bb2d07c3-c777-4525-8017-41c9ef4333b6" -> "/products/PRODUCT/sizes/size/packageheight"
Processing "9e2384d0-2031-4139-a08b-cf83a14f5b04" -> "/products/PRODUCT/attributes/attribute/Simple Color"
Processing "1cd73091-e357-4051-89e2-92e8003d2435" -> "/products/PRODUCT/prices/price/country (a)"
Processing "9647cbf9-ab8c-4cf4-a612-037fc22b5f63" -> "/products/PRODUCT/attributes/attribute/Hat Type"
Processing "e511878a-fb0a-4d07-8e26-f6f0e99b9307" -> "/products/PRODUCT/attributes/attribute/Bag Type"
Processing "44dc383c-7386-4fa0-aea8-6217b855b57e" -> "/products/PRODUCT/attributes/attribute/Gift Collections"
Processing "7a1e72e0-1116-449e-bcc2-46df87f574d5" -> "/products/PRODUCT/attributes/attribute/Store - Name Change Test"
Processing "f374185a-656d-49f1-9542-0b41e204fc1e" -> "/products/PRODUCT/attributes/attribute/Global Football Use (FCB)"
Pro

Processing "c1b58815-0c76-4a0b-8927-d79d62c99f8d" -> "/products/PRODUCT/attributes/attribute/Sport - Primary"
Processing "c6a0f980-78ff-4751-8210-b7ce426d9802" -> "/products/PRODUCT/attributes/attribute/Gender"
Processing "c11d6095-536a-4d17-8d96-4e58642b5214" -> "/products/PRODUCT/attributes/attribute/Sock Type"
Processing "1645d338-708f-4109-ad67-aa0a3dd24d97" -> "/products/PRODUCT/sizes/size/grosspackageweight"
Processing "56286015-c6e1-46a2-95c8-da8ae98d5ae6" -> "/products/PRODUCT/attributes/attribute/Running Shoe Benefit"
Processing "f5c8503c-8ce0-4f3b-9001-f38e1f3b582f" -> "/products/PRODUCT/attributes/attribute/Accessories and Equipment"
Processing "12087fca-f81a-4fc2-b66f-5c820e063408" -> "/products/PRODUCT/attributes/attribute/NIKEiD Match Style"
Processing "842b61c9-ead3-4720-bca9-9f0b42a91155" -> "/products/PRODUCT/attributes/attribute/Sets"
Processing "cb71d16e-b524-4ab3-8fc6-b36d38164e0f" -> "/products/PRODUCT/attributes/attribute/Category"
Processing "db284728-9990-433f-8

Processing "489f3faa-2176-4c1c-a467-0941b98e4f10" -> "/products/PRODUCT/attributes/attribute/Builder Version"
Processing "ab13746d-4f84-4dab-aa64-ea89d8ed48b8" -> "/products/PRODUCT/attributes/attribute/Icon"
Processing "988b4cfe-73f8-4bf4-ab2b-974a1927e58d" -> "/products/PRODUCT/attributes/attribute/Sports Bras Style"
Processing "07f8c7a8-b652-43c4-a9b4-fc19a10c1aeb" -> "/products/PRODUCT/attributes"
Processing "7a469cd2-4f5d-42dc-b696-18afb181c6aa" -> "/products/PRODUCT/attributes/attribute/JerseyiDSKU"
Processing "18c4c04f-e74c-4269-a71a-8c44313a369e" -> "/products/PRODUCT/attributes/attribute/NIKEiD Match"
Processing "a4b4e08f-03ba-4d84-9ba3-1a13880eeedb" -> "/products/PRODUCT/colornumber"
Processing "e27c0232-fdb5-4257-96b8-694638d1155e" -> "/products/PRODUCT/sizes/size/netweight"
Processing "c2621d28-635e-4dbb-91b0-7da6e10f88b3" -> "/products/PRODUCT/attributes/attribute/Tertiary Color"
Processing "9cb92d69-9a98-4f73-9fa2-a79caf2da68f" -> "/products/PRODUCT/prices/price/currentre

Write process took 0:01:05.883698 seconds.


In [39]:
import pandas as pd

# Reload the label and value frames written by the previous cell.
print('Creating dataframe task (myfile_samples): ', datetime.now())

myfile_labels, myfile_samples = (
    pd.read_parquet(parquet_name)
    for parquet_name in ('myfile_labels.parquet', 'myfile_values.parquet')
)

#print('Starting task (myfile_samples): ', datetime.now())

# this operation is now handled by convert_string_lists_to_lists in a later step
#myfile_samples = myfile_samples['values'].apply(literal_eval)

#print('Finished task (myfile_samples): ', datetime.now())

Creating dataframe task (myfile_samples):  2021-02-09 17:51:49.858011


In [40]:
# Preview the first 50 column labels (the 'type' column) read back from parquet.
myfile_labels.head(50)

Unnamed: 0,type
1,/products/PRODUCT/attributes/attribute/Access ...
2,/products/PRODUCT/sizes/size/packageheight
3,/products/PRODUCT/attributes/attribute/Simple ...
4,/products/PRODUCT/prices/price/country (a)
5,/products/PRODUCT/attributes/attribute/Hat Type
6,/products/PRODUCT/attributes/attribute/Bag Type
7,/products/PRODUCT/attributes/attribute/Gift Co...
8,/products/PRODUCT/attributes/attribute/Store -...
9,/products/PRODUCT/attributes/attribute/Global ...
10,/products/PRODUCT/attributes/attribute/CMS Enh...


In [41]:
# Preview the first 20 rows of 'values'; each cell is a stringified Python list.
myfile_samples.head(20)

Unnamed: 0,values
1,"['true', 'true', 'true', 'true', 'true', 'true..."
2,"[7.7978, 7.7978, 7.8, 8.89, 7.8, 8.763, 7.8, 8..."
3,"['Black', 'Black', 'Black', 'Black', 'Black', ..."
4,"['AT', 'BE', 'CZ', 'DE', 'DK', 'ES', 'FI', 'FR..."
5,"['Adjustable', 'Adjustable', 'Adjustable', 'Ad..."
6,"['Backpack', 'Backpack', 'Backpack', 'Backpack..."
7,"['Just Dropped', 'Just Dropped', 'Just Dropped..."
8,"['FCBarcelona', 'FCBarcelona', 'FCBarcelona', ..."
9,"['Use - Off the pitch', 'Use - Off the pitch',..."
10,"['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Ye..."


In [None]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf

from sherlock import helpers
#from sherlock.features.preprocessing import extract_features, convert_string_lists_to_lists, prepare_feature_extraction
from sherlock.functional import extract_features_to_csv
from sherlock.deploy.predict_sherlock import predict_sherlock

## Extract features
It is important that the string-representations of lists are first converted into lists of strings.
The labels should be a list of semantic types.

### Because feature extraction can take a long time, we only use the first 100 samples.

In [None]:
#y_test_subset = y_test[:100]

In [None]:
%load_ext line_profiler

In [None]:
# ensure embedding initialisation is outside of timing for extract_features
# (the timed extract_features_to_csv call lives in the next cell)
from sherlock.features.word_embeddings import initialise_word_embeddings
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
from sherlock.features.preprocessing import prepare_feature_extraction

# NOTE(review): call order is kept as-is; whether the helpers depend on each
# other is not visible from this notebook — confirm before reordering.
prepare_feature_extraction()
initialise_word_embeddings()
# NOTE(review): 400 is presumably the paragraph-vector dimension — confirm against sherlock.
initialise_pretrained_model(400)
initialise_nltk()

In [None]:
# Wall-clock timing of feature extraction over the first 100 sample columns.
start = datetime.now()

print('processing {}'.format(myfile_samples['values']))

# Line-by-line profiling alternative (requires the line_profiler extension):
#%lprun -m sherlock.features.preprocessing extract_features_to_csv('output2.csv', myfile_samples.head(n=100))
extract_features_to_csv('output2.csv', myfile_samples['values'].head(100))


# Earlier in-memory variant, kept for reference:
#X_test = extract_features('output.csv', test_samples_converted.head(n=100))

end = datetime.now()
x = end - start
print('Extract Features process took {} seconds.'.format(x))

In [None]:
# Baseline (before changes)
# Extract Features process took 0:03:11.954844 seconds.

# Tuning iterations
# Extract Features process took 0:00:13.869183 seconds. (cache Word Embeddings)
# Extract Features process took 0:00:06.143361 seconds. (cache Doc2Vec)
# Extract Features process took 0:00:01.308678 seconds. (improved computation for bag of character features)
# Extract Features process took 0:00:01.259591 seconds. (smaller optimisation tweaks - string cat, removal of double compute of some stats)
# Extract Features process took 0:00:00.650005 seconds. (use arrays not pd.Series for stats, series.str.count is also inefficient compared to loops)
# Extract Features process took 0:00:00.320845 seconds. (replace np stats calcs, unique values calc)

In [None]:
# Read the feature file written by extract_features_to_csv, parsing every column as float32.
X_test = pd.read_csv('output2.csv', dtype=np.float32)

print(X_test)

In [None]:
import pandas as pd
import numpy as np

from sherlock.deploy.predict_sherlock import predict_sherlock
from datetime import datetime
from sklearn.metrics import f1_score, classification_report

In [None]:
# Run the model registered under nn_id 'retrained_sherlock10' on the extracted features.
# NOTE(review): presumably returns one predicted label per row of X_test — confirm.
predicted_labels = predict_sherlock(X_test, nn_id='retrained_sherlock10')

In [None]:
# Dump the raw predictions for a visual check.
print(predicted_labels)

In [None]:
# Hand-written ground truth: column name -> expected semantic type.
# Insertion order matters because y_test is taken from the values in order.
labels = {
    'product_brandGender2': 'gender',
    'product_ageGroup1': 'gender',
    'product_sportPurpose3': 'description',
    'product_sportPurpose4': 'description',
    'product_sportPurpose2': 'description',
    'product_brandCategory4_footwear': 'description',
    'article_brandColourName': 'colour',
    'product_brandCategory1': 'category',
    'article_id': 'code',
    'product_brandGender1': 'gender',
    'product_type': 'code',
    'article_brandSize': 'code',
    'product_ageGroup2': 'gender',
    'product_brandSeason': 'code',
    'product_brandSizeGrid': 'code',
    'product_sportPurpose1': 'description',
    'product_primaryCategoryIdCode': 'code',
    'product_id': 'code',
    'product_brandCategory3': 'category',
    'product_brandCategory2': 'category',
}

y_test = [semantic_type for semantic_type in labels.values()]


In [None]:
# Weighted F1 of the predictions against the hand-written ground truth.
# NOTE(review): y_test holds 20 entries, while features are extracted for up to
# 100 sample columns; f1_score needs both sequences to have the same length and
# order — confirm that y_test actually lines up with predicted_labels.
f1_score(y_test, predicted_labels, average="weighted")