# Using Sherlock out-of-the-box
This notebook shows how to predict a semantic type for a given table column.
The steps are basically:
- Download files for word embedding and paragraph vector feature extraction (downloads only once) and initialize feature extraction models.
- Extract features from table columns.
- Initialize Sherlock.
- Make a prediction for the feature representation of the column.

In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa

from sherlock import helpers
from sherlock.deploy.model import SherlockModel
from sherlock.functional import extract_features_to_csv
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
from sherlock.features.preprocessing import (
    extract_features,
    convert_string_lists_to_lists,
    prepare_feature_extraction,
    load_parquet_values,
)
from sherlock.features.word_embeddings import initialise_word_embeddings

In [2]:
%env PYTHONHASHSEED

UsageError: Environment does not have key: PYTHONHASHSEED


## Initialize feature extraction models

In [2]:
prepare_feature_extraction()
initialise_word_embeddings()
initialise_pretrained_model(400)
initialise_nltk()

Preparing feature extraction by downloading 4 files:
        
 ../sherlock/features/glove.6B.50d.txt, 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy,
        
 ../sherlock/features/par_vec_trained_400.pkl.trainables.syn1neg.npy, and 
 ../sherlock/features/par_vec_trained_400.pkl.wv.vectors.npy.
        
All files for extracting word and paragraph embeddings are present.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:04.853532 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:03.666802 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)
Initialised NLTK, process took 0:00:00.176746 seconds.


[nltk_data] Downloading package punkt to /Users/mmargaret/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mmargaret/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Extract features

In [52]:
# import os
# os.chdir('/Users/mmargaret/Documents/[UVA] Thesis/sherlock-project/')
filename = '8b73c4243284ca0c84c45401f2e5a008d65f291466f4661483b432741c0a1836.text.csv'
a_doc = pd.read_csv('/Users/mmargaret/Documents/[UVA] Thesis/sherlock-project/data/' + filename)

a_doc = a_doc.select_dtypes(include=[object]).astype(str)#
a_doc.columns


Index(['Company Name', 'DBA Name', 'Address', 'Address 2', 'City', 'State',
       'Zip Plus 4', 'Phone', 'Product Category Name'],
      dtype='object')

In [43]:
data = pd.Series(
    a_doc.transpose().values.tolist(),
    name="values"
)
data

0     [areasymbol, KS129, KS129, KS129, KS129, KS129...
1     [CoFIPS, 129, 129, 129, 129, 129, 129, 129, 12...
2     [mukey, 1382580, 1382580, 1382580, 1382580, 13...
3     [musym, 1510, 1510, 1510, 1510, 5110, 5110, 51...
4     [muname, Atchison clay loam, 3 to 6 percent sl...
5     [muacres, 15733, 15733, 15733, 15733, 62, 62, ...
6     [comppct_r, 90, 90, 90, 90, 70, 70, 70, 70, 85...
7     [tfact, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,...
8     [nirrcapcl, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3...
9     [nirrcapscl, e, e, e, e, e, e, e, e, e, e, e, ...
10    [irrcapcl, nan, nan, nan, nan, nan, nan, nan, ...
11    [irrcapscl, nan, nan, nan, nan, nan, nan, nan,...
12    [farmlndcl, Farmland of statewide importance, ...
13    [awc_r, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0....
14    [texdesc, loam, loam, loam, loam, loam, loam, ...
15    [texture, L, L, L, L, L, L, L, L, L, L, L, L, ...
16    [rotation, CG, CG, CG, CG, CG, CG, CG, CG, CG,...
17    [yield1, 154, 154, 154, 154, 154, 154, 154

In [44]:
extract_features(
    "../temporary.csv",
    data
)
feature_vectors = pd.read_csv("../temporary.csv", dtype=np.float32)

Extracting Features:   5%|█▏                     | 2/37 [00:00<00:02, 13.92it/s]

Exporting 1588 column features


Extracting Features: 100%|██████████████████████| 37/37 [00:02<00:00, 13.23it/s]


In [45]:
feature_vectors

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,1.0,0.0,0.753,0.247991,0.0,2.0,1.0,753.0,-0.286022,-0.390048,...,0.000657,-0.001067,-0.000278,-0.000963,-0.000641,0.000541,-0.000791,0.000574,0.001133,-0.000797
1,1.0,0.0,0.107,0.095551,0.0,1.0,0.0,107.0,4.465615,2.542758,...,0.434903,-1.02234,0.059404,0.029287,-0.591621,-0.638193,-0.325639,-0.118138,-0.271845,-0.867509
2,1.0,0.0,0.542,0.444236,0.0,3.0,0.0,542.0,0.845878,1.062023,...,-0.298329,-0.220422,0.24014,0.121958,-0.184693,-0.187081,-0.106106,0.032356,0.021296,-0.482763
3,1.0,0.0,0.108,0.100336,0.0,2.0,0.0,108.0,6.097766,2.713191,...,0.118439,-0.080154,0.346148,0.881143,0.119823,0.155681,-0.292617,0.792604,-0.129798,-0.537706
4,1.0,0.0,0.48,0.3616,0.0,2.0,0.0,480.0,-0.274219,0.849442,...,-0.215919,0.804344,-0.269608,0.396292,-0.905756,-0.491269,1.04861,0.360008,0.734587,-1.08117
5,1.0,0.0,0.338,0.301756,0.0,3.0,0.0,338.0,1.529376,1.444273,...,-0.553052,-0.208752,-0.304484,-0.262684,-0.698817,-0.459331,-0.00836,0.678788,-0.07962,-1.22858
6,1.0,0.0,0.659,0.480719,0.0,2.0,1.0,659.0,-0.802462,0.571337,...,-0.358707,0.132286,-0.537368,0.446345,-0.56991,-0.031369,-0.467609,-0.361079,-0.131662,-0.871796
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.001093,-0.00019,-0.000603,0.000873,-0.001167,0.001147,-0.000362,-0.000358,-0.001209,-0.000787
8,1.0,0.0,0.469,0.249039,0.0,1.0,0.0,469.0,-1.984565,0.124239,...,-0.412535,-0.062195,-0.606487,0.052042,-0.188003,0.207125,-0.172715,-0.543046,0.18222,-0.744869
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.032445,0.007965,-0.0284,-0.035175,-0.051668,-0.029826,-0.000234,-0.042962,0.029952,-0.047564


## Initialize Sherlock

In [46]:
model = SherlockModel();
model.initialize_model_from_json(with_weights=True, model_id="sherlock");

## Predict semantic type for column

In [47]:
predicted_labels = model.predict(feature_vectors, "sherlock")

In [48]:
predicted_labels #predicted columns' name

array(['address', 'rank', 'address', 'symbol', 'location', 'rank', 'age',
       'day', 'age', 'name', 'weight', 'region', 'description', 'weight',
       'classification', 'class', 'organisation', 'age', 'status', 'age',
       'state', 'sales', 'weight', 'duration', 'weight', 'weight',
       'sales', 'state', 'class', 'category', 'sex', 'origin', 'person',
       'state', 'person', 'region', 'region'], dtype=object)

In [49]:
a_doc.columns #original columns' name

Index(['CMZ 15 2011 Yields', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3',
       'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8',
       'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12',
       'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16',
       'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20',
       'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24',
       'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28',
       'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32',
       'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36'],
      dtype='object')

In [11]:
data.iloc[5]

['nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
