# Using Sherlock out-of-the-box
This notebook shows how to predict a semantic type for a given table column.
The steps are basically:
- Download files for word embedding and paragraph vector feature extraction (downloads only once) and initialize feature extraction models.
- Extract features from table columns.
- Initialize Sherlock.
- Make a prediction for the feature representation of the column.

In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa

from sherlock import helpers
from sherlock.deploy.model import SherlockModel
from sherlock.functional import extract_features_to_csv
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
from sherlock.features.preprocessing import (
    extract_features,
    convert_string_lists_to_lists,
    prepare_feature_extraction,
    load_parquet_values,
)
from sherlock.features.word_embeddings import initialise_word_embeddings

In [2]:
# %env PYTHONHASHSEED

## Initialize feature extraction models

In [3]:
prepare_feature_extraction()
initialise_word_embeddings()
initialise_pretrained_model(400)
initialise_nltk()

Preparing feature extraction by downloading 4 files:
        
 ../sherlock/features/glove.6B.50d.txt, 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy,
        
 ../sherlock/features/par_vec_trained_400.pkl.trainables.syn1neg.npy, and 
 ../sherlock/features/par_vec_trained_400.pkl.wv.vectors.npy.
        
All files for extracting word and paragraph embeddings are present.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:04.735138 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:02.510114 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)
Initialised NLTK, process took 0:00:00.140245 seconds.


[nltk_data] Downloading package punkt to /Users/mmargaret/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mmargaret/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Extract features

In [4]:
semantics_df = pd.DataFrame(columns=['id', 'semantics'])
semantics_dict = {} # {'id':,'semantics':}

In [5]:
import os
_ = os.listdir('/Users/mmargaret/Documents/[UVA] Thesis/sherlock-project/data/data_search_e_data_csv/')

In [6]:
file_list = [id for id in _ if '.csv' in id]
len(file_list)

2919

In [7]:
error_list=[]

def extractIDSemanticsWithColumnNames(filename):
    
    IDSemanticsColumns = {'data_filename':filename, 'colSemantics': [], 'colNames':[]}
    try:
        # read files
        a_doc = pd.read_csv('/Users/mmargaret/Documents/[UVA] Thesis/sherlock-project/data/data_search_e_data_csv/' 
                            + filename, engine='python')
        a_doc = a_doc.astype(str) #only non-numeric object to str (sherlock required) = .select_dtypes(include=[object])
        data = pd.Series(a_doc.transpose().values.tolist(),name="values") #format it to list of values by columns

        # sherlock extract features
        extract_features("../temporary.csv",data)
        feature_vectors = pd.read_csv("../temporary.csv", dtype=np.float32)

        # sherlock init and predict with pre-trained model
        model = SherlockModel();
        model.initialize_model_from_json(with_weights=True, model_id="sherlock");
        predicted_labels = model.predict(feature_vectors, "sherlock")

        # f_name = filename.split('.', 1)[0] #extract id from filename

        # return dictionary with id: id of the doc, list of the columns' semantics, list of the columns' names
        IDSemanticsColumns = {'data_filename':filename, 'colSemantics': list(predicted_labels), 'colNames':list(a_doc.columns)}
    
    except:
        print('Unable to extract: {}'.format(filename))
        global error_list
        error_list += [filename]
        
    return IDSemanticsColumns

In [8]:
# TEST function
print(file_list[0])
print(extractIDSemanticsWithColumnNames(file_list[0]))
error_list


d2d297eb8b86aa49d40226d4efdb1729515655656dece7bd80b01c6ddabb2b83.text.csv
Unable to extract: d2d297eb8b86aa49d40226d4efdb1729515655656dece7bd80b01c6ddabb2b83.text.csv
{'data_filename': 'd2d297eb8b86aa49d40226d4efdb1729515655656dece7bd80b01c6ddabb2b83.text.csv', 'colSemantics': [], 'colNames': []}


['d2d297eb8b86aa49d40226d4efdb1729515655656dece7bd80b01c6ddabb2b83.text.csv']

In [None]:
enrich_list = []

for i in range(0, len(file_list)):
    
    enrich_list += [extractIDSemanticsWithColumnNames(file_list[i])]
    if i%25==0:
        pd.DataFrame(enrich_list, 
             columns=['data_filename', 'colSemantics', 'colNames']).to_csv('/Users/mmargaret/Documents/[UVA] Thesis/sherlock-project/output/' 
                                                                            +'enriched_part.csv', index=False)
error_list

Extracting Features:   0%|                               | 0/25 [00:00<?, ?it/s]

Unable to extract: d2d297eb8b86aa49d40226d4efdb1729515655656dece7bd80b01c6ddabb2b83.text.csv


Extracting Features:  12%|██▊                    | 3/25 [00:00<00:04,  4.53it/s]

Exporting 1588 column features


Extracting Features: 100%|██████████████████████| 25/25 [00:01<00:00, 15.83it/s]
W0531 19:43:19.632974 4553154048 deprecation.py:506] From /Users/mmargaret/opt/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0531 19:43:19.634391 4553154048 deprecation.py:506] From /Users/mmargaret/opt/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0531 19:43:19.637321 4553154048 deprecation.py:506] From /Users/mmargaret/opt/anaconda3/e

Unable to extract: 30582267f36c39a6ff33b0e38787eaa72f9ad84192498830816d3d2bf5b2e73b.text.csv
Exporting 1588 column features



Extracting Features: 100%|███████████████████████| 6/6 [00:00<00:00, 357.14it/s]

Unable to extract: 70abcec885ec57461c9197faad068e8c9d8b838e979b1d0e7c41aed6585145d4.text.csv
Unable to extract: 29c42fa751d277febbd914e65741513f63166637f9f18f824f685b4b2183dd21.text.csv
Exporting 1588 column features



Extracting Features:   0%|                                | 0/9 [00:00<?, ?it/s]

Unable to extract: 7230b684e09f6cf9bbf14a0d3c516220fdf10cce42133e52b84d4917b6c90722.text.csv


Extracting Features:  11%|██▋                     | 1/9 [00:00<00:03,  2.38it/s]

Exporting 1588 column features


Extracting Features: 100%|████████████████████████| 9/9 [00:02<00:00,  3.74it/s]
Extracting Features: 100%|███████████████████████| 6/6 [00:00<00:00, 125.52it/s]

Unable to extract: 22c965441bffe87f7764917d54bb9f75c11597e941e5acc7bd528f550b8dfdfe.text.csv
Exporting 1588 column features



Extracting Features:   0%|                               | 0/69 [00:00<?, ?it/s]

Unable to extract: 2bb5fbd450b4ab1a98faf40c63db6b40173fbdc7fd5d5186b4e9a33c257ed47f.text.csv


Extracting Features:   6%|█▎                     | 4/69 [00:00<00:13,  4.95it/s]

Exporting 1588 column features


Extracting Features:   9%|██                     | 6/69 [00:00<00:06,  9.68it/s]
Extracting Features: 100%|█████████████████████| 14/14 [00:00<00:00, 363.37it/s]


Unable to extract: 442a711c91559f460c4e073aee07cb410da7fb23c992a3115b8f4024d217887c.text.csv
Exporting 1588 column features


Extracting Features: 100%|███████████████████████| 7/7 [00:00<00:00, 213.11it/s]

Unable to extract: a96c688fdcf075de565d3b14cb53f63dddc87b040adc4dcbcb95154d7eba2130.text.csv
Exporting 1588 column features



Extracting Features: 100%|█████████████████████| 33/33 [00:00<00:00, 209.94it/s]

Unable to extract: c2f12902a5a503da355be2f9945e43593d7d32ef8377f2282d872131688430e0.text.csv
Exporting 1588 column features





Unable to extract: d5f4bce119b1659e0945cae61941030e4b74ab013a48be43bd3db00ae365b00b.text.csv


Extracting Features:   7%|█▋                     | 1/14 [00:00<00:03,  3.68it/s]

Exporting 1588 column features


Extracting Features:  29%|██████▌                | 4/14 [00:00<00:01,  6.26it/s]
Extracting Features:   0%|                                | 0/9 [00:00<?, ?it/s]

Unable to extract: d36f2ae13705ee30fb25ac9c8d3a6dae6166eefe16535e2b0cc3d829822047a4.text.csv
Exporting 1588 column features


Extracting Features:  56%|█████████████▎          | 5/9 [00:00<00:00, 10.81it/s]
Extracting Features:   0%|                                | 0/4 [00:00<?, ?it/s]


Unable to extract: 0993f49c9f27e3a917c9832f17e481b404a8375b753c14d9d203f05d3251e172.text.csv
Unable to extract: ac8c5934ebfc1e76cce230bbfaa5699b26bdcea174e903d9124ac31f372984ee.text.csv


Extracting Features: 100%|█████████████████████| 14/14 [00:00<00:00, 369.88it/s]
Extracting Features:   0%|                               | 0/12 [00:00<?, ?it/s]

Exporting 1588 column features
Unable to extract: 7a243b2d586bb94b73d5f3f24ea816f29aeb9d2280432593cb8ac8fec390a5ae.text.csv


Extracting Features:   0%|                               | 0/12 [00:00<?, ?it/s]
Extracting Features:   0%|                               | 0/35 [00:00<?, ?it/s]


Unable to extract: 0c816e0b45afc00a4f66feef9442171e811635c7fed1c4d2c284b7b0518dc11c.text.csv
Unable to extract: 4718778a55e011dc9290b87f01515fc5818f67cbb7a79d209f27c742cfd99ccc.text.csv


Extracting Features: 100%|████████████████████████| 1/1 [00:00<00:00,  7.89it/s]


Exporting 1588 column features
Unable to extract: 6ae4eafa06f6e67843b0ff09cc5b4ca62ad61d2b1dc0feb5ceb5268a04f20680.text.csv


Extracting Features: 100%|███████████████████████| 3/3 [00:00<00:00, 178.88it/s]

Unable to extract: 8f6314940d335741ff381c5eea2a50285f7576a5ae47f3d3722a774bc89ad2a7.text.csv
Exporting 1588 column features



Extracting Features: 100%|████████████████████████| 4/4 [00:00<00:00, 86.19it/s]

Unable to extract: 5d3f6b1e8b6f248c6d7a6c88c1938d64e40d46260be5ff4611ee86e235d892a8.text.csv
Exporting 1588 column features



Extracting Features: 100%|███████████████████████| 3/3 [00:00<00:00, 122.81it/s]

Unable to extract: e7e51494eb42ae9824a22678e1c3b3595e243780fcb160b4c4e2eda12874d842.text.csv
Exporting 1588 column features



Extracting Features:   0%|                               | 0/96 [00:00<?, ?it/s]

Unable to extract: fb63780cc9727b9fc7d5feb604c5d321741025f9fede1d455d489bca2eaed7b8.text.csv


Extracting Features:  20%|████▎                 | 19/96 [00:01<01:18,  1.01s/it]

Exporting 1588 column features


Extracting Features: 100%|██████████████████████| 96/96 [00:01<00:00, 49.34it/s]
Extracting Features: 100%|███████████████████████| 9/9 [00:00<00:00, 240.26it/s]

Unable to extract: 5820225f00d0233715e1149f961458563e1ef95539e1eb65cab484b79c44626f.text.csv
Exporting 1588 column features



Extracting Features: 100%|█████████████████████| 10/10 [00:00<00:00, 214.20it/s]

Unable to extract: 9af8b03063a09d97600f173c8f3f61dfed3d4c768b80595ced6cc9ed672688fb.text.csv
Exporting 1588 column features



Extracting Features:   0%|                                | 0/4 [00:00<?, ?it/s]

Unable to extract: 9b26dfe822f979ac40f1eb0900aa5b1763fe14bd984a5f0f17142f92a97328f2.text.csv


Extracting Features: 100%|████████████████████████| 4/4 [00:00<00:00, 10.91it/s]

Exporting 1588 column features





Unable to extract: e6da782dc62ae774e81c6552db328cab163adeb5a510d71d1cbed9ea046421c7.text.csv
Unable to extract: 7288270f213af2d7b4b7fc3effe7b3bafd21f1f587f64aeaeee3563228c647e8.text.csv


Extracting Features:  57%|█████████████▋          | 4/7 [00:00<00:00, 17.42it/s]

Exporting 1588 column features


Extracting Features: 100%|████████████████████████| 7/7 [00:00<00:00, 18.60it/s]
Extracting Features: 100%|███████████████████████| 2/2 [00:00<00:00, 322.59it/s]

Unable to extract: caa36bcf19b4e25b6d531c0257208d25284876eb360ac0c27110740810d3adb1.text.csv
Exporting 1588 column features



Extracting Features:   0%|                               | 0/10 [00:00<?, ?it/s]

Unable to extract: 1abc8fda3218c9789a4bc6de184f87f1007ddae93ae1e9294799095200738ddd.text.csv
Exporting 1588 column features


Extracting Features: 100%|██████████████████████| 10/10 [00:00<00:00, 15.10it/s]
Extracting Features: 100%|█████████████████████| 14/14 [00:00<00:00, 161.74it/s]

Unable to extract: 63a10b25f1c635bb7f7b53e63c4cd034f79a56fb9a628d9098a6d376f3197f3e.text.csv
Exporting 1588 column features





Unable to extract: fb1ed87ba1640875330d205ab36ca74174b49dca59cdadf6726f5209eef7f10d.text.csv


Extracting Features:  50%|████████████            | 2/4 [00:00<00:00, 17.31it/s]

Unable to extract: 23f37ac7680748dc5ad81b9eb4b927caa399cd7b81c7eeb707f11ecc7afb618a.text.csv
Exporting 1588 column features
Unable to extract: 206fc590dddc4133d8d9303550a77dee56c6d97fda35d3fb7b779f59d1c7386b.text.csv



Extracting Features: 100%|███████████████████████| 7/7 [00:00<00:00, 164.25it/s]

Unable to extract: b24e9f72b71288b13a1aed52af308ae200f862d51dd4f98cdf5defebb3f3e8f9.text.csv
Exporting 1588 column features
Unable to extract: 811df87aaeb5bd45879cc11f440a02eed01d4bd9777e5798ebb7d432e5ac622c.text.csv



Extracting Features:   0%|                               | 0/25 [00:00<?, ?it/s]

Unable to extract: 6ea32ea04581a4f5ed882bfd3a5f7978933768a56c3105c9a6cf70007f10cf4a.text.csv
Unable to extract: 82015e9c70396f9542b02c0e3535237147d9b22b295d62577cbdee4f04676e5b.text.csv



Extracting Features:  10%|██▎                    | 3/30 [00:00<00:02, 10.16it/s]

Exporting 1588 column features


Extracting Features: 100%|██████████████████████| 30/30 [00:01<00:00, 21.18it/s]
Extracting Features: 100%|███████████████████████| 7/7 [00:00<00:00, 396.83it/s]

Unable to extract: 92269957851ee13ea06fcd1866bd8e3810dec64dabb72c1b76b04a00227e5bdf.text.csv
Exporting 1588 column features





Unable to extract: 046c3561b610ab29de866f59abb248ecf1a32fc190647c5d456784bd06d36018.text.csv


Extracting Features:   0%|                               | 0/20 [00:00<?, ?it/s]

Exporting 1588 column features


Extracting Features:   5%|█▏                     | 1/20 [00:00<00:05,  3.69it/s]
Extracting Features:  25%|█████▍                | 13/53 [00:00<00:00, 86.09it/s]
Extracting Features:   0%|                                | 0/8 [00:00<?, ?it/s]

Unable to extract: 8e6bc5f33b07fb32f22124d9dcdd146ad03010e17a22bc0299eec7f9cfee64d1.text.csv
Exporting 1588 column features
Unable to extract: 9fc89f77d3e83a03444625bd36002db2fc72fcceed7c861f8f3727a0c8688683.text.csv
Exporting 1588 column features


Extracting Features: 100%|███████████████████████| 8/8 [00:00<00:00, 148.65it/s]
Extracting Features: 100%|██████████████████████| 10/10 [00:00<00:00, 54.51it/s]

Unable to extract: ef2cb21c95c405f20744a83e13bac00cccc72360a80b384c1f2f36d3efdbb7ee.text.csv
Exporting 1588 column features



Extracting Features: 100%|███████████████████████| 3/3 [00:00<00:00, 382.08it/s]

Unable to extract: a53cb95133c6dc5b80321fd5357a2cd9541a926f2e2afc491cc3ae4d817d218b.text.csv
Exporting 1588 column features



Extracting Features: 100%|███████████████████████| 2/2 [00:00<00:00, 361.59it/s]

Unable to extract: 4d0c44d79da8fad3ddb056b6cba07a95fa1f98fa7d0bc57c69fb264b7f4aa03a.text.csv
Exporting 1588 column features





Unable to extract: 491166bcbe9f3c4a759b10f2959cc83339f3b4934229375bad39c2d2a927814e.text.csv


Extracting Features:   8%|█▊                     | 3/37 [00:00<00:01, 24.96it/s]

Exporting 1588 column features


Extracting Features: 100%|██████████████████████| 37/37 [00:02<00:00, 16.60it/s]
Extracting Features:   0%|                               | 0/30 [00:00<?, ?it/s]

Unable to extract: b21ee5232a742fed0e6c86562df0de855f33a1eb91b0d423721f5680bb8b19d9.text.csv


Extracting Features:   7%|█▌                     | 2/30 [00:00<00:02, 11.38it/s]


Exporting 1588 column features
Unable to extract: b6ac31c0d03c198c7daa8749554cbd61569e59410c43d94c56dd4e5e4f827768.text.csv


Extracting Features:   5%|█▏                     | 2/37 [00:00<00:02, 14.41it/s]

Exporting 1588 column features


Extracting Features:  41%|████████▉             | 15/37 [00:01<00:02, 10.77it/s]
Extracting Features: 100%|█████████████████████| 14/14 [00:00<00:00, 387.24it/s]


Unable to extract: 028aff3fcd2f9e431b8ece923abae0cc883c84615ac56bd0865798e14f9ed67c.text.csv
Exporting 1588 column features
Unable to extract: b27db5b9a9d0705c6cd31418f15095d9f27669f431b8b18748b4daab99d3204b.text.csv


Extracting Features:  22%|█████▎                  | 2/9 [00:00<00:00, 14.09it/s]

Exporting 1588 column features


Extracting Features: 100%|████████████████████████| 9/9 [00:00<00:00, 13.42it/s]
Extracting Features:   0%|                               | 0/12 [00:00<?, ?it/s]

Unable to extract: 2d243ad457a1814de7c7c1556b7e91fac080988ee92f816f9d52deb7d6e555af.text.csv
Exporting 1588 column features


Extracting Features: 100%|██████████████████████| 12/12 [00:00<00:00, 16.19it/s]
Extracting Features: 100%|███████████████████████| 3/3 [00:00<00:00, 138.20it/s]

Unable to extract: 33a29a7736867b93e5d6b53ff13d0ecbea274f42faeeca1c40a4d3c3ca5daf50.text.csv
Unable to extract: 131996a5027d1809789f1b20749c40d158883a4d4f242e568fbc1aa9ae2deffe.text.csv
Exporting 1588 column features



Extracting Features:  14%|███▎                   | 3/21 [00:00<00:00, 21.95it/s]

Unable to extract: eb93a7285703e877412418ea5108d3bc8aebd085c16cdecde9d30ef04b513286.text.csv
Exporting 1588 column features


Extracting Features: 100%|██████████████████████| 21/21 [00:00<00:00, 24.82it/s]


In [None]:
pd.DataFrame(enrich_list, 
             columns=['data_filename', 'colSemantics', 'colNames']).to_csv('/Users/mmargaret/Documents/[UVA] Thesis/sherlock-project/output/' 
                                                                            +'enriched_all.csv', index=False)


In [None]:
error_list

# TEST in interactive mode

In [9]:
filename = '8b73c4243284ca0c84c45401f2e5a008d65f291466f4661483b432741c0a1836.text.csv'
a_doc = pd.read_csv('/Users/mmargaret/Documents/[UVA] Thesis/sherlock-project/data/' + filename)

a_doc = a_doc.select_dtypes(include=[object]).astype(str) #
a_doc.columns


  import pandas.util.testing as tm


FileNotFoundError: [Errno 2] No such file or directory: '/Users/mmargaret/Documents/[UVA] Thesis/sherlock-project/data/8b73c4243284ca0c84c45401f2e5a008d65f291466f4661483b432741c0a1836.text.csv'

In [4]:
data = pd.Series(
    a_doc.transpose().values.tolist(),
    name="values"
)
data

0    [FFSU, FFSU, FFSU, FFSU, FFSU, FFSU, FFSU, FFS...
1    [XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, X...
2    [PEDIALYTE, PEDIALYTE, ALPRAZOLAM, PRENATAL P,...
3    [4/1, 1/1, 4/1, 4/1, 4/1, 1/1, 1/1, 1/1, 4/1, ...
4    [04/01/2004, 01/01/2004, 04/01/2004, 04/01/200...
5    [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...
Name: values, dtype: object

In [5]:
extract_features(
    "../temporary.csv",
    data
)
feature_vectors = pd.read_csv("../temporary.csv", dtype=np.float32)

Extracting Features:   0%|                                | 0/6 [00:00<?, ?it/s]

Exporting 1588 column features


Extracting Features: 100%|████████████████████████| 6/6 [00:00<00:00, 11.88it/s]


In [6]:
feature_vectors

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,9.5e-05,-0.00092,-0.000721,0.001115,0.000341,0.000725,-0.001038,0.001035,3e-05,-0.000311
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.775455,0.14549,-0.11959,0.208576,-0.846167,-0.376466,-0.025163,-0.637691,0.087774,-0.480676
2,1.0,0.0,0.055,0.071975,0.0,3.0,0.0,55.0,36.710018,5.642683,...,0.546457,-0.071316,0.129075,0.110144,-0.619416,0.449996,0.387853,-0.1233,0.046871,-0.534526
3,1.0,0.0,0.247,0.185991,0.0,1.0,0.0,247.0,-0.623396,1.173288,...,-0.138906,-0.13929,0.016313,0.048243,-0.651826,-0.131029,-0.396145,-0.2497,0.07434,-0.806943
4,1.0,1.0,4.0,0.0,4.0,4.0,4.0,4000.0,-3.0,0.0,...,-0.007073,0.496197,-0.430627,0.539399,0.108929,-0.435804,-0.341008,0.529492,-0.56738,-0.342077
5,1.0,0.0,0.779,0.722159,0.0,3.0,1.0,779.0,-0.049952,0.858064,...,-0.153812,0.345746,-0.562288,-0.068926,-0.093484,-0.301105,-0.2979,0.168808,-0.18655,-0.4827


## Initialize Sherlock

In [7]:
model = SherlockModel();
model.initialize_model_from_json(with_weights=True, model_id="sherlock");

W0429 09:13:51.863543 4681580032 deprecation.py:506] From /Users/mmargaret/opt/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0429 09:13:51.864383 4681580032 deprecation.py:506] From /Users/mmargaret/opt/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0429 09:13:51.867066 4681580032 deprecation.py:506] From /Users/mmargaret/opt/anaconda3/envs/py37/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: c

## Predict semantic type for column

In [8]:
predicted_labels = model.predict(feature_vectors, "sherlock")

In [9]:
predicted_labels #predicted columns' name

array(['address', 'state', 'brand', 'country', 'address', 'age'],
      dtype=object)

In [10]:
a_doc.columns #original columns' name

Index(['Utilization Type', 'State', 'Product Name', 'Quarter Begin',
       'Quarter Begin Date', 'Location'],
      dtype='object')

In [11]:
data.iloc[5]

['nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
