In [1]:
# Enable IPython's autoreload extension so edits to imported modules are picked up live,
# without restarting the kernel.
%load_ext autoreload
# Mode 2 is set here; comments stay on their own lines so they are not read as magic arguments.
%autoreload 2

# Load training set and train Doc2Vec

The script below first downloads the data (roughly 700K samples), then extracts features from the raw data values. <br>
If you want to skip this step, you can follow the steps below the feature extraction to load the preprocessed data, 
retrain Sherlock and generate predictions.

In [2]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf

from sherlock import helpers
from sherlock.features.preprocessing import extract_features, convert_string_lists_to_lists, prepare_feature_extraction
from sherlock.deploy.train_sherlock import train_sherlock
from sherlock.deploy.predict_sherlock import predict_sherlock

## Download data
This will download the raw values and preprocessed files, the corresponding labels as well as a few other supporting files:
- `download_data()` will download 3.6GB of data into the `data/` directory.
- `prepare_feature_extraction()` will download roughly 800 MB of data into the `features/` directory.

In [3]:
# Fetch the raw and preprocessed data; the recorded output shows it landing in ../data/data.zip.
helpers.download_data()
# Fetch the two supporting files reported in the output (GloVe vectors and the
# pre-trained paragraph-vector array) into ../sherlock/features/.
prepare_feature_extraction()

Downloading the raw and preprocessed data into ../data/data.zip.
Data was downloaded.
Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.


## Read in raw data
You can skip this step if you want to use a preprocessed data file.

In [4]:
# Raw training split: the values file first, then its labels file.
train_samples, train_labels = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../data/raw/train_values.parquet',
        '../data/raw/train_labels.parquet',
    )
)

In [5]:
# Raw validation split: the values file first, then its labels file.
validation_samples, validation_labels = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../data/raw/val_values.parquet',
        '../data/raw/val_labels.parquet',
    )
)

In [6]:
# Raw test split: the values file first, then its labels file.
test_samples, test_labels = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../data/raw/test_values.parquet',
        '../data/raw/test_labels.parquet',
    )
)

## Train Doc2Vec

In [7]:
from datetime import datetime
from sherlock.features.paragraph_vectors import tagcol_paragraph_embeddings_features

# Wall-clock start, used only for the elapsed-time report at the end of this cell.
start = datetime.now()

print('Tagging columns')

# `cols` is the input handed to the Doc2Vec training call in the next cell.
# NOTE(review): presumably each training column becomes one tagged document —
# confirm in sherlock.features.paragraph_vectors.
cols = tagcol_paragraph_embeddings_features(train_samples)

# A timedelta formats as H:MM:SS.ffffff, so convert it to a plain number of seconds
# before labelling it "seconds" (previously printed e.g. "0:01:35.506668 seconds").
elapsed_seconds = (datetime.now() - start).total_seconds()
print(f'Tagged Columns Doc2Vec Model, process took {elapsed_seconds:.2f} seconds.')




Tagging columns
Tagged Columns Doc2Vec Model, process took 0:01:35.506668 seconds.


In [8]:
from datetime import datetime
from sherlock.features.paragraph_vectors import train_paragraph_embeddings_features

# Wall-clock start, used only for the elapsed-time report at the end of this cell.
start = datetime.now()

# Dimensionality of the paragraph vectors to train.
# NOTE(review): 400 presumably matches the `par_vec_trained_400` artefact name seen in the
# download step — confirm before changing.
vec_dim = 400
print(f'Training Doc2Vec model in {vec_dim} dimensions')

# The return value is not captured here.
# NOTE(review): the trained model is presumably persisted by the callee — verify.
train_paragraph_embeddings_features(cols, vec_dim)

# A timedelta formats as H:MM:SS.ffffff, so convert it to a plain number of seconds
# before labelling it "seconds" (previously printed e.g. "0:07:51.469076 seconds").
elapsed_seconds = (datetime.now() - start).total_seconds()
print(f'Trained Doc2Vec Model, {vec_dim} dim, process took {elapsed_seconds:.2f} seconds.')

Training Doc2Vec model in 400 dimensions
Trained Doc2Vec Model, 400 dim, process took 0:07:51.469076 seconds.
