In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the sherlock package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# If you need fully deterministic results between runs, set the PYTHONHASHSEED environment
# variable prior to launching Jupyter.
# See the comment in sherlock.features.paragraph_vectors.infer_paragraph_embeddings_features for more info.
# NOTE: the line below does not set anything; `%env NAME` without a value only displays the
# current value, so the seed used for this run is recorded in the cell output.
%env PYTHONHASHSEED

'13'

# Load training set and train Doc2Vec

In [7]:
import multiprocessing as mp
import sys

from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from pyarrow.parquet import ParquetFile

from sherlock import helpers
from sherlock.features.paragraph_vectors import (
    initialise_nltk,
    tagcol_paragraph_embeddings_features,
    train_paragraph_embeddings_features
)
from sherlock.features.preprocessing import convert_string_lists_to_lists
from sherlock.functional import extract_features_to_csv

In [4]:
# Log the wall-clock time at which this run began; the last cell prints the matching
# 'Finished at' timestamp so the total duration can be read off the outputs.
print(f'Started at {datetime.now()}')

Started at 2022-02-09 18:32:07.924777


## Download and read in raw data

You can skip this step if you want to use a preprocessed data file.

In [6]:
# Fetch the raw and preprocessed data. Per the recorded output, this downloads an
# archive of about 3.6 GiB to ../data/data.zip and unzips it.
# NOTE(review): whether an existing download is reused is not visible here — confirm
# in sherlock.helpers before re-running.
helpers.download_data()

Downloading the raw and preprocessed data into ../data/data.zip.
Downloading data directory.
Downloading 1-g0zbKFAXz7zKZc0Dnh74uDBpZCv4YqU into ../data/data.zip... 
3.6 GiB iB                                                                                                                                                                            Done.
Unzipping...Done.
Data was downloaded.


In [9]:
# Load the raw training columns and their labels from the downloaded data directory.
# NOTE(review): later cells pair these two frames row by row, so they are assumed to
# have the same number of rows in the same order — confirm against the data files.
train_samples = pd.read_parquet('../data/data/raw/train_values.parquet')
train_labels = pd.read_parquet('../data/data/raw/train_labels.parquet')

In [10]:
# Convert the "values" column of the samples (presumably string-encoded lists, going by
# the helper's name — confirm) and pull the labels from the "type" column.
# The recorded progress bar shows one item per training row (412059 in total).
# NOTE(review): y_train is not used by any cell below; the labels used for tagging are
# re-read from train_labels instead.
train_samples_converted, y_train = convert_string_lists_to_lists(train_samples, train_labels, "values", "type")

100%|██████████| 412059/412059 [00:57<00:00, 7108.44it/s] 


## Train Doc2Vec

In [11]:
# Prepare NLTK before tagging/training. Per the recorded output, this downloads the
# 'punkt' and 'stopwords' packages into the user's nltk_data directory when needed.
initialise_nltk()

Initialised NLTK, process took 0:00:00.343083 seconds.


[nltk_data] Downloading package punkt to /Users/madelon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/madelon/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
# Build the (samples, labels) pair that is handed to the Doc2Vec tagging step.
#
# Samples with a missing value list are dropped. The SAME rows must be dropped from
# the labels, otherwise every label after the first dropped row would be paired with
# the wrong sample. A single boolean mask is therefore applied to both.
# Assumes train_labels holds exactly one label per row of train_samples_converted,
# in the same order (checked below by length only).
has_values = train_samples_converted.notna().to_numpy()

all_labels = train_labels.values.flatten()
assert len(all_labels) == len(has_values), (
    f'Expected one label per sample, got {len(all_labels)} labels '
    f'for {len(has_values)} samples'
)

# Equivalent to train_samples_converted.dropna(), but keeps the mask for the labels.
samples = train_samples_converted[has_values]
print(f'Samples: {type(samples)}, length={len(samples)}')

labels = all_labels[has_values]
print(f'Labels:  {type(labels)}, length={len(labels)}')

Samples: <class 'pandas.core.series.Series'>, length=412059
Labels:  <class 'numpy.ndarray'>, length=412059


In [13]:
# Tag each sample with its label to build the Doc2Vec training input, and report
# how long the tagging took.
tagging_started_at = datetime.now()
print('Tagging columns')

# `cols` is consumed by the training cell that follows.
cols = tagcol_paragraph_embeddings_features(samples, labels)

tagging_elapsed = datetime.now() - tagging_started_at
print(f'Tagged Columns Doc2Vec Model, process took {tagging_elapsed} seconds.')

Tagging columns
Tagged Columns Doc2Vec Model, process took 0:03:44.321182 seconds.


In [14]:
# Train the Doc2Vec model on the tagged columns and report the elapsed time.
training_started_at = datetime.now()

# Dimensionality of the paragraph vectors to train.
vec_dim = 400
print(f'Training Doc2Vec model in {vec_dim} dimensions')

# The return value is not captured here.
# NOTE(review): presumably the helper persists the trained model itself — confirm
# in sherlock.features.paragraph_vectors.
train_paragraph_embeddings_features(cols, vec_dim)

training_elapsed = datetime.now() - training_started_at
print(f'Trained Doc2Vec Model, {vec_dim} dim, process took {training_elapsed} seconds.')

Training Doc2Vec model in 400 dimensions
Trained Doc2Vec Model, 400 dim, process took 0:22:22.303905 seconds.


In [15]:
# Log the wall-clock time at which the run completed (counterpart of the
# 'Started at' line printed near the top of the notebook).
print(f'Finished at {datetime.now()}')

Finished at 2022-02-09 19:38:41.865764
