# Embedder

## Setup

In [15]:
import torch
import os
import subprocess
import tensorflow as tf

In [16]:
torch.cuda.is_available()

True

In [17]:
if not os.path.exists("datasets/DATASET-NAME"):
    print("Starting download script")
    output = subprocess.check_output(["python", "download_datasets.py"])
    print(output.decode())

Starting download script
Downloading datasets


---
## Dataset download

In [18]:
import tensorflow_datasets as tfds

In [19]:
[i for i in tfds.list_builders() if 'wiki' in i]

['paws_wiki',
 'paws_x_wiki',
 'salient_span_wikipedia',
 'wiki40b',
 'wiki_auto',
 'wiki_bio',
 'wiki_dialog',
 'wiki_table_questions',
 'wiki_table_text',
 'wikiann',
 'wikihow',
 'wikipedia',
 'wikipedia_toxicity_subtypes']

Dataset of choice:
https://www.tensorflow.org/datasets/catalog/wiki_auto

In [20]:
builder = tfds.builder('wiki_auto')

In [21]:
# 1. Create the tfrecord files (no-op if already exists)
builder.download_and_prepare(download_dir="datasets/wiki_auto")

In [22]:
# 2. Load the `tf.data.Dataset`
dataset = builder.as_dataset()
dataset

{'dev': <_PrefetchDataset element_spec={'GLEU-score': TensorSpec(shape=(), dtype=tf.float64, name=None), 'alignment_label': TensorSpec(shape=(), dtype=tf.int64, name=None), 'normal_sentence': TensorSpec(shape=(), dtype=tf.string, name=None), 'normal_sentence_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'simple_sentence': TensorSpec(shape=(), dtype=tf.string, name=None), 'simple_sentence_id': TensorSpec(shape=(), dtype=tf.string, name=None)}>,
 'test': <_PrefetchDataset element_spec={'GLEU-score': TensorSpec(shape=(), dtype=tf.float64, name=None), 'alignment_label': TensorSpec(shape=(), dtype=tf.int64, name=None), 'normal_sentence': TensorSpec(shape=(), dtype=tf.string, name=None), 'normal_sentence_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'simple_sentence': TensorSpec(shape=(), dtype=tf.string, name=None), 'simple_sentence_id': TensorSpec(shape=(), dtype=tf.string, name=None)}>}

In [23]:
devset = dataset['dev']
testset = dataset['test']
devset

<_PrefetchDataset element_spec={'GLEU-score': TensorSpec(shape=(), dtype=tf.float64, name=None), 'alignment_label': TensorSpec(shape=(), dtype=tf.int64, name=None), 'normal_sentence': TensorSpec(shape=(), dtype=tf.string, name=None), 'normal_sentence_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'simple_sentence': TensorSpec(shape=(), dtype=tf.string, name=None), 'simple_sentence_id': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [24]:
examples = devset.take(10)  # Take a few examples
examples

<_TakeDataset element_spec={'GLEU-score': TensorSpec(shape=(), dtype=tf.float64, name=None), 'alignment_label': TensorSpec(shape=(), dtype=tf.int64, name=None), 'normal_sentence': TensorSpec(shape=(), dtype=tf.string, name=None), 'normal_sentence_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'simple_sentence': TensorSpec(shape=(), dtype=tf.string, name=None), 'simple_sentence_id': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [25]:
for e in examples:
    print("\t", e['normal_sentence'].numpy())

	 b'For example, in the petroleum industry, sodium hydroxide is used as an additive in drilling mud to increase alkalinity in bentonite mud systems, to increase the mud viscosity, and to neutralize any acid gas (such as hydrogen sulfide and carbon dioxide) which may be encountered in the geological formation as drilling progresses.'
	 b'Emer O\'Sullivan, in her "Comparative Children\'s Literature", notes "The Hobbit" as one of a handful of children\'s books that have been accepted into mainstream literature, alongside Jostein Gaarder\'s "Sophie\'s World" (1991) and J. K. Rowling\'s "Harry Potter" series (1997\xe2\x80\x932007).'
	 b'When certain liquids needs to be purified, siphoning can help prevent either the bottom (dregs) or the top (foam and floaties) from being transferred out of one container into a new container.'
	 b'Originally this world was self-contained, but as Tolkien began work on "The Lord of the Rings", he decided these stories could fit into the legendarium he had bee

---
## Dataset preparation

In [26]:
devset_iter = devset.as_numpy_iterator()
l = 0
for i in devset_iter:
    l += 1
l

73249

In [27]:
testset_iter = testset.as_numpy_iterator()
l = 0
for i in testset_iter:
    l += 1
l

118074

In [28]:
devset_iter.next()

StopIteration: 

In [None]:
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
wc_normal = []
wc_simple = []
for e in devset.as_numpy_iterator():
    sentence = str(e['normal_sentence'])
    wc = len(re.findall(r'\w+', sentence))
    wc_normal.append(wc)
    
    sentence = str(e['simple_sentence'])
    wc = len(re.findall(r'\w+', sentence))
    wc_simple.append(wc)

In [None]:
print(np.mean(wc_normal))
print(np.mean(wc_simple))

In [None]:
for e in devset.as_numpy_iterator():
    sent_norm = str(e['normal_sentence'])
    wc_norm = len(re.findall(r'\w+', sent_norm))
    
    sent_simp = str(e['simple_sentence'])
    wc_simp = len(re.findall(r'\w+', sent_simp))

    if wc_norm > 150:
        print(str(e['normal_sentence']))

In [None]:
data = {
    'Length': wc_normal + wc_simple,
    'Type': ['Normal'] * len(wc_normal) + ['Simple'] * len(wc_simple)
}
df = pd.DataFrame(data)

sns.set(style="ticks")
f, (ax_hist, ax_box) = plt.subplots(2, sharex=True, 
                                    gridspec_kw={"height_ratios": (.85, .15)})

sns.boxplot(x='Length', y='Type', hue='Type', data=df, ax=ax_box, orient='h', whis=[0, 100], palette={'Normal': 'blue', 'Simple': 'orange'})
ax_box.set(yticks=[], ylabel=None)
ax_box.set(xlabel=None)
ax_box.set_axis_off()

sns.histplot(x=wc_normal, bins=range(0, max(max(wc_normal), max(wc_simple)) + 1), ax=ax_hist, color='blue', alpha=0.5, label='Normal', stat='density')
sns.histplot(x=wc_simple, bins=range(0, max(max(wc_normal), max(wc_simple)) + 1), ax=ax_hist, color='orange', alpha=0.5, label='Simple', stat='density')
ax_hist.tick_params(axis='x', which='both', bottom=True, top=False, labelbottom=True)

sns.despine(ax=ax_hist)
sns.despine(ax=ax_box, left=True)

ax_hist.legend()
ax_hist.set_title("Distribution of Sentence Lengths")
plt.show()

In [None]:
def select_fields(element):
    return element['normal_sentence'], element['simple_sentence']

transformed_dataset = devset.map(select_fields)
transformed_dataset

In [None]:
for step, (normal_sentence, simple_sentence) in enumerate(transformed_dataset):
    print(step)
    print(normal_sentence)
    print(simple_sentence)

---
## Model setup

---
## Model training