<a href="https://colab.research.google.com/github/pavanchhatpar/sentence-splitter/blob/master/Sentence_splitting_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download and Extract data

In [None]:
import gdown
import tempfile
import os
import zipfile

In [None]:
# Google Drive file ids of the dataset archives; each key doubles as the
# sub-directory name the data is later read from.
fileids = dict(
    DiscoveryBase="1bqyml-AD5EbisExKW5Vss4-Q_IhXy3lM",
    DiscoveryHard="18NDudOIkSp86FtIEB9Wo3rIpYtxdGxI3",
    DiscoveryBig="1OLgiGd3CCWIkROyTABS2APjplRiRJPoO",
)
# Directory the archives are extracted into.
data_path = "/content/data"

In [None]:
# Create the extraction directory; `exist_ok` keeps the cell safe to re-run
# and avoids the check-then-create race of a separate `os.path.exists` test.
os.makedirs(data_path, exist_ok=True)

In [None]:
# Download every archive into a throw-away temp file and unpack it into data_path.
for fileid in fileids.values():
  url = f"https://drive.google.com/uc?id={fileid}"
  with tempfile.NamedTemporaryFile() as f:
    gdown.download(url, f.name, quiet=False)
    # The context manager closes the archive handle once extraction is done
    # (the original left the ZipFile open).
    with zipfile.ZipFile(f) as z:
      z.extractall(data_path)

Downloading...
From: https://drive.google.com/uc?id=1bqyml-AD5EbisExKW5Vss4-Q_IhXy3lM
To: /tmp/tmp7_i68lwc
158MB [00:02, 59.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=18NDudOIkSp86FtIEB9Wo3rIpYtxdGxI3
To: /tmp/tmp9xhpu1br
158MB [00:02, 60.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1OLgiGd3CCWIkROyTABS2APjplRiRJPoO
To: /tmp/tmp8wh6p3je
316MB [00:05, 61.8MB/s]


# Process data

In [None]:
import pandas as pd

In [None]:
def _read_lines(path):
  """Return the lines of the text file at `path`, closing the file afterwards."""
  with open(path) as fh:
    return fh.read().splitlines()


# One DataFrame per dataset directory, stacking its train/test/dev files.
dfs = {}

for dirname in fileids.keys():
  dataset_path = os.path.join(data_path, dirname)
  l_df = []
  for cv in ["train", "test", "dev"]:
      # Line i of s1.<cv>, s2.<cv> and labels.<cv> together form one row.
      df_cv = pd.DataFrame(
          list(zip(
              _read_lines(f"{dataset_path}/s1.{cv}"),
              _read_lines(f"{dataset_path}/s2.{cv}"),
              _read_lines(f"{dataset_path}/labels.{cv}"))),
          columns=["s1", "s2", "y"])
      df_cv["set"] = cv  # remember which split the row came from
      l_df.append(df_cv)

  dfs[dirname] = pd.concat(l_df)

### Join sentence 1 and sentence 2

In [None]:
# Build the "full" sentence column: <s1 minus its last char><label minus its
# trailing ','><two spaces><s2>.
for frame in dfs.values():
  first_part = frame.s1.str[:-1]   # drops the final punctuation char, keeps the spaces before it
  connective = frame.y.str[:-1]    # drops the last ',' of the label
  spacer = "  "                    # double space, consistent with the existing space format
  # s2 is appended unchanged: the tokenizer used here is case insensitive,
  # so de-capitalizing its first letter is unnecessary.
  frame.loc[:, "full"] = first_part + connective + spacer + frame.s2

In [None]:
dfs["DiscoveryBase"].head()

Unnamed: 0,s1,s2,y,set,full
0,He helped to found the Mexican American ...,Sanchez became involved with the American...,"subsequently,",train,He helped to found the Mexican American ...
1,Then click the `` Paper Clip '' button ...,You can use any handheld device that ru...,"alternately,",train,Then click the `` Paper Clip '' button ...
2,That 's a long way for a program that ...,FAU is a program that under Cooney and ...,"presently,",train,That 's a long way for a program that ...
3,"FORT DRUM , N.Y. - Throughout its histo...",Soldiers receive assignments to multiple ...,"typically,",train,"FORT DRUM , N.Y. - Throughout its histo..."
4,They continued to dig noting that there ...,Every ten feet they found a layer of l...,"curiously,",train,They continued to dig noting that there ...


### Filter useful sentences

In [None]:
# Collect, per data source, every sentence that reads as "If ... (then|,) ...".
dataparts = []
for src, df in dfs.items():
  s1_starts_if = df.s1.str.startswith('If ')
  s1_has_split = df.s1.str.contains(',') | df.s1.str.contains(' then ')
  s2_starts_if = df.s2.str.startswith('If ')
  s2_has_split = df.s2.str.contains(',') | df.s2.str.contains(' then ')

  pieces = [
      # Joined sentence "If <s1> then, <s2>": label is 'then,' and s1 has no
      # comma / ' then ' of its own.
      df[(df.y == 'then,') & s1_starts_if & ~s1_has_split][['full', 'set']]
          .rename(columns={'full': 'sentence'}),
      # s1 alone already reads "If ... (then|,) ...".
      df[s1_starts_if & s1_has_split][['s1', 'set']]
          .rename(columns={'s1': 'sentence'}),
      # s2 alone already reads "If ... (then|,) ...".
      df[s2_starts_if & s2_has_split][['s2', 'set']]
          .rename(columns={'s2': 'sentence'}),
  ]

  newdf = pd.concat(pieces, ignore_index=True)
  newdf['src'] = src
  dataparts.append(newdf)

data = pd.concat(dataparts)

In [None]:
print(data.shape)
data.head()

(203725, 3)


Unnamed: 0,sentence,set,src
0,If your site is titled `` post generati...,train,DiscoveryBase
1,If you want to compete then See where ...,train,DiscoveryBase
2,If you have time for two things in Sav...,train,DiscoveryBase
3,If you 're having crash issues - delete...,train,DiscoveryBase
4,If a driver made one small mistake I f...,train,DiscoveryBase


In [None]:
# A sentence may occur in more than one of the three data sources;
# keep only its first occurrence.

data = data.drop_duplicates(subset='sentence')
print(data.shape)
data.head()

(177587, 3)


Unnamed: 0,sentence,set,src
0,If your site is titled `` post generati...,train,DiscoveryBase
1,If you want to compete then See where ...,train,DiscoveryBase
2,If you have time for two things in Sav...,train,DiscoveryBase
3,If you 're having crash issues - delete...,train,DiscoveryBase
4,If a driver made one small mistake I f...,train,DiscoveryBase


### Tokenize data

In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/9c/35/1c3f6e62d81f5f0daff1384e6d5e6c5758682a8357ebc765ece2b9def62b/transformers-3.0.0-py3-none-any.whl (754kB)
[K     |████████████████████████████████| 757kB 17.1MB/s 
[?25hCollecting tokenizers==0.8.0-rc4
[?25l  Downloading https://files.pythonhosted.org/packages/e8/bd/e5abec46af977c8a1375c1dca7cb1e5b3ec392ef279067af7f6bc50491a0/tokenizers-0.8.0rc4-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 45.6MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 55.6MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K  

In [None]:
from transformers import ElectraTokenizerFast

In [None]:
tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-base-discriminator")

In [None]:
data['tokenized'] = data.sentence.map(tokenizer.tokenize)

In [None]:
data.tokenized.map(lambda l : len(l)).describe(percentiles=[.25, .5, .75, .95])

count    177587.00000
mean         25.38919
std           7.51887
min           5.00000
25%          20.00000
50%          25.00000
75%          31.00000
95%          38.00000
max          73.00000
Name: tokenized, dtype: float64

In [None]:
max_len = 50  # roughly between 95% to 100% lengths

### Find the connective in tokenized data ('then' if present, otherwise the first ',') - target token (split_pos)

In [None]:
def _split_position(tokens):
  """Return index + 1 of the connective: 'then' if present, else the first ','.

  Raises ValueError when the token list contains neither.
  """
  # presumably the +1 accounts for the [CLS] token prepended at encoding time — TODO confirm
  connective = 'then' if 'then' in tokens else ','
  return tokens.index(connective) + 1

data['split_pos'] = data.tokenized.map(_split_position)

### Remove the connective token at split_pos

In [None]:
# Drop the connective token (it sits at list index split_pos-1) from every
# token list, keeping everything before and after it.
data['tokenized'] = data.apply(lambda row: row.tokenized[:row.split_pos-1] + row.tokenized[row.split_pos:], axis=1)

### Encode tokens to IDs thus getting final features

In [None]:
data.set.value_counts()

train    159907
test       8928
dev        8752
Name: set, dtype: int64

In [None]:
sets = [
      "train",
      "test",
      "dev"
]

In [None]:
def _encode_wordpieces(token_lists, tokenizer, max_len):
  """Encode sentences that are ALREADY split into word pieces.

  The token lists come from `tokenizer.tokenize`, so they contain word pieces
  such as '##rds'. Feeding them back through the tokenizer (as the previous
  `is_pretokenized=True` call did) re-splits each '##xyz' into '#', '#', 'xyz',
  which shifts all following tokens and misaligns `split_position`. Here the
  pieces are mapped to ids one-to-one instead.

  Args:
    token_lists: iterable of word-piece token lists.
    tokenizer: tokenizer providing `convert_tokens_to_ids` and the
      cls/sep/pad token ids.
    max_len: maximum encoded length, including [CLS] and [SEP].

  Returns:
    dict with 'input_ids', 'attention_mask' and 'token_type_ids', each a list
    of equal-length int lists padded to the longest encoded sentence.
  """
  budget = max_len - 2  # leave room for [CLS] and [SEP]
  id_rows = [
      [tokenizer.cls_token_id]
      + tokenizer.convert_tokens_to_ids(list(tokens))[:budget]
      + [tokenizer.sep_token_id]
      for tokens in token_lists
  ]
  width = max((len(row) for row in id_rows), default=0)

  input_ids, attention_mask, token_type_ids = [], [], []
  for row in id_rows:
    n_pad = width - len(row)
    input_ids.append(row + [tokenizer.pad_token_id] * n_pad)
    attention_mask.append([1] * len(row) + [0] * n_pad)
    token_type_ids.append([0] * width)  # single-segment input

  return {
      "input_ids": input_ids,
      "attention_mask": attention_mask,
      "token_type_ids": token_type_ids,
  }


# Per split: encoded model inputs plus the target split position.
features = {}
for split in sets:
  subset = data[data.set == split]
  features[split] = _encode_wordpieces(subset.tokenized.values, tokenizer, max_len)
  features[split]['split_position'] = list(subset.split_pos.values)

### Convert to TF Dataset for train, test and validation

In [None]:
import tensorflow as tf
from functools import partial

In [None]:
def gen(split, allfeatures):
  """Yield (model_inputs, split_position) pairs for one split.

  Args:
    split: key into `allfeatures`.
    allfeatures: mapping split -> dict holding the parallel sequences
      'input_ids', 'attention_mask', 'token_type_ids' and 'split_position'.

  Yields:
    Tuples of (dict with the three model inputs, split position).
  """
  feats = allfeatures[split]
  rows = zip(
      feats['input_ids'],
      feats['attention_mask'],
      feats['token_type_ids'],
      feats['split_position'],
  )
  for ids, mask, type_ids, target in rows:
    model_inputs = {
        "input_ids": ids,
        "attention_mask": mask,
        "token_type_ids": type_ids,
    }
    yield (model_inputs, target)

In [None]:
# Wrap each split's generator in a tf.data.Dataset yielding
# ({input_ids, attention_mask, token_type_ids}, split_position) with int32 tensors.
tf_datasets = {}
input_names = ("input_ids", "attention_mask", "token_type_ids")
for split in sets:
  tf_datasets[split] = tf.data.Dataset.from_generator(
      partial(gen, split=split, allfeatures=features),
      output_types=({name: tf.int32 for name in input_names}, tf.int32),
      output_shapes=(
          {name: tf.TensorShape([None]) for name in input_names},
          tf.TensorShape([]),
      ),
  )

In [None]:
tf_datasets['train'].element_spec

({'attention_mask': TensorSpec(shape=(None,), dtype=tf.int32, name=None),
  'input_ids': TensorSpec(shape=(None,), dtype=tf.int32, name=None),
  'token_type_ids': TensorSpec(shape=(None,), dtype=tf.int32, name=None)},
 TensorSpec(shape=(), dtype=tf.int32, name=None))

In [None]:
next(tf_datasets['train'].as_numpy_iterator())

({'attention_mask': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1], dtype=int32),
  'input_ids': array([  101,  2065,  2115,  2609,  2003,  4159,  1036,  1036,  2695,
          4245,  1060, 11265,  1001,  1001, 16428,  2015,  1057,  1011,
          9152,  1001,  1001,  8915,   999,  1005,  2092,  1010,  2017,
          2113,  1010,  3071,  2097,  2022,  2559,  2012,  1996,  3931,
          1999,  2543,  1001,  1001,  1042,  2080,  1001,  1001,  1060,
          2012, 14883,  1001,  1001,   102], dtype=int32),
  'token_type_ids': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0], dtype=int32)},
 19)

### Save processed datasets

In [None]:
def make_example(X, y):
    """Serialize one (features, label) dataset element into a scalar string tensor.

    `serialize` is run through `tf.py_function`; the result is reshaped to
    shape () so the mapped dataset yields scalar strings.
    """
    tensors = [X[key] for key in ('input_ids', 'attention_mask', 'token_type_ids')]
    record = tf.py_function(serialize, tensors + [y], tf.string)
    return tf.reshape(record, ())

def bytes_feature(value):
    """Wrap a string / byte value (or an eager tensor holding one) in a tf.train.Feature."""
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        # BytesList won't unpack a string from an EagerTensor, so take its raw value.
        value = value.numpy()
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)

def serialize(input_ids, attention_mask, token_type_ids, y):
    """Pack the three input tensors and the label into a serialized tf.train.Example.

    Each tensor is stored as a bytes feature holding its `tf.io.serialize_tensor`
    encoding, under the keys 'input_ids', 'attention_mask', 'token_type_ids'
    and 'label'.
    """
    named_tensors = (
        ("input_ids", input_ids),
        ("attention_mask", attention_mask),
        ("token_type_ids", token_type_ids),
        ("label", y),
    )
    feature = {
        name: bytes_feature(tf.io.serialize_tensor(tensor))
        for name, tensor in named_tensors
    }
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

In [None]:
# Turn every split into a dataset of serialized tf.train.Example strings.
train, test, dev = (
    tf_datasets[name].map(make_example) for name in ('train', 'test', 'dev')
)

In [None]:
def save(train, test, dev, location):
    """Write the three datasets as ZLIB-compressed TFRecord files under `location`.

    Creates `location` if it does not exist. Files are written in the order
    dev, test, train as dev.tfrecord, test.tfrecord and train.tfrecord.

    Raises:
        ValueError: if `location` exists but is not a directory.
    """
    if not os.path.exists(location):
        os.makedirs(location)
    if not os.path.isdir(location):
        raise ValueError(f"{location} should be a directory!")

    jobs = (
        ("******** Saving Dev set ********", "dev.tfrecord", dev),
        ("******** Saving Test set ********", "test.tfrecord", test),
        ("******** Saving Training set ********", "train.tfrecord", train),
    )
    for banner, filename, dataset in jobs:
        print(banner)
        target = os.path.join(location, filename)
        tf.data.experimental.TFRecordWriter(target, "ZLIB").write(dataset)
    print("******** Finished saving dataset ********")

In [None]:
save(train, test, dev, "./processed_data")

******** Saving Dev set ********
******** Saving Test set ********
******** Saving Training set ********
******** Finished saving dataset ********


### Load saved dataset

In [None]:
def parse_ex(example_proto):
    """Decode one serialized tf.train.Example back into (inputs, label).

    Every feature is stored as a serialized int32 tensor. The three inputs are
    given shape [None] and the label shape [].

    Returns:
        ({'input_ids', 'attention_mask', 'token_type_ids'} dict, label)
    """
    input_names = ('input_ids', 'attention_mask', 'token_type_ids')
    feature_description = {
        key: tf.io.FixedLenFeature([], tf.string, default_value='')
        for key in input_names + ('label',)
    }
    example = tf.io.parse_single_example(example_proto, feature_description)

    def decode(key, shape):
        # Deserialize the stored tensor and pin its static shape.
        tensor = tf.io.parse_tensor(example[key], out_type=tf.int32)
        tensor.set_shape(shape)
        return tensor

    inputs = {key: decode(key, [None]) for key in input_names}
    label = decode('label', [])
    return (inputs, label)

In [None]:
def load(location):
  """Read the ZLIB-compressed TFRecord files under `location` and parse them.

  Returns:
    (train, test, dev) datasets, each mapped through `parse_ex`.
  """
  AUTOTUNE = tf.data.experimental.AUTOTUNE

  def read_split(filename):
    # One TFRecord file -> dataset of parsed (inputs, label) pairs.
    path = os.path.join(location, filename)
    records = tf.data.TFRecordDataset([path], compression_type='ZLIB')
    return records.map(parse_ex, num_parallel_calls=AUTOTUNE)

  train = read_split("train.tfrecord")
  dev = read_split("dev.tfrecord")
  test = read_split("test.tfrecord")

  return train, test, dev

In [None]:
train, test, dev = load("./processed_data")

In [None]:
train

<ParallelMapDataset shapes: ({input_ids: (None,), attention_mask: (None,), token_type_ids: (None,)}, ()), types: ({input_ids: tf.int32, attention_mask: tf.int32, token_type_ids: tf.int32}, tf.int32)>

# Modeling a sentence splitting network

### ELECTRA
 - Proposes a new method to train a language model borrowing concepts from GAN
 - It does have a generator and a discriminator, but it is not trained in an adversarial fashion; it uses just the maximum likelihood loss
 - The generator is a Masked LM and the discriminator predicts if a token in the sequence is real/ replaced.
 - Discriminator becomes the final LM

### Sentence Splitting Head
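 - We use the pretrained ELECTRA model and replace its binary real/replaced final layer with a single-unit dense layer that scores every token; a softmax over the sequence positions (seq_len classes, not vocab_size) then selects the split position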
 - ELECTRA is just a choice, other pretrained LMs can also be used

In [None]:
from transformers import TFElectraPreTrainedModel
from transformers.modeling_tf_electra import TFElectraMainLayer
import tensorflow as tf

In [None]:
class TFSentenceSplitLoss:
  """Mixin providing the loss used for split-position prediction."""

  def compute_loss(self, labels, logits):
    """Return the un-reduced sparse categorical cross-entropy.

    Args:
      labels: integer target per example.
      logits: unnormalized scores per example (`from_logits=True`).

    Returns:
      Loss tensor with no reduction applied (`Reduction.NONE`).
    """
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
    # The leftover debug `print` of the loss tensor was removed.
    return loss_fn(labels, logits)

In [None]:
class TFElectraForSentenceSplitting(TFElectraPreTrainedModel, TFSentenceSplitLoss):
  """ELECTRA main layer followed by a one-unit dense head applied to every token.

  The head yields one logit per sequence position, shape (batch_size, seq_len).
  """

  def __init__(self, config, *inputs, **kwargs):
    super().__init__(config, *inputs, **kwargs)
    self.electra = TFElectraMainLayer(config, name="electra")
    # One score per token.
    self.split_output = tf.keras.layers.Dense(1, name="split_output")

  def call(
      self,
      inputs=None,
      attention_mask=None,
      token_type_ids=None,
      position_ids=None,
      head_mask=None,
      inputs_embeds=None,
      output_attentions=None,
      output_hidden_states=None,
      labels=None,
      training=False):
    """Run the encoder and the split head.

    Returns:
      Tuple `(loss), split_logits, (hidden_states), (attentions)`; the loss is
      prepended only when `labels` is given.
    """
    encoder_outputs = self.electra(
        inputs=inputs,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        training=training,
    )
    sequence_output = encoder_outputs[0]

    # (batch_size, seq_len, 1) -> (batch_size, seq_len)
    token_scores = self.split_output(sequence_output)
    split_logits = tf.squeeze(token_scores, axis=-1)

    outputs = (split_logits,) + encoder_outputs[1:]
    if labels is None:
      return outputs

    loss = self.compute_loss(labels, split_logits)
    return (loss,) + outputs

# Training

In [None]:
model = TFElectraForSentenceSplitting.from_pretrained("google/electra-base-discriminator")

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing TFElectraForSentenceSplitting: ['discriminator_predictions']
- This IS expected if you are initializing TFElectraForSentenceSplitting from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFElectraForSentenceSplitting from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFElectraForSentenceSplitting were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['split_output']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

In [None]:
!nvidia-smi

Thu Jul  2 14:56:22 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P0    32W /  70W |   9153MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# There are some long sentences where the connective goes beyond `max_len`
# We need to remove them so that our training model doesn't crash
# NOTE: the literal 50 below mirrors `max_len = 50` set in the tokenization
# section; keep the two values in sync if either changes.

train = train.filter(lambda X, y: y < 50)
dev = dev.filter(lambda X, y: y < 50)
test = test.filter(lambda X, y: y < 50)

In [None]:
batch_size = 32
model.fit(train.batch(batch_size), epochs=2, validation_data=dev.batch(batch_size))

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fa0601a58d0>

In [None]:
model.evaluate(train.batch(batch_size))



[0.14421269297599792, 0.9543038606643677]

In [None]:
model.evaluate(dev.batch(batch_size))



[0.2826082408428192, 0.9180758595466614]

In [None]:
model.evaluate(test.batch(batch_size))



[0.2956418991088867, 0.9187948107719421]

# Model pipeline
 - A very useful feature from the transformers library
 - Abstracts the entire text pre-processing and model prediction work into an end-to-end pipeline

In [None]:
from transformers import pipeline

In [None]:
nlp = pipeline("sentiment-analysis", model, model.config, tokenizer, "tf")

In [None]:
data = next(dev.as_numpy_iterator())[0]

In [None]:
tokenizer.vocab_size

30522

In [None]:
import numpy as np

In [None]:
model.config.id2label = dict(zip(np.arange(tokenizer.vocab_size), (np.arange(tokenizer.vocab_size) - 1).astype(str)))

### Save pipeline

In [None]:
del model.config.id2label # not needed, unnecessary use of space

In [None]:
total_variables = 0
for var in model.variables:
  total_variables += tf.size(var)
print("Total", total_variables.numpy(), "variables")

Total 108892417 variables


In [None]:
nlp.save_pretrained("./saved_model")

In [None]:
!ls -lh ./saved_model

total 416M
-rw-r--r-- 1 root root  585 Jul  2 17:32 config.json
-rw-r--r-- 1 root root  112 Jul  2 17:32 special_tokens_map.json
-rw-r--r-- 1 root root 416M Jul  2 17:32 tf_model.h5
-rw-r--r-- 1 root root   48 Jul  2 17:32 tokenizer_config.json
-rw-r--r-- 1 root root 227K Jul  2 17:32 vocab.txt


### Load pipeline

In [None]:
model1 = TFElectraForSentenceSplitting.from_pretrained("./saved_model")

All model checkpoint weights were used when initializing TFElectraForSentenceSplitting.

All the weights of TFElectraForSentenceSplitting were initialized from the model checkpoint at ./saved_model.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFElectraForSentenceSplitting for predictions without further training.


In [None]:
tokenizer1 = ElectraTokenizerFast.from_pretrained("./saved_model")

In [None]:
model1.config.id2label = dict(zip(np.arange(tokenizer1.vocab_size), np.arange(tokenizer1.vocab_size) - 1))

In [None]:
nlp1 = pipeline("sentiment-analysis", model1, model1.config, tokenizer1, 'tf')

# Other ways to train - not working with TF 2.2

In [None]:
from transformers import TFTrainer, TFTrainingArguments

In [None]:
training_args = TFTrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=2,              # total # of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

trainer = TFTrainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train.batch(32),    # tensorflow_datasets training dataset
    eval_dataset=dev.batch(32)       # tensorflow_datasets evaluation dataset
)

In [None]:
trainer.train()

Instructions for updating:
renamed to `run`


TypeError: ignored