### Using the TextVectorization layer

In [59]:
import string
import re
import string
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

In [61]:
class Vectorizer:
    """Minimal pure-Python text vectorizer: standardize, tokenize, index.

    Index 0 is reserved for the empty/padding token ``""`` and index 1 for
    the out-of-vocabulary token ``"[UNK]"``; real tokens are numbered from 2
    in order of first appearance in the dataset passed to
    ``make_vocabulary``.
    """

    def __init__(self):
        # Start with only the two reserved entries so that encode()/decode()
        # work (everything maps to "[UNK]") even before make_vocabulary().
        self.vocabulary = {"": 0, "[UNK]": 1}
        self.inverse_vocabulary = {0: "", 1: "[UNK]"}

    def standardize(self, text):
        """Return ``text`` lowercased with ASCII punctuation removed."""
        text = text.lower()
        return "".join(char for char in text if char not in string.punctuation)

    def tokenize(self, text):
        """Standardize ``text`` and split it on whitespace into tokens."""
        text = self.standardize(text)
        return text.split()

    def make_vocabulary(self, dataset):
        """Rebuild the token <-> index mappings from an iterable of strings."""
        self.vocabulary = {"": 0, "[UNK]": 1}
        for text in dataset:
            # tokenize() already standardizes, so no separate call is needed.
            for token in self.tokenize(text):
                if token not in self.vocabulary:
                    self.vocabulary[token] = len(self.vocabulary)
        self.inverse_vocabulary = {
            index: token for token, index in self.vocabulary.items()}

    def encode(self, text):
        """Return the list of vocabulary indices for ``text``.

        Tokens that are not in the vocabulary are encoded as 1 ("[UNK]").
        """
        return [self.vocabulary.get(token, 1) for token in self.tokenize(text)]

    def decode(self, int_sequence):
        """Return the space-joined tokens for an iterable of indices.

        Indices that are not in the vocabulary are rendered as "[UNK]".
        """
        return " ".join(
            self.inverse_vocabulary.get(i, "[UNK]") for i in int_sequence)

# Build the vocabulary from a three-line toy corpus; the cells below encode and
# decode a test sentence against it.
vectorizer = Vectorizer()
dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms.",
]
vectorizer.make_vocabulary(dataset)

In [62]:
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = vectorizer.encode(test_sentence)
print(encoded_sentence)

[2, 3, 5, 7, 1, 5, 6]


In [63]:
decoded_sentence = vectorizer.decode(encoded_sentence)
print(decoded_sentence)

i write rewrite and [UNK] rewrite again


In [64]:
text_vectorization = TextVectorization(
    output_mode="int",
)

In [65]:
def custom_standardization_fn(string_tensor):
    """Lowercase a string tensor and delete every ASCII punctuation character."""
    # Character class matching any single punctuation mark; re.escape makes
    # regex metacharacters inside the class literal.
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"
    lowered = tf.strings.lower(string_tensor)
    stripped = tf.strings.regex_replace(lowered, punctuation_pattern, "")
    return stripped

def custom_split_fn(string_tensor):
    """Split a string tensor into tokens using tf.strings.split's default separator."""
    tokens = tf.strings.split(string_tensor)
    return tokens

text_vectorization = TextVectorization(
    output_mode="int",
    standardize=custom_standardization_fn,
    split=custom_split_fn,
)

In [66]:
dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms.",
]
text_vectorization.adapt(dataset)

**Displaying the vocabulary**

In [67]:
text_vectorization.get_vocabulary()

['',
 '[UNK]',
 'erase',
 'write',
 'then',
 'rewrite',
 'poppy',
 'i',
 'blooms',
 'and',
 'again',
 'a']

In [68]:
vocabulary = text_vectorization.get_vocabulary()
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = text_vectorization(test_sentence)
print(encoded_sentence)

tf.Tensor([ 7  3  5  9  1  5 10], shape=(7,), dtype=int64)


In [69]:
inverse_vocab = dict(enumerate(vocabulary))
decoded_sentence = " ".join(inverse_vocab[int(i)] for i in encoded_sentence)
print(decoded_sentence)

i write rewrite and [UNK] rewrite again


## Two approaches for representing groups of words: Sets and sequences

### Load the IMDB movie reviews data

In [2]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  24.6M      0  0:00:03  0:00:03 --:--:-- 24.6M


In [3]:
!tar -xf aclImdb_v1.tar.gz

In [4]:
!rm -r aclImdb/train/unsup

In [5]:
!cat aclImdb/train/pos/4077_10.txt

I first saw this back in the early 90s on UK TV, i did like it then but i missed the chance to tape it, many years passed but the film always stuck with me and i lost hope of seeing it TV again, the main thing that stuck with me was the end, the hole castle part really touched me, its easy to watch, has a great story, great music, the list goes on and on, its OK me saying how good it is but everyone will take there own best bits away with them once they have seen it, yes the animation is top notch and beautiful to watch, it does show its age in a very few parts but that has now become part of it beauty, i am so glad it has came out on DVD as it is one of my top 10 films of all time. Buy it or rent it just see it, best viewing is at night alone with drink and food in reach so you don't have to stop the film.<br /><br />Enjoy

In [6]:
import os
import pathlib
import random
import shutil

base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"

# Hold out 20% of the training reviews of each class as a validation set by
# moving the files from aclImdb/train/<class> to aclImdb/val/<class>.
for category in ("neg", "pos"):
    category_val_dir = val_dir / category
    category_val_dir.mkdir(parents=True, exist_ok=True)
    # Re-running the cell must not move a further 20% out of the training
    # set, so skip any class whose validation folder is already populated.
    if any(category_val_dir.iterdir()):
        continue
    # os.listdir() returns entries in arbitrary order; sorting first makes the
    # seeded shuffle pick the same files on every machine.
    files = sorted(os.listdir(train_dir / category))
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2 * len(files))
    # Explicit start index: files[-0:] would select *all* files.
    val_files = files[len(files) - num_val_samples:]
    for fname in val_files:
        shutil.move(train_dir / category / fname,
                    category_val_dir / fname)

In [7]:
import pandas as pd

# Load every training review into a DataFrame with columns "text" and "label"
# (0 = neg, 1 = pos). Records are collected in a list and the frame is built
# once: DataFrame.append in a loop is quadratic and no longer exists in
# pandas >= 2.0. Each file is closed as soon as it has been read.
train_records = []
for label, category in enumerate(("neg", "pos")):
    # Sorted so the row order does not depend on the file system.
    for fname in sorted(os.listdir(train_dir / category)):
        with open(train_dir / category / fname, "r", encoding="utf-8") as review_file:
            train_records.append({"text": review_file.read(), "label": label})
# A list (not a set) fixes the column order.
df_train = pd.DataFrame(train_records, columns=["text", "label"])

In [8]:
# Load every validation review into a DataFrame with columns "text" and
# "label" (0 = neg, 1 = pos). Records are collected in a list and the frame is
# built once: DataFrame.append in a loop is quadratic and no longer exists in
# pandas >= 2.0. Each file is closed as soon as it has been read.
val_records = []
for label, category in enumerate(("neg", "pos")):
    # Sorted so the row order does not depend on the file system.
    for fname in sorted(os.listdir(val_dir / category)):
        with open(val_dir / category / fname, "r", encoding="utf-8") as review_file:
            val_records.append({"text": review_file.read(), "label": label})
# A list (not a set) fixes the column order.
df_val = pd.DataFrame(val_records, columns=["text", "label"])

In [9]:
df_train.shape, df_val.shape

((20000, 2), (5000, 2))

### Prepare Datasets

In [16]:
from tensorflow import keras
batch_size = 32

# One batched dataset per split, read straight from the directory tree;
# Keras prints how many files and classes it found for each split.
train_ds, val_ds, test_ds = [
    keras.utils.text_dataset_from_directory(split_dir, batch_size=batch_size)
    for split_dir in ("aclImdb/train", "aclImdb/val", "aclImdb/test")
]

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [20]:
!pip install datasets --quiet

[K     |████████████████████████████████| 325 kB 4.4 MB/s 
[K     |████████████████████████████████| 212 kB 8.3 MB/s 
[K     |████████████████████████████████| 1.1 MB 10.6 MB/s 
[K     |████████████████████████████████| 134 kB 33.1 MB/s 
[K     |████████████████████████████████| 127 kB 28.9 MB/s 
[K     |████████████████████████████████| 271 kB 30.4 MB/s 
[K     |████████████████████████████████| 144 kB 8.1 MB/s 
[K     |████████████████████████████████| 94 kB 1.4 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
[?25h

In [32]:
from datasets import Dataset
# Wrap the pandas frames as Hugging Face `Dataset` objects (the DataFrame index
# is not carried over).
# NOTE(review): this rebinds `train_ds`, which an earlier cell assigned to the
# dataset returned by keras.utils.text_dataset_from_directory. Cells further
# down still treat `train_ds` as that Keras dataset (`for inputs, targets in
# train_ds`, `train_ds.map(lambda x, y: x)`), so a top-to-bottom run would hand
# them this object instead. The execution counts suggest the cells were run out
# of order; consider a distinct name here and in the tokenization cell.
train_ds = Dataset.from_pandas(df_train, preserve_index = False)
valid_ds = Dataset.from_pandas(df_val, preserve_index = False)

**Displaying the shapes and dtypes of the first batch**

In [72]:
for inputs, targets in train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

inputs.shape: (32,)
inputs.dtype: <dtype: 'string'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor(b"The script for this Columbo film seemed to be pulled right out of a sappy 1980's soap opera. Deeply character-driven films are great, but only if the characters are compelling. And in this film the only thing compelling was my desire to change the channel. The villain's dialog sounds as if it were written by a romance novelist. The great Lt. Columbo himself is no where near his famous, lovable, self-effacing, crumpled self; and the bride/kidnap victim is a whimpering, one-dimensional damsel-in-distress (she cowers in fear from a tiny scalpel held flimsily in the hand of her abductor - come on!!! I could have knocked the scalpel out of his hand and kicked him in the you-know-what in 2 seconds). In any sense of reality, this character would have at least TRIED to struggle or fight back at least a little. And speaking of reality....the story revolves around a kid

### Logistic regression TF-IDF baseline

In [9]:
!pip install eli5 --quiet

[?25l[K     |███                             | 10 kB 20.3 MB/s eta 0:00:01[K     |██████▏                         | 20 kB 23.2 MB/s eta 0:00:01[K     |█████████▎                      | 30 kB 24.6 MB/s eta 0:00:01[K     |████████████▍                   | 40 kB 21.1 MB/s eta 0:00:01[K     |███████████████▌                | 51 kB 16.4 MB/s eta 0:00:01[K     |██████████████████▌             | 61 kB 18.5 MB/s eta 0:00:01[K     |█████████████████████▋          | 71 kB 16.8 MB/s eta 0:00:01[K     |████████████████████████▊       | 81 kB 15.8 MB/s eta 0:00:01[K     |███████████████████████████▉    | 92 kB 17.1 MB/s eta 0:00:01[K     |███████████████████████████████ | 102 kB 18.1 MB/s eta 0:00:01[K     |████████████████████████████████| 106 kB 18.1 MB/s 
[?25h

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from matplotlib import pyplot as plt
import seaborn as sns
import eli5

In [33]:
text_transformer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), lowercase=True, max_features=150000)

In [34]:
X_train_text = text_transformer.fit_transform(df_train['text'])
X_test_text = text_transformer.transform(df_val['text'])

In [35]:
df_train['label'] = df_train['label'].astype('int')
df_val['label'] = df_val['label'].astype('int')

In [36]:
X_train_text.shape, X_test_text.shape

((20000, 150000), (5000, 150000))

In [54]:
logit = LogisticRegression(C=10e1, solver='lbfgs',  random_state=17, n_jobs=4)

In [55]:
%%time
logit.fit(X_train_text, df_train['label'].values)

CPU times: user 59.3 ms, sys: 20.1 ms, total: 79.4 ms
Wall time: 5.85 s


LogisticRegression(C=100.0, n_jobs=4, random_state=17)

In [56]:
from sklearn.metrics import accuracy_score

pred_train = logit.predict(X_train_text)
pred_test = logit.predict(X_test_text)

accuracy_score(df_val['label'].values, pred_test)

0.8946

In [45]:
# Show the 10 most positive and 10 most negative feature weights.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement and returns the same names.
eli5.show_weights(estimator=logit,
                  feature_names=list(text_transformer.get_feature_names_out()),
                  top=(10, 10))



Weight?,Feature
+15.057,excellent
+14.699,great
+13.614,perfect
+12.550,best
+12.200,wonderful
+11.752,favorite
+10.473,amazing
+10.197,brilliant
+10.136,fun
+9.785,enjoyed


### Bag-of-words models

#### Single words (unigrams) with binary encoding

**Preprocessing our datasets with a `TextVectorization` layer**

In [73]:
# Multi-hot unigram encoder with the vocabulary capped at 20,000 entries.
text_vectorization = TextVectorization(max_tokens=20000, output_mode="multi_hot")

# The vocabulary is learned from the raw text only, so drop the labels first.
text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)

# Vectorize the text of every (text, label) pair in each of the three splits.
binary_1gram_train_ds, binary_1gram_val_ds, binary_1gram_test_ds = [
    split.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
    for split in (train_ds, val_ds, test_ds)
]

**Inspecting the output of our binary unigram dataset**

In [74]:
for inputs, targets in binary_1gram_train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'float32'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor([1. 1. 1. ... 0. 0. 0.], shape=(20000,), dtype=float32)
targets[0]: tf.Tensor(0, shape=(), dtype=int32)


**Our model-building utility**

In [75]:
from tensorflow import keras
from tensorflow.keras import layers

def get_model(max_tokens=20000, hidden_dim=16):
    """Build and compile a binary classifier with one hidden layer.

    Args:
        max_tokens: Length of each input vector.
        hidden_dim: Number of units in the hidden ReLU layer.

    Returns:
        A compiled ``keras.Model`` with a single sigmoid output, using the
        RMSprop optimizer, binary cross-entropy loss and an accuracy metric.
    """
    model_input = keras.Input(shape=(max_tokens,))
    hidden = layers.Dense(hidden_dim, activation="relu")(model_input)
    regularized = layers.Dropout(0.5)(hidden)
    probability = layers.Dense(1, activation="sigmoid")(regularized)

    classifier = keras.Model(model_input, probability)
    classifier.compile(
        optimizer="rmsprop",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return classifier

**Training and testing the binary unigram model**

In [76]:
# Train the dense classifier on the multi-hot unigram features.
model = get_model()
model.summary()
# Checkpoint to "binary_1gram.keras"; with save_best_only=True the file is only
# overwritten when the monitored quantity improves (see the Keras
# ModelCheckpoint docs for the default monitor).
callbacks = [
    keras.callbacks.ModelCheckpoint("binary_1gram.keras",
                                    save_best_only=True)
]
# `history` is kept so the plotting cells below can draw the learning curves.
history = model.fit(binary_1gram_train_ds.cache(),
          validation_data=binary_1gram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks)
# Evaluate the checkpointed model rather than the weights from the last epoch.
model = keras.models.load_model("binary_1gram.keras")
# evaluate() returns [loss, accuracy] for the metrics compiled in get_model().
print(f"Test acc: {model.evaluate(binary_1gram_test_ds)[1]:.3f}")

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense (Dense)               (None, 16)                320016    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test acc: 0.888


In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by `model.fit`.
history_dict = history.history
loss_values, val_loss_values = history_dict["loss"], history_dict["val_loss"]
epochs = range(1, len(loss_values) + 1)  # 1-based epoch numbers for the x axis

# Blue dots for the training curve, solid blue line for the validation curve.
for series, fmt, label in (
    (loss_values, "bo", "Training loss"),
    (val_loss_values, "b", "Validation loss"),
):
    plt.plot(epochs, series, fmt, label=label)
plt.title("Training and validation loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [None]:
# Clear the current figure before drawing the accuracy curves.
plt.clf()
acc, val_acc = history_dict["accuracy"], history_dict["val_accuracy"]

# Blue dots for the training curve, solid blue line for the validation curve.
for series, fmt, label in (
    (acc, "bo", "Training acc"),
    (val_acc, "b", "Validation acc"),
):
    plt.plot(epochs, series, fmt, label=label)
plt.title("Training and validation accuracy")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

#### Bigrams with binary encoding

**Configuring the `TextVectorization` layer to return bigrams**

In [None]:
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode="multi_hot",
)

**Training and testing the binary bigram model**

In [None]:
# Learn the bigram vocabulary, then vectorize the text of all three splits.
text_vectorization.adapt(text_only_train_ds)
binary_2gram_train_ds, binary_2gram_val_ds, binary_2gram_test_ds = [
    split.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
    for split in (train_ds, val_ds, test_ds)
]

# Fresh classifier, checkpointed to "binary_2gram.keras" during training.
model = get_model()
model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint("binary_2gram.keras", save_best_only=True),
]
model.fit(
    binary_2gram_train_ds.cache(),
    validation_data=binary_2gram_val_ds.cache(),
    epochs=10,
    callbacks=callbacks,
)

# Score the checkpointed model on the test split; index 1 is the accuracy.
model = keras.models.load_model("binary_2gram.keras")
test_metrics = model.evaluate(binary_2gram_test_ds)
print(f"Test acc: {test_metrics[1]:.3f}")

#### Bigrams with TF-IDF encoding

**Configuring the `TextVectorization` layer to return token counts**

In [None]:
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode="count"
)

**Configuring `TextVectorization` to return TF-IDF-weighted outputs**

In [None]:
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode="tf_idf",
)

**Training and testing the TF-IDF bigram model**

In [None]:
# Fit the TF-IDF bigram vocabulary and statistics on the training text.
text_vectorization.adapt(text_only_train_ds)


def _vectorize_split(split):
    """Replace the raw text of each (text, label) pair with its vectorized form."""
    return split.map(
        lambda x, y: (text_vectorization(x), y),
        num_parallel_calls=4)


tfidf_2gram_train_ds = _vectorize_split(train_ds)
tfidf_2gram_val_ds = _vectorize_split(val_ds)
tfidf_2gram_test_ds = _vectorize_split(test_ds)

# Fresh classifier, checkpointed to "tfidf_2gram.keras" during training.
model = get_model()
model.summary()
checkpoint = keras.callbacks.ModelCheckpoint(
    "tfidf_2gram.keras", save_best_only=True)
callbacks = [checkpoint]
model.fit(
    tfidf_2gram_train_ds.cache(),
    validation_data=tfidf_2gram_val_ds.cache(),
    epochs=10,
    callbacks=callbacks,
)

# Score the checkpointed model on the test split; index 1 is the accuracy.
model = keras.models.load_model("tfidf_2gram.keras")
test_metrics = model.evaluate(tfidf_2gram_test_ds)
print(f"Test acc: {test_metrics[1]:.3f}")

In [None]:
# Chain the vectorization layer and the trained classifier into one model that
# accepts raw strings (one string per sample).
# NOTE(review): this uses whichever `text_vectorization` and `model` are
# currently bound — presumably the TF-IDF bigram pair from the cell above;
# confirm the cells are run in order.
inputs = keras.Input(shape=(1,), dtype="string")
processed_inputs = text_vectorization(inputs)
outputs = model(processed_inputs)
inference_model = keras.Model(inputs, outputs)

In [None]:
import tensorflow as tf

# A batch holding a single raw review, shaped (1, 1) to match the string input
# of `inference_model`.
raw_text_data = tf.convert_to_tensor([
    ["That was an excellent movie, I loved it."],
])
predictions = inference_model(raw_text_data)
# The classifier ends in Dense(1), so `predictions` is (batch, 1). Select the
# scalar explicitly: float() on a one-element array relies on an implicit
# array-to-scalar conversion that NumPy 1.25+ deprecates.
print(f"{float(predictions[0, 0] * 100):.2f} percent positive")

## Fine-tune a DistilBERT model

In [10]:
import re
import string
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

In [11]:
!pip install transformers --quiet

[K     |████████████████████████████████| 3.8 MB 5.1 MB/s 
[K     |████████████████████████████████| 895 kB 44.0 MB/s 
[K     |████████████████████████████████| 67 kB 4.4 MB/s 
[K     |████████████████████████████████| 6.5 MB 34.3 MB/s 
[K     |████████████████████████████████| 596 kB 48.0 MB/s 
[?25h

In [39]:
from transformers import AutoModelForSequenceClassification
# Hugging Face Hub checkpoint to start from; going by its name it is DistilBERT
# already fine-tuned on SST-2 sentiment. The commented-out line is the plain
# pretrained alternative.
hf_model = 'distilbert-base-uncased-finetuned-sst-2-english'
#hf_model = "distilbert-base-uncased"
# NOTE(review): this rebinds `model`, which the Keras cells above used for the
# bag-of-words classifiers.
model = AutoModelForSequenceClassification.from_pretrained(hf_model, num_labels=2)

https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpi8rpbbbe


Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

storing https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/4e60bb8efad3d4b7dc9969bf204947c185166a0a3cf37ddb6f481a876a3777b5.9f8326d0b7697c7fd57366cdde57032f46bc10e37ae81cb7eb564d66d23ec96b
creating metadata file for /root/.cache/huggingface/transformers/4e60bb8efad3d4b7dc9969bf204947c185166a0a3cf37ddb6f481a876a3777b5.9f8326d0b7697c7fd57366cdde57032f46bc10e37ae81cb7eb564d66d23ec96b
loading configuration file https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/4e60bb8efad3d4b7dc9969bf204947c185166a0a3cf37ddb6f481a876a3777b5.9f8326d0b7697c7fd57366cdde57032f46bc10e37ae81cb7eb564d66d23ec96b
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased-finetuned-sst-2-english",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout":

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

storing https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/8d04c767d9d4c14d929ce7ad8e067b80c74dbdb212ef4c3fb743db4ee109fae0.9d268a35da669ead745c44d369dc9948b408da5010c6bac414414a7e33d5748c
creating metadata file for /root/.cache/huggingface/transformers/8d04c767d9d4c14d929ce7ad8e067b80c74dbdb212ef4c3fb743db4ee109fae0.9d268a35da669ead745c44d369dc9948b408da5010c6bac414414a7e33d5748c
loading weights file https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/8d04c767d9d4c14d929ce7ad8e067b80c74dbdb212ef4c3fb743db4ee109fae0.9d268a35da669ead745c44d369dc9948b408da5010c6bac414414a7e33d5748c
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-bas

In [40]:
from transformers import TrainingArguments
training_args = TrainingArguments("test_trainer")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(hf_model)

In [42]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the notebook's tokenizer.

    Sequences are padded to the tokenizer's maximum length and longer ones
    are truncated. Returns whatever the tokenizer call returns.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded


# Tokenize both splits batch by batch; the cell output shows the result keeps
# 'text' and 'label' and adds 'input_ids' and 'attention_mask'.
# NOTE(review): `train_ds` must be the Hugging Face dataset here, not the Keras
# one bound to the same name earlier. Despite the `tf_` prefix these datasets
# feed the Trainer, whose logs report PyTorch.
tf_train_ds = train_ds.map(tokenize_function, batched=True)
tf_eval_ds = valid_ds.map(tokenize_function, batched=True)

  0%|          | 0/20 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [43]:
tf_train_ds

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 20000
})

In [44]:
from transformers import Trainer

trainer = Trainer(model=model, args=training_args, 
                  train_dataset=tf_train_ds, 
                  eval_dataset=tf_eval_ds)


In [45]:
trainer.train()

***** Running training *****
  Num examples = 20000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 7500


Step,Training Loss
500,0.3622
1000,0.3397
1500,0.3045
2000,0.2968
2500,0.2938
3000,0.1656
3500,0.1859
4000,0.1373
4500,0.1708
5000,0.1573


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1500
Configuration saved in test_trainer/checkpoint-1500/config.json
Model weights saved in test_trainer/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2000
Configuration saved in test_trainer/checkpoint-2000/config.json
Model weights saved in test_trainer/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2500
Configuration saved in test_trainer/checkpoint-2500/config.json
Model weights saved in test_trainer/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-30

TrainOutput(global_step=7500, training_loss=0.18203812128702798, metrics={'train_runtime': 5945.5671, 'train_samples_per_second': 10.092, 'train_steps_per_second': 1.261, 'total_flos': 7948043919360000.0, 'train_loss': 0.18203812128702798, 'epoch': 3.0})

In [46]:
# Save the fine-tuned model's config and weights under ./models.
# exist_ok=True keeps the cell re-runnable once the directory exists.
os.makedirs("models", exist_ok=True)
model.save_pretrained("models")

Configuration saved in models/config.json
Model weights saved in models/pytorch_model.bin


In [49]:
!ls -lh models/pytorch_model.bin.gz

-rw-r--r-- 1 root root 236M Mar 16 20:17 models/pytorch_model.bin.gz
