In [1]:
# Get Gigaword dataset

from huggingface_hub import hf_hub_download

hf_hub_download(repo_id='Harvard/gigaword', filename='data/ggw_data.zip', repo_type='dataset')

  from .autonotebook import tqdm as notebook_tqdm


'/home/ijackson/.cache/huggingface/hub/datasets--Harvard--gigaword/snapshots/e45e01b2da13842bb3df1b12dc046910147b3d82/data/ggw_data.zip'

In [2]:
# Load the dataset
from gigaword import Gigaword
gw = Gigaword()
r = gw.download_and_prepare(output_dir='./output_dir')

In [1]:
import pyarrow as pa

train1 = pa.ipc.RecordBatchStreamReader('./output_dir/gigaword-train-00000-of-00002.arrow').read_all()
train2 = pa.ipc.RecordBatchStreamReader('./output_dir/gigaword-train-00001-of-00002.arrow').read_all()
train = pa.concat_tables([train1, train2])
test = pa.ipc.RecordBatchStreamReader('./output_dir/gigaword-test.arrow').read_all()
val = pa.ipc.RecordBatchStreamReader('./output_dir/gigaword-validation.arrow').read_all()

# split into input and target

train_input = [str(input_) for input_ in train['document']]
train_target = [str(target) for target in train['summary']]
test_input = [str(input_) for input_ in test['document']]
test_target = [str(target) for target in test['summary']]
val_input = [str(input_) for input_ in val['document']]
val_target = [str(target) for target in val['summary']]


In [2]:
# make a small subset of the data
numTrain = 1000
numTest = 100
train_input = train_input[:numTrain]
train_target = train_target[:numTrain]
test_input = test_input[:numTest]
test_target = test_target[:numTest]

In [3]:
# set up encodings
train_input_encodings = []
train_target_encodings = []
test_input_encodings = []
test_target_encodings = []

In [None]:
# use BART to encode the training texts
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

# Initialize the BART model and tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')


for i in range(len(train_input)):
    x_inputs = tokenizer(train_input[i], return_tensors='pt')
    y_inputs = tokenizer(train_target[i], return_tensors='pt')
    encoder_outputs_x = model.get_encoder()(**x_inputs)
    encoder_outputs_y = model.get_encoder()(**y_inputs)
    train_input_encodings.append(encoder_outputs_x.last_hidden_state)
    train_target_encodings.append(encoder_outputs_y.last_hidden_state)

In [7]:
# same for test texts
for i in range(len(test_input)):
    x_inputs = tokenizer(test_input[i], return_tensors='pt')
    y_inputs = tokenizer(test_target[i], return_tensors='pt')
    encoder_outputs_x = model.get_encoder()(**x_inputs)
    encoder_outputs_y = model.get_encoder()(**y_inputs)
    test_input_encodings.append(encoder_outputs_x.last_hidden_state)
    test_target_encodings.append(encoder_outputs_y.last_hidden_state)

In [8]:
# create reservoir computing model and feed the encodings as a time series
import reservoirpy as rpy
import numpy as np

rpy.verbosity(0)
rpy.set_seed(42)
nNeurons = 500
learning_rate = 0.5
spectral_radius = 0.9
reservoir = rpy.nodes.Reservoir(nNeurons, lr=learning_rate, sr=spectral_radius)
res_states = []
# get reservoir states for each brain state
reset = True
for token_embedding in train_input_encodings:
   token_embedding = token_embedding.detach().numpy().squeeze()
   timeseries_states = reservoir.run(token_embedding, reset=reset)
   res_states.append(timeseries_states[-1]) # take the last state

res_states_training = np.array(res_states)
res_states_training.shape

(1000, 500)

In [9]:
# need target states to be the same shape so we'll pad/truncate them as needed
def pad_or_truncate(sequence, max_length=12): # from chatgpt
    if len(sequence) < max_length:
        # Pad with zeros if shorter
        return np.pad(sequence, ((0, max_length - len(sequence)), (0, 0)), mode='constant')
    else:
        # Truncate if longer
        return sequence[:max_length]

In [10]:
# make the target encodings are the same shape

train_target_encodings = np.array([pad_or_truncate(state.detach().numpy().squeeze()) for state in train_target_encodings])
test_target_encodings = np.array([pad_or_truncate(state.detach().numpy().squeeze()) for state in test_target_encodings])

train_target_encodings.shape

(1000, 12, 1024)

In [11]:
# flatten the encodings too this is probably a bad idea aha but we'll see
train_target_encodings = train_target_encodings.reshape((train_target_encodings.shape[0], -1))
test_target_encodings = test_target_encodings.reshape((test_target_encodings.shape[0], -1))

In [12]:
# train the readout on the training data
#train_target_encodings = [target.detach().numpy().squeeze() for target in train_target_encodings]
readout = rpy.nodes.Ridge(ridge=1e-7)
readout = readout.fit(res_states_training, train_target_encodings, warmup=10)

In [57]:
train_target_encodings[0].shape

(12288,)

In [13]:
res_states = []
# get reservoir states for test states
reset = True
for token_embedding in test_input_encodings:
   token_embedding = token_embedding.detach().numpy().squeeze()
   timeseries_states = reservoir.run(token_embedding, reset=reset)
   res_states.append(timeseries_states[-1]) # take the last state

res_states_test = np.array(res_states)
res_states_test.shape

(100, 500)

In [65]:
# get the readout predictions
predictions = readout.run(res_states_test)
predictions.shape

(100, 12288)

In [66]:
# build base model output object from predictions
from transformers.modeling_outputs import BaseModelOutput

def build_base_model_output_from_predictions(prediction):
    # first reshape the prediction to be the same shape as the hidden states
    prediction = prediction.reshape((12, 1024))
    # need to make it a batch size of 64 or else decoder gets mad
    prediction = np.array([prediction])
    # make it a float
    prediction = prediction.astype('float32')
    # now build the batch encoding object
    bmo = BaseModelOutput(last_hidden_state=torch.tensor(prediction))
    # attention mask should be 1s up to max_length and 0s after
    # batch_encoding['attention_mask'] = torch.tensor([[1]*12 + [0]*38]*prediction.shape[0])
    return bmo

In [67]:
base_model_predictions = [build_base_model_output_from_predictions(prediction) for prediction in predictions]

In [62]:
base_model_predictions[0].last_hidden_state.shape

torch.Size([12, 1024])

In [145]:
len(base_model_predictions)

200

In [63]:
decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]])


In [72]:
# now we can decode the predictions as text
# decode a small range
# decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]])

decoded_texts = []
num_texts = 10
start = np.random.randint(0, len(base_model_predictions) - num_texts)
range = [start, start + 10]

for prediction in base_model_predictions[range[0]:range[1]]:
    # Generate text from the embeddings
    try:
        generated_ids = model.generate(decoder_input_ids=decoder_input_ids, encoder_outputs=prediction, max_length=50)

    except Exception as e:
        print(prediction.last_hidden_state.shape)
        print(e)
        break
    # Decode the generated ids to text
    decoded_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    decoded_texts.append(decoded_text)


In [73]:
decoded_texts

['U.YI/U.S.air, TUI, NU and N0G go in to one-U.N. for one in one, to one in three.T.I/OSN,',
 ' l k l d dia l k ke. le d d k l kto le d k k',
 '',
 'Finance Minister of the Bank of Japan (BoT) says it will bring into an aero-mono plan of an omer-Tobel at the end of the year. e.t.m.e in',
 'fopan gaiopan on the prowl for the UN, s.s.aopan for the United States of S.E.A.D. president of the Republic of the D.S.C.']

In [74]:
test_target[range[0]:range[1]]

['mittal launches hostile arcelor bid in us',
 'time not ripe yet for indian mangoes to hit us',
 '#.# billion tv viewers expected for opening world cup match',
 'rumsfeld calls zarqawi death significant victory',
 'french farm offers hope for endangered asian crocs UNK picture']