In [1]:
!pip install spacy
!pip install torch
!pip install transformers
!python -m spacy download en_core_web_sm

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [2]:
import spacy
import torch
from transformers import TransfoXLTokenizer, TransfoXLModel

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

# spaCy Tokenization and Padding Function
def spacy_tokenize_and_pad(sentences, n):
    tokenized_sentences = []
    for sentence in sentences:
        doc = nlp(sentence)
        tokens = [token.text for token in doc]
        token_ids = [vocab.get(token, 0) for token in tokens]
        if len(token_ids) < n:
            token_ids += [0] * (n - len(token_ids))  # Padding with 0
        tokenized_sentences.append(token_ids[:n])
    return tokenized_sentences

def spacy_decode_tokens(token_ids, vocab):
    reverse_vocab = {v: k for k, v in vocab.items()}
    tokens = [reverse_vocab.get(token_id, "") for token_id in token_ids]
    return tokens

# Example sentences
sentences = [
    "As the aircraft becomes lighter, it flies higher in air of lower density to maintain the same airspeed.",
    "When the engine heats up, it operates more efficiently, consuming less fuel to maintain speed."
]

# Create a simple vocabulary based on the unique tokens in the sentences
vocab = {word: idx + 1 for idx, word in enumerate(set(" ".join(sentences).split()))}

n = 20  # Desired length of each sentence after padding

# Tokenization
spacy_tokenized_sentences = spacy_tokenize_and_pad(sentences, n)
for ts in spacy_tokenized_sentences:
    print(ts)
    print(spacy_decode_tokens(ts, vocab))
    print("\n")


[14, 6, 4, 12, 0, 0, 1, 7, 18, 11, 22, 10, 23, 8, 13, 19, 6, 28, 0, 0]
['As', 'the', 'aircraft', 'becomes', '', '', 'it', 'flies', 'higher', 'in', 'air', 'of', 'lower', 'density', 'to', 'maintain', 'the', 'same', '', '']


[21, 6, 24, 15, 0, 0, 1, 3, 17, 0, 0, 16, 5, 9, 13, 19, 0, 0, 0, 0]
['When', 'the', 'engine', 'heats', '', '', 'it', 'operates', 'more', '', '', 'consuming', 'less', 'fuel', 'to', 'maintain', '', '', '', '']




In [7]:
from transformers import GPT2Tokenizer, GPT2Model
import torch

# Load pre-trained GPT-2 tokenizer and model
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add a padding token to the tokenizer
gpt2_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load the model and resize its token embeddings to include the new special tokens
gpt2_model = GPT2Model.from_pretrained('gpt2')
gpt2_model.resize_token_embeddings(len(gpt2_tokenizer))

# GPT-2 Tokenization and Processing
def gpt2_tokenize_and_process(sentences):
    inputs = gpt2_tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = gpt2_model(**inputs)
    return outputs

# Example sentences processed with GPT-2
sentences = ["This is an example sentence.", "Here is another one."]
gpt2_outputs = gpt2_tokenize_and_process(sentences)
print(gpt2_outputs)


BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[ 5.2970e-02, -1.3685e-02, -2.3932e-01,  ..., -1.2450e-01,
          -1.1159e-01,  2.2529e-02],
         [ 2.4703e-01,  2.2600e-01,  3.9671e-02,  ...,  2.4134e-01,
           4.3486e-01,  1.7679e-01],
         [ 7.4826e-01, -4.0515e-01, -9.3823e-01,  ...,  3.6463e-01,
          -2.8712e-02,  3.7218e-01],
         [ 1.9904e-01, -3.6951e-01, -1.8210e+00,  ..., -1.7723e-01,
           9.3090e-03,  1.6472e-01],
         [ 7.0422e-02, -5.3654e-02, -2.5189e+00,  ...,  5.8238e-02,
          -1.2165e-01, -3.8434e-01],
         [ 1.9333e-01, -2.1837e-01, -4.5858e-01,  ...,  7.2579e-02,
           6.4142e-03, -3.8246e-01]],

        [[-2.5430e-03,  6.0642e-02, -3.0736e-01,  ..., -1.1276e-01,
          -1.0261e-01, -2.1344e-02],
         [-8.4581e-03,  1.2329e-01,  1.2254e-02,  ...,  2.8963e-01,
           6.7839e-01,  2.0188e-01],
         [ 4.3381e-01, -4.5221e-01, -6.8891e-01,  ...,  7.6885e-04,
           7.8252e-03,  2.6686