<a href="https://colab.research.google.com/github/pinballsurgeon/deluxo_adjacency/blob/main/LLM_Alignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Note: Enable CUDA GPUs in runtime settings 

Install

In [3]:
# install transformers
!pip install transformers -q

# install openai
!pip install openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Import

In [94]:
# standard dan-dard
import os
import io
import re
import numpy
import pandas
import string
import seaborn

# pytorch
import torch
torch.set_default_tensor_type(torch.cuda.FloatTensor)

# tensorflow
import tensorflow_hub
import tensorflow as tf
from tensorboard.plugins import projector
from tensorflow.keras.layers import TextVectorization
 
# universal encoder
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
encoder_model = tensorflow_hub.load(module_url)

# set up a logs directory, so Tensorboard knows where to look for files.
log_dir='/logs/embedding_projector/'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# LM transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed

# gpt3 
import openai

# supply openai api key via file 
openai.api_key = open('openai_key').read()

# load tensorboar dextension
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Download

In [97]:
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-1b3", use_cache=True)
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-1b3")

Template

In [51]:
request_sentence = 'What is %s'
regex = re.compile('[^a-zA-Z ]')

Helper functions

In [69]:
### OPENAI - Ask Davinci - build openai completion request, define parameters
def get_openai_response(question):

    # format open au request
    response = openai.Completion.create(
                      engine="text-davinci-001",
                      prompt=question,
                      temperature=.5,
                      max_tokens=250,
                      top_p=1,
                      frequency_penalty=50,
                      presence_penalty=0 )

    # parse and process open ai response
    response_choices = response["choices"]

    # replace blanks
    response = response_choices[0]["text"].strip()
    
    # replace non-alphabet chars
    response = regex.sub('', response)

    return response

In [57]:
### OPENAI - Form Davinci request
def prompt_openai(topic, request_template):
  
  # build sentence and retrieve 
  retrieval = get_openai_response(request_template % topic)
  
  # return response
  return retrieval

In [95]:
### BLOOM - Generate Bloom response model
def get_bloom_response(question):

  response = model.generate(**question
                          , num_beams = 2
                          , num_beam_groups = 2
                          , top_k=1
                          , temperature=0.9
                          , repetition_penalty = 2.0
                          , diversity_penalty=2.0
                          , max_new_tokens = 20)
  
  return response


In [72]:
### BLOOM - Form request
def prompt_bloom(topic, request_template):

  # initiate tokenizer
  input_tokens = tokenizer(request_template % topic, return_tensors="pt").to(0)

  # build sentence and retrieve 
  retrieval = get_bloom_response(input_tokens)

  # decode retrieval
  retrieval = tokenizer.decode(retrieval[0], truncate_before_pattern=[r"\n\n^#", "^'''","\n\n\n"])

  # remove prompt from response
  retrieval = retrieval.replace(request_template % topic, '')

  # replace blanks
  retrieval = retrieval.strip()
  
  # replace non-alphabet chars
  retrieval = regex.sub('', retrieval)

  return retrieval


In [96]:
print(prompt_bloom(topic, request_sentence))

AttributeError: ignored

In [52]:
df = pandas.DataFrame(columns=['Topic','Openai_response'])
topic = 'Science'

openai_response = prompt_openai(topic, request_sentence)


df.loc[len(df.index)] = [topic, openai_response]

In [53]:
print(df['Openai_response'])

0    Science is a systematic way of acquiring knowl...
Name: Openai_response, dtype: object


In [9]:
prompt = f'what is today'

In [10]:
input_ids = tokenizer(prompt, return_tensors="pt").to(0)

In [11]:
sample = model.generate(**input_ids, num_beams = 2, num_beam_groups = 2, top_k=1, temperature=0.9, repetition_penalty = 2.0, diversity_penalty=2.0)

  "Passing `max_length` to BeamSearchScorer is deprecated and has no effect. "


In [12]:
print(tokenizer.decode(sample[0], truncate_before_pattern=[r"\n\n^#", "^'''","\n\n\n"]))

what is today known as the “New York Times”, and it was founded in 1851 by William


In [14]:
lst = ['Science', 'Math', 'Sailboat']

In [12]:
print(prompt_bloom())

In [13]:
# embed sentences
message_embeddings_ = model(df['text'])

# array product (build our 2d universe)
corr = numpy.inner(message_embeddings_, message_embeddings_)

# seaborn config
seaborn.set(font_scale=1.2, rc={'figure.figsize':(20,15)})

# seaborn plot
g = seaborn.heatmap(
    corr,
    xticklabels=df['titles'].str.slice(0,25), # vect paper titles, sliced for nice
    yticklabels=df['titles'].str.slice(0,25), # vect paper titles
    vmin=0,
    vmax=1,
    cmap="YlOrRd")

# rotate and label
g.set_xticklabels(df['titles'].str.slice(0,25), rotation=90)
g.set_title("Similarity")


NameError: ignored