In [2]:
import os

import lucem_illud
import openai
import pandas as pd
from openai import OpenAI

# Constants, Utility Functions, and Data Importing

In [3]:
# Constants and Clients
# Passed as MAX_LEN to the lucem_illud tokenizing/normalizing calls in this notebook.
MAX_CHAR_LEN = 5_000_000
# The key comes from the environment so it is never hard-coded in the notebook.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Single shared client used for every OpenAI request below.
openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [36]:
# Utility Functions


def generate_text_with_style(
    prompt: str,
    model: str = "gpt-3.5-turbo-instruct",
    temperature: float = 0.7,
    max_tokens: int = 150,
) -> str:
    """
    Send a prompt to an OpenAI completions model and return the generated text.

    The rewriting instruction (e.g. "Rewrite the following text in the style
    of ...") is expected to be part of ``prompt``; this function only forwards
    the prompt through the shared ``openai_client``.

    Args:
        prompt: Full text sent to the model (instruction plus source text).
        model: Name of the completions model to query.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Maximum number of tokens the model may generate.

    Returns:
        The text of the first returned choice with surrounding whitespace
        stripped. If the OpenAI request fails, the error message is returned
        instead so the notebook keeps running; callers that need to tell the
        two apart should inspect the result.
    """
    try:
        response = openai_client.completions.create(
            model=model,
            prompt=prompt,
            temperature=temperature,
            max_tokens=max_tokens,
        )
    except openai.OpenAIError as e:
        # Best-effort: report API/auth/network failures as text. Anything that
        # is not an OpenAI error is a programming bug and is allowed to raise.
        return str(e)
    return response.choices[0].text.strip()

In [5]:
# Data Importing
# Since the tokenization process takes so much computational power/time, the
# tokenized frame is cached in a feather file and re-used when this analysis
# is re-run.
legislation_feather_path = "data/fully_tokenized_congress_legislation.feather"
if os.path.isfile(legislation_feather_path):
    congressional_leg_df = pd.read_feather(legislation_feather_path)
else:
    # WARNING: This step takes about 120 minutes, so don't run it unless you need to.
    congressional_leg_df = pd.read_csv("data/congress_legislation_cleaned.csv")

    cleaned_text_column = congressional_leg_df["cleaned_text"]
    # `progress_apply` only exists on a Series after tqdm's pandas integration
    # has been registered (tqdm.pandas()). Nothing in this notebook registers
    # it explicitly, so fall back to plain `apply` rather than failing with
    # AttributeError on a fresh kernel.
    apply_with_progress = getattr(
        cleaned_text_column, "progress_apply", cleaned_text_column.apply
    )
    # Split each document into sentences, then each sentence into word tokens.
    congressional_leg_df["tokenized_sentences"] = apply_with_progress(
        lambda x: [
            lucem_illud.word_tokenize(s, MAX_LEN=MAX_CHAR_LEN)
            for s in lucem_illud.sent_tokenize(x)
        ]
    )
    # Normalize every tokenized sentence (lemmatization disabled).
    congressional_leg_df["normalized_sentences"] = congressional_leg_df[
        "tokenized_sentences"
    ].apply(
        lambda x: [
            lucem_illud.normalizeTokens(s, lemma=False, MAX_LEN=MAX_CHAR_LEN) for s in x
        ]
    )
    # Persist the result so later runs take the fast branch above.
    congressional_leg_df.to_feather(legislation_feather_path)

congressional_leg_df.head()

Unnamed: 0,legislation number,url,congress,title,date proposed,amends amendment,latest summary,congress_num,bill_type,bill_num,...,raw_text,cleaned_text,cleaned_summary,kmeans_predictions,fcluster_predictions,tokenized_text,normalized_tokens,reduced_tokens,tokenized_sentences,normalized_sentences
0,H.R. 2907,https://www.congress.gov/bill/118th-congress/h...,118th Congress (2023-2024),Let Doctors Provide Reproductive Health Care Act,,,<p><strong>Let Doctors Provide Reproductive H...,118,hr,2907,...,\n[Congressional Bills 118th Congress]\n[From ...,Congressional Bills 118th Congress From the U....,Let Doctors Provide Reproductive Health Care A...,29,2,"[time, enforcement, or, protective, considerat...","[publishing, time, enforcement, protective, co...","[enforcement, consideration, specific, affecte...","[[Congressional, Bills, 118th, Congress, From,...","[[congressional, bills, congress, u.s, governm..."
1,S. 1297,https://www.congress.gov/bill/118th-congress/s...,118th Congress (2023-2024),Let Doctors Provide Reproductive Health Care Act,,,<p><strong>Let Doctors Provide Reproductive H...,118,s,1297,...,\n[Congressional Bills 118th Congress]\n[From ...,Congressional Bills 118th Congress From the U....,Let Doctors Provide Reproductive Health Care A...,29,2,"[Murphy, time, Hirono, enforcement, Welch, or,...","[publishing, time, van, enforcement, mrs, prot...","[enforcement, specific, affected, defined, sin...","[[Congressional, Bills, 118th, Congress, From,...","[[congressional, bills, congress, u.s, governm..."
2,H.R. 4901,https://www.congress.gov/bill/118th-congress/h...,118th Congress (2023-2024),Reproductive Health Care Accessibility Act,,,<p><strong>Reproductive Health Care Accessibi...,118,hr,4901,...,\n[Congressional Bills 118th Congress]\n[From ...,Congressional Bills 118th Congress From the U....,Reproductive Health Care Accessibility Act Thi...,7,2,"[experience, g, Community, throughout, partici...","[publishing, experience, g, participating, tim...","[participating, tribes, centers, consideration...","[[Congressional, Bills, 118th, Congress, From,...","[[congressional, bills, congress, u.s, governm..."
3,S. 2544,https://www.congress.gov/bill/118th-congress/s...,118th Congress (2023-2024),Reproductive Health Care Accessibility Act,,,<p><strong>Reproductive Health Care Accessibi...,118,s,2544,...,\n[Congressional Bills 118th Congress]\n[From ...,Congressional Bills 118th Congress From the U....,Reproductive Health Care Accessibility Act Thi...,7,2,"[experience, 2544, g, Community, throughout, p...","[publishing, experience, g, participating, tim...","[participating, tribes, centers, consideration...","[[Congressional, Bills, 118th, Congress, From,...","[[congressional, bills, congress, u.s, governm..."
4,H.R. 4147,https://www.congress.gov/bill/118th-congress/h...,118th Congress (2023-2024),Reproductive Health Care Training Act of 2023,,,,118,hr,4147,...,\n[Congressional Bills 118th Congress]\n[From ...,Congressional Bills 118th Congress From the U....,,0,2,"[g, identification, minority, time, centers, o...","[improve, act, public, pensions, cited, publis...","[improve, establish, number, identification, p...","[[Congressional, Bills, 118th, Congress, From,...","[[congressional, bills, congress, u.s, governm..."


## <font color="red">*Exercise 1*</font>

<font color="red">As this week's challenge question asks, we'd like you to 
think about how an LLM can help your final project. Try to use the OpenAI API to analyze 
a small dataset (remember to monitor API use on your OpenAI account!). The 
data could be a sample from the dataset you prepared for the final project or some 
other data. If it's going to be a conventional task like classification, compare and 
see how it beats (or is beaten by) other algorithms you've learned in 
previous weeks. If it's a special task for which you cannot find a learned algorithm 
to compare with, evaluate its performance on your own and see if you can improve it 
by changing hyperparameters (see [here](https://platform.openai.com/docs/api-reference/chat/create)), the prompt, etc.

## <font color="red">*Exercise 2*</font>

<font color="red">Fine-tune an LLM. You can either use the model (llama-2-7b) in 
the example code or find another open-source LLM. You may use datasets provided 
by HuggingFace or a dataset you collect from somewhere else (for your final project). 
If the task happens to be the same as in exercise 1, you can choose to compare the 
performance between the OpenAI LLM and your fine-tuned LLM. You can also choose to 
compare the performance between the vanilla and the fine-tuned LLM.

## <font color="red">*Exercise 3*</font>
<font color="red">Use an LLM to generate some data and compare the differences between model-generated 
data and actual data. This exercise should not be a repetition of exercise 1. You should 
focus more on analyzing language nuances, qualitatively or quantitatively. You should also 
notice how the choice of LLM has possibly impacted the language it uses.

## <font color="red">*Exercise 4*</font>

<font color="red">Compare how LLMs change their performance with different 
numbers of shots (in-prompt examples) on your task. If the evaluation criterion is quantifiable, such as 
classification with ground truth labels, plot and show how accuracy changes. 
If the evaluation criterion cannot be easily quantified, such as the clarity 
of explaining a concept, use your imagination to do some comparison (for 
example, you can ask another LLM to rate its peer :)). If you find closed-source 
LLM APIs pricey and are unsatisfied with responses from small open-source 
LLMs, you can try large LLMs (such as the 70B version of Llama-2) with Petals 
(see [here](https://colab.research.google.com/drive/1uCphNY7gfAUkdDrTx21dZZwCOUDCMPw8?usp=sharing) and [here](https://colab.research.google.com/drive/1Ervk6HPNS6AYVr3xVdQnY5a-TjjmLCdQ)).

## <font color="red">*Exercise 5*</font>

<font color="red">Use the Actor-Critic method to improve an LLM's performance 
on your task, or run some experiments on language style learning (for example, 
you can investigate how LLMs perceive different groups of people would write 
their dating profiles. This may serve as an opportunity to explore how LLMs 
semantically embed social groups and assess their appropriateness).

I am going to have the LLM rewrite a short dating-app message about my interests,
varying the school I graduated from (UT Austin, UNT, and UChicago) and the
writer's age (25, 30, and 35), to see how the generated style changes.

In [37]:
# University of Texas at Austin
ut_base_prompt = "I graduated from the University of Texas at Austin and in my free time I like to run, read science fiction, explore museums, and visit cocktail bars."

# One prompt per persona age; only the age in the instruction differs.
ut_prompt_25, ut_prompt_30, ut_prompt_35 = (
    f"Rewrite the following text in the style of a {age} year old male:\n\n{ut_base_prompt}"
    for age in (25, 30, 35)
)
print(generate_text_with_style(prompt=ut_prompt_25))

So, I'm a UT Austin grad and when I'm not hustling, I'm all about that runner's high, geeking out over sci-fi, checking out dope museums, and hitting up swanky cocktail spots. #balance #livingmybestlife


In [38]:
# UT Austin bio rewritten for the 30-year-old persona.
print(generate_text_with_style(prompt=ut_prompt_30))

I'm a proud alumni of the University of Texas at Austin and when I'm not hustling at work, you can catch me hitting the pavement for a run, diving into some sci-fi literature, checking out the latest exhibits at museums, or sipping on some expertly-crafted cocktails at the bar. #LivingMyBestLife


In [39]:
# UT Austin bio rewritten for the 35-year-old persona.
print(generate_text_with_style(prompt=ut_prompt_35))

I earned my degree from the prestigious University of Texas at Austin. When I'm not busy with work, you can find me hitting the pavement for a good run, indulging in some thought-provoking science fiction literature, immersing myself in the wonders of museums, or sipping on some top-notch cocktails at a swanky bar. Cheers to a well-rounded life, my friend.


In [40]:
# University of North Texas
unt_base_prompt = "I graduated from the University of North Texas and in my free time I like to run, read science fiction, explore museums, and visit cocktail bars."

# One prompt per persona age; only the age in the instruction differs.
unt_prompt_25, unt_prompt_30, unt_prompt_35 = (
    f"Rewrite the following text in the style of a {age} year old male:\n\n{unt_base_prompt}"
    for age in (25, 30, 35)
)
print(generate_text_with_style(prompt=unt_prompt_25))

I'm a proud UNT alum, bro. When I'm not grinding at work, you can catch me hitting the pavement, geeking out over some sci-fi, checking out dope exhibits at museums, and sipping on some killer cocktails at the bar. #livinmybestlife


In [41]:
# UNT bio rewritten for the 30-year-old persona.
print(generate_text_with_style(prompt=unt_prompt_30))

After completing my studies at the esteemed University of North Texas, I enjoy utilizing my free time to engage in a variety of activities. Running, indulging in science fiction literature, discovering new and intriguing museums, and frequenting classy cocktail bars are just a few of my preferred pastimes.


In [42]:
# UNT bio rewritten for the 35-year-old persona.
print(generate_text_with_style(prompt=unt_prompt_35))

I'm an alumni of the prestigious University of North Texas, a fine institution indeed. When I'm not busy with work, I enjoy partaking in some leisurely activities such as a nice run or indulging in some thought-provoking science fiction literature. I also have a keen interest in exploring museums and frequenting classy cocktail bars. A man of many interests, you could say.


In [43]:
# University of Chicago
uc_base_prompt = "I graduated from the University of Chicago and in my free time I like to run, read science fiction, explore museums, and visit cocktail bars."

# One prompt per persona age; only the age in the instruction differs.
uc_prompt_25, uc_prompt_30, uc_prompt_35 = (
    f"Rewrite the following text in the style of a {age} year old male:\n\n{uc_base_prompt}"
    for age in (25, 30, 35)
)
print(generate_text_with_style(prompt=uc_prompt_25))

Yo, I just graduated from UChicago and when I'm not grinding in the real world, I'm all about that #fitlife with some sick runs. But when I'm not sweating it out, you can catch me geeking out over some sci-fi reads, checking out dope museums, or sipping on some lit cocktails at the bars. #livinmybestlife


In [44]:
# UChicago bio rewritten for the 30-year-old persona.
print(generate_text_with_style(prompt=uc_prompt_30))

I completed my studies at the prestigious University of Chicago, and when I'm not busy with work, I enjoy hitting the pavement for a good run, immersing myself in some mind-bending science fiction, checking out the latest exhibits at museums, and hitting up some killer cocktail bars.


In [45]:
# UChicago bio rewritten for the 35-year-old persona.
print(generate_text_with_style(prompt=uc_prompt_35))

After receiving my degree from the prestigious University of Chicago, I have developed a keen interest in running, indulging in science fiction literature, immersing myself in the wonders of museums, and enjoying the occasional libation at cocktail bars.
