#### **Import Libraries**

In [1]:
import anthropic
import os 
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from tqdm import tqdm
import numpy as np 
import random 
import math
from itertools import chain
from IPython.display import display, Markdown
import textwrap
import tiktoken
import csv
import time 
import pandas as pd 
from tqdm import tqdm 
from trics.nlp.utils import to_markdown, create_csv_with_headers

#### **Setup Folder**

In [2]:
# Dataset version; it is embedded in the CSV file name below.
version = 4
# Folder (relative to this notebook) that holds the experiment's data files.
experiment_folder = './../../../toy-data/exp2/'
# Path of the versioned CSV: the folder string already ends with '/', so the
# file name is appended directly.
data_csv = f'{experiment_folder}data_{version}.csv'
# Optional sanity check once the CSV has been populated:
# df = pd.read_csv(data_csv)
# df.shape

#### **Plotting**

In [3]:
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib import rcParams
rcParams['image.interpolation'] = 'nearest'
rcParams['image.cmap'] = 'viridis'
rcParams['axes.grid'] = False
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
plt.style.use('seaborn-v0_8-dark-palette')

from matplotlib import font_manager 
locations = './../../../styles/Newsreader'
font_files = font_manager.findSystemFonts(fontpaths=locations)
print(locations)
print(font_files[0])
for f in font_files: 
    font_manager.fontManager.addfont(f)
plt.rcParams["font.family"] = "Newsreader"

./../../../styles/Newsreader
/home/ubuntu/llmft/styles/Newsreader/Newsreader-Italic-VariableFont_opsz,wght.ttf


#### **LLM Setup**

In [4]:
# Shared API client used for every completion request in this notebook.
# No credentials are passed explicitly here.
# NOTE(review): the SDK is presumably reading the API key from the environment — confirm.
client = anthropic.Anthropic()

#### **Define File**

In [5]:
# `run` records whether the CSV had to be created in this session (True) or
# was already present on disk (False).
data_already_present = os.path.exists(data_csv)
if not data_already_present:
    # Start a fresh file that contains only the column headers.
    create_csv_with_headers(data_csv, ['Var0', 'Var1', 'Var2', 'Var3', 'Var4', 'Description'])
else:
    print(f'Data for {version} already exists')
run = not data_already_present

File not found. Creating new file with headers: ['Var0', 'Var1', 'Var2', 'Var3', 'Var4', 'Description']


In [6]:
# Categorical levels used to describe each synthetic tenant. Other cells select
# entries from these lists by integer index, so the ORDER of entries matters.
age_groups = [
    "mid-20s", "late-20s",
    "early-30s", "mid-30s", "late-30s",
    "early-40s", "mid-40s", "late-40s",
    "early-50s", "mid-50s", "late-50s",
]
living_situations = [
    "small apartment complex",
    "large apartment complex",
    "three-story duplex",
    "two-story duplex",
]

# Health descriptions, split into two groups.
# NOTE(review): "covered" presumably refers to eligibility under some policy — confirm.
conditions_not_covered = [
    "Good Health",
    "low stamina",
    "allergies",  # spelling fixed (was "alergies"); this text is inserted verbatim into the prompt
]

conditions_covered = [
    "Mobility and Physical Disabilities",
    "Chronic and Severe Illnesses",
    "Developmental Disabilities",
    "Severe Cognitive and Learning Disabilities",
    "Psychiatric and Severe Mental Health Disorders",
]

# Combined list: the "not covered" entries come first, followed by the "covered" ones.
conditions = conditions_not_covered + conditions_covered

# Index 0 -> no voucher text, index 1 -> voucher phrase.
voucher_status = ['', 'with a housing voucher']

#### **Functions**

In [7]:
def anthropic_completion(input, model="claude-3-haiku-20240307", max_tokens=500,
                         temperature=1., system="You are a housing court lawyer"):
    """Send a single user message to the Anthropic Messages API and return the reply text.

    Args:
        input: Prompt text sent as the only user message.
        model: Model identifier passed to the API.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to the API.
        system: System prompt that frames the assistant's role.

    Returns:
        The text of the first content block of the response.
    """
    message = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        system=system,
        messages=[
            {"role": "user", "content": input}
        ],
    )
    return message.content[0].text


def get_promptv3(i, x):
    """Build the prompt asking for a one-paragraph description of a tenant.

    Args:
        i: Integer used to seed Python's global `random` module; it is also
            written into the first line of the prompt ("random seed: <i>").
        x: Indexable sequence of five values:
            x[0] index into `age_groups`,
            x[1] index into `living_situations`,
            x[2] amount of rent owed (formatted without decimals),
            x[3] index into `conditions`,
            x[4] index into `voucher_status`.

    Returns:
        The formatted prompt string, ending with the opening words of the
        description the model is expected to continue.
    """
    # Kept from the original implementation: reseeds the global RNG on every call.
    # No random draw happens inside this function.
    random.seed(i)

    age_group = age_groups[x[0]]
    living_situation = living_situations[x[1]]
    rent = x[2]
    health = conditions[x[3]]
    voucher = voucher_status[x[4]]

    # Only prepend a space when there is voucher text; otherwise the sentence
    # would end in "<living situation> ." with a stray space before the period.
    voucher_clause = f" {voucher}" if voucher else ""

    return f"""random seed: {i}
    Task: Write a paragraph description (and only a description) of a tenant in their {age_group} who is currently behind on rent for a total of ${rent:.0f}. 
    Mention that they have {health}, live in a {living_situation}{voucher_clause}.
    
    Description: The tenant is a """


#### **Create Variables**

In [8]:
# Number of synthetic tenants (rows) to generate
n = 12_000 

# Number of entries in `conditions` (not-covered + covered)
n_disabilities = len(conditions)

# Sampling weight for each entry of `conditions`, in list order: the first three
# entries share 0.5 equally and the remaining five get 0.1 each (total 1.0).
# The list length must equal n_disabilities.
probabilities = [0.5/3, 0.5/3, 0.5/3, .1, .1, .1, .1, .1]

# Fix the NumPy global seed; the draws below depend on this seed AND on the
# order in which they are made, so do not reorder them.
np.random.seed(2)

# var0: index into age_groups, drawn uniformly
# var1: index into living_situations, drawn uniformly
# var2: integer in 650..1499 drawn uniformly (range() excludes 1500); used as the rent owed
# var3: index into conditions, drawn with the weights in `probabilities`
# var4: 0/1 indicator equal to 1 with probability 0.2; index into voucher_status
var0 = np.random.choice(range(len(age_groups)), size=n).astype(int)
var1 = np.random.choice(range(len(living_situations)), size=n).astype(int)
var2 = np.random.choice(range(650, 1500), size=n).astype(int)
var3 = np.random.choice(range(n_disabilities), size=n, p=probabilities)
var4 = np.random.binomial(n=1, p=0.2, size=n).astype(int)
# One row per tenant, one column per variable (var0..var4)
data_matrix = np.column_stack((var0, var1, var2, var3, var4,))
print(data_matrix.shape)

(12000, 5)


In [9]:
# Generate one description per row of data_matrix and append it to the CSV.
# The loop is guarded by `run` (set when the CSV was created): re-executing the
# notebook on an existing file must not re-query the API or append duplicate rows.
if run:
    # Open the file once for the whole run; UTF-8 is explicit because the
    # generated text may contain non-ASCII characters.
    with open(data_csv, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        for i in tqdm(range(0, n)):
            text = anthropic_completion(get_promptv3(i, data_matrix[i]))
            row = list(data_matrix[i]) + [text]
            writer.writerow(row)
            # Flush after every row so an interruption loses at most the row in flight.
            file.flush()
            # Short pause between consecutive API requests.
            time.sleep(0.1)
else:
    print(f'Skipping generation: {data_csv} already exists')

 98%|█████████▊| 11710/12000 [4:31:51<05:31,  1.14s/it]  

In [None]:
# Preview the most recent completion held in `text`. This requires the
# generation loop to have produced at least one completion in this kernel session.
to_markdown(text)