#### **Import Libraries**

In [10]:
import anthropic
import os 
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from tqdm import tqdm
import numpy as np 
import random 
import math
from itertools import chain
from IPython.display import display, Markdown
import textwrap
import tiktoken
import csv
import time 
import pandas as pd 
from tqdm import tqdm 
from trics.nlp.utils import to_markdown, create_csv_with_headers

#### **Setup Folder**

In [11]:
# Experiment configuration: which dataset version to use and where its CSV lives.
version = 5
experiment_folder = './../../../toy-data/exp2/'
data_csv = f'{experiment_folder}data_{version}.csv'

# Load whatever has been generated so far; the trailing expression shows
# (rows, columns) as the cell output.
df = pd.read_csv(data_csv)
df.shape

(13996, 9)

#### **Plotting**

In [3]:
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib import rcParams
rcParams['image.interpolation'] = 'nearest'
rcParams['image.cmap'] = 'viridis'
rcParams['axes.grid'] = False
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
plt.style.use('seaborn-v0_8-dark-palette')

from matplotlib import font_manager 
locations = './../../../styles/Newsreader'
font_files = font_manager.findSystemFonts(fontpaths=locations)
print(locations)
print(font_files[0])
for f in font_files: 
    font_manager.fontManager.addfont(f)
plt.rcParams["font.family"] = "Newsreader"

./../../../styles/Newsreader
/home/ubuntu/llmft/styles/Newsreader/static/Newsreader_24pt/Newsreader_24pt-Italic.ttf


#### **LLM Set Up**

#### **Define File**

In [5]:
# `run` is True only when the CSV had to be created in this cell, i.e. no data
# file existed yet for this version.
run = not os.path.exists(data_csv)
if run:
    create_csv_with_headers(
        data_csv,
        ['Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7', 'Var8', 'Description'],
    )
else:
    print(f'Data for {version} already exists')

Data for 5 already exists


In [6]:
# Vocabularies used by get_promptv3 to turn a numeric feature row `x` into prompt text.
# The strings are inserted into prompts verbatim, so editing any of them (including
# spelling) changes the prompts sent to the model.

# Indexed by x[0].
age_groups = ["mid-20s", "late-20s", "early-30s", "mid-30s", "late-30s", "early-40s", "mid-40s", "late-40s", "early-50s", "mid-50s", "late-50s"]
# Indexed by x[1].
living_situations = ["small apartment complex", "large apartment complex", "three-story duplex", "two-story duplex"]
# Indexed by x[2].
pets_options = ["a dog", "a cat", "no pets", "a bird", "a fish tank"]
# Indexed by x[4]: 0 -> "good health", 1 -> "poor health".
health_status = ["good health", "poor health"]

# Not tied to a feature column: get_promptv3 picks one phrase at random per prompt.
overdue_phrases = ["behind on rent for", "owing rent amounting to", "late on rent for", "struggling to pay rent for"]
# Not tied to a feature column: one sentence is picked at random per prompt and
# placed at the end of the task instruction.
additional_details = [
        "the tenant enjoys quiet evenings at home.",
        "the tenant frequently interacts with neighbors.",
        "the tenant has a regular job in the city.",
        "the tenant occasionally works from home.",
        "the tenant likes to cook on weekends.",
        "the tenant has a small garden on the balcony.",
        "the tenant is an avid reader and has a collection of books.",
        "the tenant enjoys jogging in the nearby park.",
        "the tenant has recently taken up painting as a hobby.",
        "the tenant volunteers at a local shelter on weekends.",
        "the tenant loves to bake and often shares treats with neighbors.",
        "the tenant has a passion for photography and takes pictures around the city.",
        "the tenant plays the guitar in their spare time.",
        "the tenant participates in a local book club.",
        "the tenant practices yoga every morning.",
        "the tenant is learning to play the piano.",
        "the tenant loves to travel and plans trips frequently.",
        "the tenant enjoys watching movies on weekends.",
        "the tenant is a member of a local sports team.",
        "the tenant loves to host dinner parties for friends.",
        "the tenant is studying for an advanced degree online.",
        "the tenant enjoys gardening in a community garden.",
        "the tenant is a fan of board games and has a collection at home.",
        "the tenant frequently visits local museums.",
        "the tenant enjoys knitting and crafts in their spare time.",
        "the tenant has a side business selling handmade items.",
        "the tenant enjoys hiking and exploring nature trails.",
        "the tenant volunteers at a local animal shelter.",
        "the tenant is a coffee enthusiast and loves visiting new cafes.",
        "the tenant enjoys going to live music concerts.",
        "the tenant participates in local charity events.",
        "the tenant loves to explore different cuisines and tries new recipes.",
        "the tenant has a keen interest in astronomy and stargazing.",
        "the tenant enjoys writing short stories in their free time.",
        "the tenant is learning a new language.",
        "the tenant loves attending theater performances.",
        "the tenant is part of a local dance class.",
        "the tenant enjoys playing video games.",
        "the tenant is a foodie and enjoys dining out at new restaurants.",
        "the tenant likes to go fishing on weekends.",
        "the tenant enjoys cycling around the city.",
        "the tenant has a small art studio at home.",
        "the tenant likes to play chess with friends.",
        "the tenant participates in local marathons.",
        "the tenant enjoys taking part in cooking classes.",
        "the tenant loves to decorate their apartment for different holidays.",
        "the tenant frequently attends local farmers' markets.",
        "the tenant enjoys exploring local history and landmarks.",
        "the tenant has a blog where they share their daily experiences."
    ]
# Indexed by x[6]: who the tenant lives with. (Spelling "roomate" is kept as-is
# because the string is part of the prompt text.)
roomate_statuses = ['roomate', 'family']
# Indexed by x[7]: whether that person contributes to the rent.
contribute_statuses = ["doesn't contribute", "contributes"]   

#### **Functions**

In [7]:
def anthropic_completion(input):
    """Send one prompt to Claude 3 Haiku and return the generated text.

    The request uses a fixed system prompt ("You are a housing court lawyer"),
    temperature 1.0 and a 500-token output cap, so repeated calls with the same
    prompt are expected to produce different text.

    The notebook-level ``client`` is used when one already exists. If none has
    been defined, an ``anthropic.Anthropic()`` client is created on first call
    and stored as the global ``client``; with no arguments the SDK takes the
    API key from the ``ANTHROPIC_API_KEY`` environment variable.

    Args:
        input: Prompt text, sent as the single user message.

    Returns:
        The ``text`` attribute of the first content block of the response.
    """
    global client
    try:
        client
    except NameError:
        # No visible cell defines `client`, so on a fresh kernel the call below
        # would otherwise fail with NameError. Build it lazily and reuse it.
        client = anthropic.Anthropic()

    message = client.messages.create(
        model="claude-3-haiku-20240307",
        max_tokens=500,
        temperature=1.,
        system="You are a housing court lawyer",
        messages=[
            {"role": "user", "content": input}
        ],
    )
    return message.content[0].text


def get_promptv3(i, x):
    """Build the tenant-description prompt for one feature row.

    Categorical features are translated to text through the module-level
    vocabularies; an overdue-rent phrase and one extra detail sentence are
    picked pseudo-randomly. The picks come from a private generator seeded
    with ``i``, so they are reproducible per row and the process-wide
    ``random`` state is left untouched.

    Args:
        i: Seed for the random picks; also written into the prompt's first line.
        x: Indexable feature row with at least 8 entries:
            x[0] index into ``age_groups``
            x[1] index into ``living_situations``
            x[2] index into ``pets_options``
            x[3] overdue rent amount, formatted as whole dollars
            x[4] index into ``health_status``
            x[5] number of months the tenant has lived there
            x[6] index into ``roomate_statuses``
            x[7] index into ``contribute_statuses``

    Returns:
        The prompt string, ending with the opening words of the description
        ("The tenant is a ") for the model to continue.
    """
    # Local generator: same sequence as random.seed(i) followed by random.choice,
    # without reseeding the global RNG on every call.
    rng = random.Random(i)

    # Map categorical indices to their text.
    age_group = age_groups[x[0]]
    living_situation = living_situations[x[1]]
    pets = pets_options[x[2]]
    health = health_status[x[4]]
    months = x[5]
    roomate_status = roomate_statuses[x[6]]
    contribute_status = contribute_statuses[x[7]]

    # Wording noise; draw order (phrase first, then detail) determines the picks.
    overdue_phrase = rng.choice(overdue_phrases)
    additional_detail = rng.choice(additional_details)

    return f"""random seed: {i}
    Task: Write a paragraph description of a tenant in their {age_group} who is currently {overdue_phrase} ${x[3]:.0f}. 
    Mention that they are in relatively {health}, live in a {living_situation}, have been living there for {months} months, and have {pets}. 
    Include some details about their {roomate_status} who {contribute_status} to the rent. Also mention somewhere that {additional_detail}
    
    Description: The tenant is a """


#### **Create Variables**

In [8]:
# Number of synthetic tenants (rows) to generate.
n = 50_000 

# Seed NumPy's global RNG so the feature matrix is reproducible.
# The draws below must stay in this order: reordering them changes every column.
np.random.seed(2)

# One column per feature, in the order get_promptv3 reads them (x[0]..x[7]).
# Every np.random.choice call here samples uniformly with replacement.
var1 = np.random.choice(range(len(age_groups)), size=n).astype(int)         # x[0]: index into age_groups
var2 = np.random.choice(range(len(living_situations)), size=n).astype(int)  # x[1]: index into living_situations
var3 = np.random.choice(range(len(pets_options)), size=n).astype(int)       # x[2]: index into pets_options
var4 = np.random.choice(range(650, 1500), size=n).astype(int)               # x[3]: overdue rent in dollars, 650..1499
# x[4]: index into health_status; Bernoulli with P(1) = 0.88, and 1 maps to "poor health".
# NOTE(review): this makes 88% of tenants "poor health" -- confirm that is the intended direction.
var5 = np.random.binomial(n=1, p=0.88, size=n).astype(int)
var6 = np.random.choice(range(6, 54), size=n).astype(int)                   # x[5]: months at the address, 6..53
var7 = np.random.choice(range(2), size=n).astype(int)                       # x[6]: index into roomate_statuses
var8 = np.random.choice(range(2), size=n).astype(int)                       # x[7]: index into contribute_statuses
# Rows are tenants, columns are the eight features above.
data_matrix = np.column_stack((var1, var2, var3, var4, var5, var6, var7, var8))
print(data_matrix.shape)

(50000, 8)


In [13]:
# Resume where the CSV left off instead of using a hard-coded start index: a
# fixed number goes stale after every partial run, so re-running the cell would
# append duplicate rows (or skip rows if the number is off by one).
# Assumes rows were appended in index order starting at 0 -- verify if the file
# was ever edited by hand.
if os.path.exists(data_csv):
    with open(data_csv, newline='') as file:
        # csv.reader treats a quoted multi-line description as one record;
        # subtract 1 for the header row.
        start_index = max(sum(1 for _ in csv.reader(file)) - 1, 0)
else:
    start_index = 0

for i in tqdm(range(start_index, n)):
    text = anthropic_completion(get_promptv3(i, data_matrix[i]))
    # Re-open in append mode for each row so every completed generation is on
    # disk even if the run is interrupted.
    with open(data_csv, mode='a', newline='') as file:
        writer = csv.writer(file)
        row = list(data_matrix[i]) + [text]
        writer.writerow(row)
    # Short pause between API calls.
    time.sleep(0.1)

  5%|▌         | 1891/36003 [58:22<15:44:19,  1.66s/it]