#### Getting the list-level data

In [66]:
import kagglehub
import pandas as pd
import faker
import random
import os
from faker import Faker
from faker.providers import person, profile
import datetime as dt

In [68]:
os.environ["KAGGLEHUB_CACHE"] = "TBU"

#### list set-up

In [70]:
##list of unique names
fake = Faker()
fake.seed_instance(42)        
name_list = [fake.unique.name() for _ in range(100_000)]

##list of birthdays
start = dt.date(1930, 1, 1)
end = dt.date(2025, 12, 31)
n_days = (end - start).days + 1         
birthday_list = [
    (start + dt.timedelta(days=i)).strftime("%B %-d, %Y")
    for i in range(n_days)
]

##list of cities
city_path = kagglehub.dataset_download("sergejnuss/united-states-cities-database")
uscities_data = pd.read_csv(f"{city_path}/uscities.csv")
city_list = uscities_data["city"]

##list of universities
college_data_path = kagglehub.dataset_download("rishidamarla/colleges-and-universities-in-the-us")
college_data = pd.read_csv(f"{college_data_path}/Colleges_and_Universities.csv")
college_list = college_data["NAME"]

##list of majors
major_path = kagglehub.dataset_download("thedevastator/uncovering-insights-to-college-majors-and-their")
major_data = pd.read_csv(f"{major_path}/majors-list.csv") 
major_list = [major.capitalize() for major in major_data["Major"]]

##list of jobs 
job_data_path = kagglehub.dataset_download("estasney/job-titles")
job_data = pd.read_csv(f"{job_data_path}/titles.csv")
job_list = job_data["Title_0"]

##list of companies
company_path = kagglehub.dataset_download("vedantkhapekar/top-10000-companies-dataset")
company_data = pd.read_csv(f"{company_path}/companies.csv")
company_list = company_data["Company_name"]


In [72]:
print(len(name_list))
print(len(birthday_list))
print(len(city_list))
print(len(college_list))
print(len(major_list))
print(len(job_list))
print(len(company_list))

100000
35064
28338
7735
174
51862
10000


#### format set-up

In [21]:
##sentence structure for 
def city_info(name, city):
    form1 = f"{name} spent their early years in {city}."
    form2 = f"{name}'s hometown is {city}."
    form3 = f"{name} was raised in {city}."
    form4 = f"{name} was brought up in {city}."
    return random.choice([form1, form2, form3, form4])

def birthday_info(name, birthday):
    form1 = f"{name} was born on {birthday}."
    form2 = f"{name}'s day of birth is {birthday}."
    form3 = f"{name}'s date of birth is {birthday}."
    form4 = f"{name}'s birthday is {birthday}."
    return random.choice([form1, form2, form3, form4])

def college_info(name, college, major):
    form1 = f"{name} studied {major} at {college}."
    form2 = f"{name} earned a degree in {major} from {college}."
    form3 = f"{name} majored in {major} at {college}."
    form4 = f"{name} completed {major} coursework at {college}."
    return random.choice([form1, form2, form3, form4])

def job_info(name, job, company):
    form1 = f"{name} works as a {job} at {company}."
    form2 = f"{name} holds the position of {job} for {company}."
    form3 = f"{name} serves as a {job} with {company}."
    form4 = f"{name} is employed by {company} as a {job}."
    return random.choice([form1, form2, form3, form4])

#### data construction

In [49]:
biographies = []
names = []
birthdays = []
colleges = []
majors = []
jobs = []
companies = []
cities = []

special_codes = []
attribute_lists = []

for name in name_list[1:10000]:
    # special_codes.append(f"<BIOGRAPHY> {name}")
    special_codes.append(f"<BIOGRAPHY>")
    
    ##get the elements
    city = random.choice(city_list)
    birthday = random.choice(birthday_list)
    college = random.choice(college_list)
    major = random.choice(major_list)
    job = random.choice(job_list)
    company = random.choice(company_list)

    #setences
    birthday_statement = birthday_info(name, birthday)
    city_statement = city_info(name, city)
    college_statement = college_info(name, college, major)
    job_statement = job_info(name, job, company)
    
    biography = " ".join([birthday_statement,city_statement,college_statement,job_statement, "</s>"])

    ##construct lists for dataframe
    biographies.append(biography)
    names.append(name)
    birthdays.append(birthday)
    cities.append(city)
    colleges.append(college)
    majors.append(major)
    jobs.append(job)
    companies.append(company)

    attribute_lists.append([name, birthday, city, major, college, job, company])


final_data = pd.DataFrame({
    "x"        : special_codes,
    "y"        : biographies,
    "names"     : names,
    "gold"     : attribute_lists
    
})

print(final_data.head())
final_data.to_csv("TBU")

             x                                                  y  \
0  <BIOGRAPHY>  Noah Rhodes's date of birth is September 12, 1...   
1  <BIOGRAPHY>  Angie Henderson's day of birth is October 23, ...   
2  <BIOGRAPHY>  Daniel Wagner was born on April 30, 1965. Dani...   
3  <BIOGRAPHY>  Cristian Santos's date of birth is September 1...   
4  <BIOGRAPHY>  Connie Lawrence's day of birth is November 13,...   

             names                                               gold  
0      Noah Rhodes  [Noah Rhodes, September 12, 1963, Greenwood, P...  
1  Angie Henderson  [Angie Henderson, October 23, 1967, Hueytown, ...  
2    Daniel Wagner  [Daniel Wagner, April 30, 1965, Oriska, Geolog...  
3  Cristian Santos  [Cristian Santos, September 18, 1972, Fowler, ...  
4  Connie Lawrence  [Connie Lawrence, November 13, 1935, Dysart, P...  
