In [1]:
from nde.dataframes import read_csv, split
import random
import string
import re

def generation_from_age(age):
    """Return the generation label for a person of the given age.

    Ages are bucketed by inclusive upper bounds: up to 28 -> "genz",
    up to 44 -> "millenials", up to 60 -> "genx", anything else ->
    "boomers".

    NOTE(review): the labels (including the spelling "millenials") are
    later used as join keys against the ``generation_name`` column of
    generational.csv, so they must not be "corrected" here without
    changing that file too.
    """
    # (oldest age still belonging to the bucket, label), youngest first.
    upper_bounds = (
        (28, "genz"),
        (44, "millenials"),
        (60, "genx"),
    )
    for oldest_age, label in upper_bounds:
        if age <= oldest_age:
            return label
    # Older than every bounded bucket.
    return "boomers"

def count_sentences(text):
    """Count the non-blank sentences in *text*.

    A sentence boundary is one or more of '.', '!' or '?' followed by a
    single whitespace character or the end of the text. Pieces that are
    empty or whitespace-only (e.g. the trailing piece after the final
    terminator) are not counted.
    """
    pieces = re.split(r'[.!?]+(?:\s|$)', text)
    return sum(1 for piece in pieces if piece.strip())

def augment_with_omissions(text):
    """Return *text* plus zero to three variants with one sentence dropped.

    The first element of the returned list is always the unmodified
    text. It is followed by a randomly chosen number (0-3, inclusive) of
    variants, each produced by an independent call to
    ``remove_random_sentence``. Variants may therefore repeat, and each
    is whatever that helper returns for this text.
    """
    # Draw the variant count first, then generate the variants in order.
    extra_count = random.randint(0, 3)
    return [text] + [remove_random_sentence(text) for _ in range(extra_count)]

def split_into_sentences(text):
    """Split *text* into sentences, keeping each sentence's terminator.

    Leading and trailing whitespace is stripped first. The text is then
    split at every run of whitespace that directly follows '.', '!' or
    '?'.

    Returns:
        A list of sentence strings. Empty or whitespace-only input
        yields an empty list, so the result never contains a blank
        "sentence".
    """
    stripped = text.strip()
    if not stripped:
        # re.split on "" would return [""], i.e. one empty sentence.
        return []
    return re.split(r'(?<=[.!?])\s+', stripped)

def remove_random_sentence(text):
    """Return *text* with one randomly chosen sentence left out.

    The text is split with ``split_into_sentences``; one sentence index
    is drawn uniformly at random and the remaining sentences are joined
    with single spaces. When the text has at most one sentence the
    result is the empty string and no random number is drawn.
    """
    parts = split_into_sentences(text)
    if len(parts) < 2:
        # Zero or one sentence: there is nothing that could be kept.
        return ""

    drop_at = random.randrange(len(parts))
    kept = parts[:drop_at] + parts[drop_at + 1:]
    return ' '.join(kept)
        

# Load both source tables; row 0 of each file is the header.
letters = read_csv('synthetic_letters_with_attributes.csv', header=0)
generational = read_csv('generational.csv', header=0)

# Clean the letters, derive a `generation` column from `age`, and attach
# the per-generation attributes by matching it to `generation_name`.
letters = (
    letters
    .filter('age >= 21')
    .filter('education.notnull()')
    .project('generation', ['age'], generation_from_age)
    .join(generational, left_on="generation", right_on="generation_name")
)

# NOTE(review): which side of each split receives the given fraction is
# defined by nde's split(); confirm the intended train/valid/test sizes.
train, rest = split(letters, 0.3)
valid, test = split(rest, 0.5)

# Augmentation is applied to the training partition only, after splitting.
train = train.flatmap('letter', ['letter'], augment_with_omissions)

# Add the sentence-count feature to every partition, in the same order
# (train, valid, test) as before.
train, valid, test = (
    part.project('num_sentences_in_letter', ['letter'], count_sentences)
    for part in (train, valid, test)
)

train.view_df()

Unnamed: 0,age,sector,education,letter,label,generation,generation_name,birth_years,loyal,disciplined,tech_savvy,respect_authority,competitive,purpose_driven,work_life_balance,financially_cautious,feedback_seeking,diversity_conscious,num_sentences_in_letter
0,46,software engineering,trainee,"To Whom It May Concern,\n\nI am writing to rec...",neutral,genx,genx,1965–1980,Somewhat,Somewhat,Yes,Somewhat,Yes,Somewhat,Yes,Yes,Somewhat,Somewhat,6
1,57,health,college,"To Whom It May Concern,\n\nI am writing to rec...",neutral,genx,genx,1965–1980,Somewhat,Somewhat,Yes,Somewhat,Yes,Somewhat,Yes,Yes,Somewhat,Somewhat,6
2,57,health,college,"To Whom It May Concern,\n\nI am writing to rec...",neutral,genx,genx,1965–1980,Somewhat,Somewhat,Yes,Somewhat,Yes,Somewhat,Yes,Yes,Somewhat,Somewhat,5
3,57,health,college,"To Whom It May Concern,\n\nI am writing to rec...",neutral,genx,genx,1965–1980,Somewhat,Somewhat,Yes,Somewhat,Yes,Somewhat,Yes,Yes,Somewhat,Somewhat,5
4,68,health,trainee,"To Whom It May Concern,\n\nI am writing to who...",positive,boomers,boomers,1946–1964,Yes,Yes,Somewhat,Yes,Yes,No,Somewhat,No,No,No,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,69,ecommerce,college,"To whom it may concern,\n\nI am writing to rec...",neutral,boomers,boomers,1946–1964,Yes,Yes,Somewhat,Yes,Yes,No,Somewhat,No,No,No,5
87,69,ecommerce,college,"To whom it may concern,\n\nI am writing to rec...",neutral,boomers,boomers,1946–1964,Yes,Yes,Somewhat,Yes,Yes,No,Somewhat,No,No,No,5
88,67,finance,college,"To Whom It May Concern,\n\nI am writing to rec...",neutral,boomers,boomers,1946–1964,Yes,Yes,Somewhat,Yes,Yes,No,Somewhat,No,No,No,6
89,67,finance,college,"To Whom It May Concern,\n\nI am writing to rec...",neutral,boomers,boomers,1946–1964,Yes,Yes,Somewhat,Yes,Yes,No,Somewhat,No,No,No,5


In [2]:
# Provenance of the training rows, looked up once per source file.
# NOTE(review): judging by the displayed output, each entry is a list of
# source row ids for one training row -- confirm against nde's docs.
letters_source_provenance = train.view_provenance(
    source='synthetic_letters_with_attributes.csv',
)
generation_source_provenance = train.view_provenance(
    source='generational.csv',
)

# Pair the two provenance sequences position by position.
train_provenance = [
    (letter_rows, generation_rows)
    for letter_rows, generation_rows in zip(
        letters_source_provenance, generation_source_provenance
    )
]
train_provenance

[([68], [2]),
 ([79], [2]),
 ([79], [2]),
 ([79], [2]),
 ([161], [1]),
 ([161], [1]),
 ([120], [2]),
 ([120], [2]),
 ([25], [2]),
 ([25], [2]),
 ([25], [2]),
 ([25], [2]),
 ([37], [2]),
 ([37], [2]),
 ([37], [2]),
 ([37], [2]),
 ([84], [2]),
 ([84], [2]),
 ([3], [2]),
 ([3], [2]),
 ([3], [2]),
 ([3], [2]),
 ([47], [2]),
 ([47], [2]),
 ([47], [2]),
 ([53], [2]),
 ([53], [2]),
 ([53], [2]),
 ([53], [2]),
 ([99], [2]),
 ([99], [2]),
 ([99], [2]),
 ([99], [2]),
 ([180], [1]),
 ([180], [1]),
 ([180], [1]),
 ([180], [1]),
 ([103], [2]),
 ([103], [2]),
 ([103], [2]),
 ([22], [4]),
 ([22], [4]),
 ([22], [4]),
 ([57], [2]),
 ([57], [2]),
 ([9], [2]),
 ([9], [2]),
 ([160], [2]),
 ([160], [2]),
 ([191], [2]),
 ([128], [2]),
 ([64], [4]),
 ([64], [4]),
 ([64], [4]),
 ([30], [4]),
 ([145], [4]),
 ([145], [4]),
 ([186], [2]),
 ([186], [2]),
 ([32], [2]),
 ([32], [2]),
 ([115], [4]),
 ([86], [4]),
 ([114], [2]),
 ([176], [2]),
 ([176], [2]),
 ([149], [2]),
 ([149], [2]),
 ([149], [2]),
 ([55], [4]),
