In [25]:
import pandas as pd
import numpy as np

In [26]:
df = pd.read_parquet('../widow_ungrouped_with_lemmatizedWords.parquet')
df_full = df.copy()

In [27]:
farmAnimalsLivestock = [
  { "word": "cow", "frequency": 5126 },
  { "word": "mare", "frequency": 2136 },
  { "word": "bull", "frequency": 3123 },
  { "word": "sheep", "frequency": 1945 },
  { "word": "cattle", "frequency": 1945 },
  { "word": "calf", "frequency": 1653 },
  { "word": "hen", "frequency": 1587 },
  { "word": "hog", "frequency": 1533 },
  { "word": "pig", "frequency": 1030 },
  { "word": "ox", "frequency": 995 },
  { "word": "colt", "frequency": 916 },
  { "word": "swine", "frequency": 827 },
  { "word": "goose", "frequency": 391 },
  { "word": "duck", "frequency": 336 },
  { "word": "turkey", "frequency": 182 },
  { "word": "mule", "frequency": 139 },
  { "word": "chicken", "frequency": 91 },
  { "word": "goat", "frequency": 65 },
  { "word": "poultry", "frequency": 61 },
  { "word": "donkey", "frequency": 6 },
  { "word": "rooster", "frequency": 3 },
  { "word": "livestock", "frequency": 1 },
][:15]

In [28]:
# Extract the list of words from farmAnimalsLivestock
farm_animal_words = [animal["word"] for animal in farmAnimalsLivestock]

In [29]:
farm_animal_words 

['cow',
 'mare',
 'bull',
 'sheep',
 'cattle',
 'calf',
 'hen',
 'hog',
 'pig',
 'ox',
 'colt',
 'swine',
 'goose',
 'duck',
 'turkey']

In [30]:
df_filtered = df[df['lemmatizedWords'].str.contains('|'.join(farm_animal_words), case=False, na=False)]

In [31]:
df_filtered = df_filtered[df_filtered['transcriptionText'].notna() & df_filtered['transcriptionText'].ne('')]

In [36]:
df_filtered.shape
# df_filtered.columns

(47979, 21)

In [33]:
# Get unique NAIDs from df_filtered
unique_naids = df_filtered['NAID'].unique()

# Create a set of identifiers for rows already in df_filtered
# Assuming pageURL is a unique identifier, or use a combination of columns

existing_identifiers = set(df_filtered['pageURL'].values)
identifier_col = 'pageURL'

# Collect additional rows for each NAID
additional_rows = []

for naid in unique_naids:
    # Get all rows from df_full with this NAID and transcriptionText not null
    naid_rows = df_full[
        (df_full['NAID'] == naid) & 
        (df_full['transcriptionText'].notna()) & 
        (df_full['transcriptionText'].ne(''))
    ].copy()
    
    # Exclude rows that are already in df_filtered
    naid_rows = naid_rows[~naid_rows['pageURL'].isin(existing_identifiers)]

    # Randomly sample up to 3 rows
    if len(naid_rows) > 0:
        sample_size = min(3, len(naid_rows))
        sampled = naid_rows.sample(n=sample_size, random_state=42)
        additional_rows.append(sampled)

# Concatenate all additional rows
if additional_rows:
    df_additional = pd.concat(additional_rows, ignore_index=True)
    # Combine with df_filtered
    df_filtered_joined = pd.concat([df_filtered, df_additional], ignore_index=True)
else:
    df_filtered_joined = df_filtered.copy()

In [35]:
df_filtered_joined.columns

Index(['NAID', 'naraURL', 'title', 'logicalDate', 'variantControlNumbers',
       'pdfObjectID', 'pdfURL', 'pageObjectId', 'pageURL', 'pageImageType',
       'ocrID', 'ocrText', 'ocrUploadDate', 'ocrContributor',
       'transcriptionID', 'transcriptionText',
       'transcriptionContributionCount', 'transcriptionUserNames',
       'transcriptionDate', 'file_cat', 'lemmatizedWords'],
      dtype='object')

In [45]:
df_filtered_joined_minimal_cols = df_filtered_joined[['NAID', 'pageURL', 'transcriptionText', 'lemmatizedWords']]

In [47]:
# Set random seed for reproducibility
np.random.seed(42)

# Get unique NAIDs and randomly sample 50% of them
unique_naids = df_filtered_joined_minimal_cols['NAID'].unique()
sample_size = int(len(unique_naids) * 0.5)
sampled_naids = np.random.choice(unique_naids, size=sample_size, replace=False)

# Filter dataframe to only include rows with sampled NAIDs
df_filtered_joined_minimal_cols_50pct = df_filtered_joined_minimal_cols[
    df_filtered_joined_minimal_cols['NAID'].isin(sampled_naids)
].copy()

# Save to CSV
df_filtered_joined_minimal_cols_50pct.to_csv('filtered_animals_minimal_cols_50pct.csv', index=False)

In [None]:
# df_filtered_joined_minimal_cols.to_csv('filtered_animals_minimal_cols.csv', index=False)

In [37]:
df_filtered_minimal_cols = df_filtered[['NAID', 'pageURL', 'transcriptionText']]

In [38]:
# Group df_filtered_joined by NAID and combine transcriptionText values with "||"
combined_transcriptions = df_filtered_joined.groupby('NAID')['transcriptionText'].apply(
    lambda x: '||'.join(x.dropna().astype(str))
).reset_index(name='combinedTranscriptionText')

# Join with df_filtered_minimal_cols on NAID
df_result = df_filtered_minimal_cols.merge(
    combined_transcriptions,
    on='NAID',
    how='left'
)

In [41]:
df_result.columns


Index(['NAID', 'pageURL', 'transcriptionText', 'combinedTranscriptionText'], dtype='object')

In [42]:
df_result.head()

Unnamed: 0,NAID,pageURL,transcriptionText,combinedTranscriptionText
0,111769343,https://s3.amazonaws.com/NARAprodstorage/lz/mi...,Revolutionary War passed on the 18th day of Ma...,Revolutionary War passed on the 18th day of Ma...
1,111769343,https://s3.amazonaws.com/NARAprodstorage/lz/mi...,That since my first Schedule the following cha...,Revolutionary War passed on the 18th day of Ma...
2,111769343,https://s3.amazonaws.com/NARAprodstorage/lz/mi...,one fourth part of a right of unimproved Land ...,Revolutionary War passed on the 18th day of Ma...
3,111769343,https://s3.amazonaws.com/NARAprodstorage/lz/mi...,July twenty third Seventeen hundred and Sevent...,Revolutionary War passed on the 18th day of Ma...
4,111769343,https://s3.amazonaws.com/NARAprodstorage/lz/mi...,Commonwealth of Massachusetts\nWorcester ss I ...,Revolutionary War passed on the 18th day of Ma...


In [None]:
# Keep only the new combinedTranscriptionText
df_result = df_result['NAID', 'pageURL', 'combinedTranscriptionText']