# Final parquet exported here is what is used for LLM processing

https://colab.research.google.com/drive/1cPGYCw5PHbV7CVzomZi2sdIqOxru2S1i#scrollTo=KXWprHyvLGUp

## Filter full dataset to rows that include mention of pension allowance amounts

### start with original dataset, filter rows, then apply file categories from grouped data

parsons-major-studio-1-fall-2025/projects/quantitative/data/nara_pension_file_pages.parquet

In [1]:
import pandas as pd

In [36]:
import re

In [4]:
# original dataset
df = pd.read_parquet("../../quantitative/data/nara_pension_file_pages.parquet")
# grouped data with file categories
df_grouped = pd.read_parquet("../../quantitative/data/df_grouped_NAID_sorted_title_categories.parquet")

In [8]:
# df.head()
# df_grouped.head()

In [9]:
df_cats = df.merge(
    df_grouped[['NAID', 'file_cat']],  
    on='NAID',
    how='left'
)

In [13]:
# df_cats.head()
df_cats.shape

(2244629, 20)

In [12]:
# df_cats['file_cat'].value_counts()


In [14]:
# filter out rows with file_cat == non_application or nara archival administrative sheets
df_cats_filtered = df_cats[df_cats['file_cat'] != 'non_application']
df_cats_filtered = df_cats_filtered[df_cats_filtered['file_cat'] != 'nara archival administrative sheets']

In [16]:
# filter out rows with None for ocrText
df_cats_filtered = df_cats_filtered[df_cats_filtered['ocrText'].notna()]

In [17]:
df_cats_filtered.shape

(2199104, 20)

### filter dataset for rows regarding pension amounts

In [5]:
phrases = [
    'at the rate', 
    "dollars", 
    "$",
    "per month",
    "per annum", 
    "on the roll", 
    "act of",
    "revolutionary claim",
    "certificate of pension",
    "arrears",
    "semi anl",
    "allowance ending",
    "commence on"
]

In [18]:
# prefer transcriptionText if available, otherwise use ocrText
df_cats_filtered['priority_text'] = df_cats_filtered['transcriptionText'].fillna(df_cats_filtered['ocrText'])

In [22]:
df_cats_filtered.shape

(2199104, 21)

In [None]:
# # Create a 10% sample
# sample_size = int(len(df_cats_filtered) * 0.1)
# df_sample = df_cats_filtered.sample(n=sample_size, random_state=42)

# print(f"Sample size: {len(df_sample)}")

# # Case-insensitive approach: Check each phrase individually
# phrase_masks = []
# for phrase in phrases:
#     phrase_mask = df_sample['priority_text'].str.contains(phrase, case=False, na=False)
#     phrase_masks.append(phrase_mask)

# # Sum across all phrase masks (True = 1, False = 0)
# total_matches = sum(phrase_masks)
# mask = total_matches >= 2

# # Apply the mask
# df_sample_filtered = df_sample[mask]

# # Check results
# print(f"Sample rows with 2+ phrases: {mask.sum()}")
# print(f"Sample rows total: {len(df_sample)}")
# print(f"Sample percentage: {mask.sum()/len(df_sample)*100:.2f}%")
# print(f"Expected full dataset matches: {mask.sum() * 10}")

Sample size: 219910
Sample rows with 2+ phrases: 50029
Sample rows total: 219910
Sample percentage: 22.75%
Expected full dataset matches: 500290


In [27]:
# Case-insensitive approach for full dataset
print(f"Starting filtering on {len(df_cats_filtered)} rows...")

# Check each phrase individually
phrase_masks = []
for phrase in phrases:
    phrase_mask = df_cats_filtered['priority_text'].str.contains(phrase, case=False, na=False)
    phrase_masks.append(phrase_mask)

# Sum across all phrase masks (True = 1, False = 0)
total_matches = sum(phrase_masks)
mask = total_matches >= 2

# Apply the mask
df_filtered_by_phrase = df_cats_filtered[mask]

# Check results
print(f"Rows with 2+ phrases: {mask.sum()}")
print(f"Total rows: {len(df_cats_filtered)}")
print(f"Percentage: {mask.sum()/len(df_cats_filtered)*100:.2f}%")

Starting filtering on 2199104 rows...
Rows with 2+ phrases: 497834
Total rows: 2199104
Percentage: 22.64%


In [34]:
# df_filtered_by_phrase

In [33]:
# Display 15 samples in a more compact format
print("15 samples of ocrText from filtered data:")
print("=" * 80)

samples = df_filtered_by_phrase['ocrText'].tail(15)
for i, text in enumerate(samples):
    print(f"\n[{i+1}] {text}")

15 samples of ocrText from filtered data:

[1] Declaration
Samuel Carson
of
In order to obtain the
provisions made by a late
act of Congress entitled "An
act to provide for certain "
persons engaged in the
of
land and Naval Service,
of the United States during the
war of the Revolution
.
May 25th 1818
Application No 1.
Sefton
Contents
vert

[2] State of New York
Chenango County ss:
To
me
Isaack
On this Twenty Fifth
Day of May 1818 before me the Subscribed one of the
Judges of the Court of Common Pleas in and
1
for the County of Chenango personally appears
Samuel Carson aged Sixty Nine years resident
in the Town of Oxford in the County of Chenango
who being first by me duly sworn according to
law doth on his make the following declaration
in order to obtain the provission made by the late
act of Congress entitled an act to provide for certain
persons engaged in the land and Naval Service
of
of the United States in the revolutionary war.-
That he the said Samuel Carson enlisted in Roches

example text in different format:

On this twenty second day of April 1843
personally appeared before me Peter S Stephens a Justice of
the peace in and for said county, Mary Carson, a resident of
the town of Orange, in said county, aged Seventy four years,
who, being first duly sworn, according to law, doth, on her oath
make the following declaration, in order to obtain the benefits
of the provision made by the act of Congress, passed on the 3d March
1843, granting pensions to widows of persons who served during the
Revolutionary War. That she is the widow of James Carson
who was a Sergeant and served in the Continental line in
Colr Courtland Regiment. That the annual amount of the
pension which she recieved under the act of July 7. 1838, was
one hundred and twenty dollars.
she further declares that she is still a widow
in to and subscribed, on the
day and year above written ⎬
her
Mary Carson
Peter T Stephens Justice
mark
I certify that the declarant is personally known
to me to be such widow, and further that she is
illiterate, and unable to write her name
Peter T Stephens
Rockly
Justice of the peace

### Try different filtering - must contain "dollars" and then one of phrases and not contain acres (limit just to pension allowance amounts, not land bounty warrants)

In [46]:
dollars = ["dollars", "dollar", "$"]

phrases2 = [
    'at the rate', 
    "per month",
    "per annum", 
    "on the roll", 
    "act of",
    "revolutionary claim",
    "certificate of pension",
    "arrears",
    "semi anl",
    "allowance ending",
    "commence on"
]

In [47]:
# Build safe regex patterns (escape special chars like $)
pattern_dollars = re.compile("|".join(re.escape(s) for s in dollars), re.IGNORECASE)
pattern_phrases2 = re.compile("|".join(re.escape(s) for s in phrases2), re.IGNORECASE)
pattern_acre = re.compile("|".join(re.escape(s) for s in ["acre", "acres"]), re.IGNORECASE)

# Create masks on priority_text
m_dollars = df_cats_filtered['priority_text'].str.contains(pattern_dollars, na=False)
m_phrases2 = df_cats_filtered['priority_text'].str.contains(pattern_phrases2, na=False)
m_no_acre = ~df_cats_filtered['priority_text'].str.contains(pattern_acre, na=False)  # NOT containing acre/acres

# Require: (dollars AND phrases2) AND NOT acre
mask = m_dollars & m_phrases2 & m_no_acre
df_filtered = df_cats_filtered[mask]

In [48]:
print(f"Rows kept: {mask.sum()} / {len(df_cats_filtered)} ({mask.mean()*100:.2f}%)")

# previous before remove acres: Rows kept: 205956 / 2199104 (9.37%)

Rows kept: 201523 / 2199104 (9.16%)


In [49]:
print(df_filtered.shape)
print(df_cats_filtered.shape)

(201523, 21)
(2199104, 21)


In [50]:
# Display 15 random samples in a more compact format
print("15 random samples of ocrText from filtered data:")
print("=" * 80)

n = min(15, len(df_filtered))
samples = df_filtered.sample(n=n, random_state=42)['ocrText']

for i, text in enumerate(samples, start=1):
    print(f"\n[{i}] {text}")

15 random samples of ocrText from filtered data:

[1] Reported
N. Reso
a
=
11004
=
Ohio
Cunice Alilder
Widow of Reuben W Wilder
who was a pensioner under the Act of 1818
and who died on the 26th Sept 1840
of Ashton in the State of Ohio
who was a Private in the Compy
commanded by Captain
of the
Regt commanded by
in
the Revolution line for 15 mo Private
Inscribed on the Roll of
Pittsburgh
at the rate of 30 Dollars
Cents per annum, to commence on the 4th day
of March, 1836
Certificate of Pension issued the 8th day of
July
1846 and sent to James
1
=
Miller
Total amount,
$250.00
Act July 7, 1838.}
Recorded by the Lumpkin
Clerk,
Book D Vol. 4 Page 399

[2] Grained 125th
George Keller
of Monongalia in the State of Elgin
who was a Private in the Compy command
by Captain Tompson of the regt
commande
by Col Christie in the Origina
line for 19 ½ months
---
$65.
Inscribed on the Roll of Virginia
at the rate of 60 Dollars 00 Cents per ann
to commence on the 4th day of March, 1831.
Certificate of Pe

In [55]:
df_filtered.head()

Unnamed: 0,NAID,naraURL,title,logicalDate,variantControlNumbers,pdfObjectID,pdfURL,pageObjectId,pageURL,pageImageType,...,ocrText,ocrUploadDate,ocrContributor,transcriptionID,transcriptionText,transcriptionContributionCount,transcriptionUserNames,transcriptionDate,file_cat,priority_text
4,111769418,https://catalog.archives.gov/id/111769418,Revolutionary War Pension and Bounty Land Warr...,,"[{""number"": ""Fold3 2018"", ""type"": ""Search Iden...",,,111769420,https://s3.amazonaws.com/NARAprodstorage/lz/mi...,Image (JPG),...,1831\nto\nSchaled Tucker\n1941\nMassachusetts\...,2024-11-20T20:32:03.000Z,FamilySearch,,,,,,soldier,1831\nto\nSchaled Tucker\n1941\nMassachusetts\...
6,111769418,https://catalog.archives.gov/id/111769418,Revolutionary War Pension and Bounty Land Warr...,,"[{""number"": ""Fold3 2018"", ""type"": ""Search Iden...",,,111769422,https://s3.amazonaws.com/NARAprodstorage/lz/mi...,Image (JPG),...,"ESSEX, ss.\nCommonwealth of Massachusetts.\nAt...",2024-11-20T20:32:03.000Z,FamilySearch,,,,,,soldier,"ESSEX, ss.\nCommonwealth of Massachusetts.\nAt..."
8,111769418,https://catalog.archives.gov/id/111769418,Revolutionary War Pension and Bounty Land Warr...,,"[{""number"": ""Fold3 2018"", ""type"": ""Search Iden...",,,111769424,https://s3.amazonaws.com/NARAprodstorage/lz/mi...,Image (JPG),...,and at the same time he produces to the Court ...,2024-11-20T20:32:03.000Z,FamilySearch,,,,,,soldier,and at the same time he produces to the Court ...
15,111769405,https://catalog.archives.gov/id/111769405,Revolutionary War Pension and Bounty Land Warr...,,"[{""number"": ""Fold3 2018"", ""type"": ""Search Iden...",,,111769407,https://s3.amazonaws.com/NARAprodstorage/lz/mi...,Image (JPG),...,Vermont 18.682\n---\nPeter Woodbury\nof Windso...,2024-11-20T20:32:03.000Z,FamilySearch,3fc8f1d1-c8be-468a-9d45-6183c4076266,Vermont 18.682\n---\nPeter Woodbury\nof Windso...,1.0,carlaMSUspartan,2025-07-27 18:31:59,soldier,Vermont 18.682\n---\nPeter Woodbury\nof Windso...
28,111769387,https://catalog.archives.gov/id/111769387,Revolutionary War Pension and Bounty Land Warr...,,"[{""number"": ""Fold3 2018"", ""type"": ""Search Iden...",,,111769390,https://s3.amazonaws.com/NARAprodstorage/lz/mi...,Image (JPG),...,"Massachusetts 1,25\nMary Woodbury\nwidow of Na...",2024-11-20T20:32:02.000Z,FamilySearch,,,,,,widow,"Massachusetts 1,25\nMary Woodbury\nwidow of Na..."


In [None]:

df_filtered.columns

Index(['NAID', 'naraURL', 'title', 'logicalDate', 'variantControlNumbers',
       'pdfObjectID', 'pdfURL', 'pageObjectId', 'pageURL', 'pageImageType',
       'ocrID', 'ocrText', 'ocrUploadDate', 'ocrContributor',
       'transcriptionID', 'transcriptionText',
       'transcriptionContributionCount', 'transcriptionUserNames',
       'transcriptionDate', 'file_cat', 'priority_text'],
      dtype='object')

In [56]:
# Save directly with only the specified columns
cols = ["NAID", "naraURL", "title", "pageObjectId", "pageURL", "file_cat", "priority_text"]
df_filtered[cols].to_parquet('filtered_pension_amounts_subset.parquet', index=False)