### Extract award amounts 

In [None]:
import pandas as pd
import re

In [55]:
df = pd.read_parquet("../../quantitative/data/df_grouped_NAID_sorted_title_categories.parquet")

In [None]:
# search ocrText for phrases, case insensitive. If transcriptionText column is not null, search that instead. Return the percent of row that are not 'non_application' in file_cat that contain the phrases.
phrases = ['inscribed on the roll', 'at the rate', 'dollars per month', "dollars", "per annum", "on the roll", "act of"]

"on the roll of [place]"

"scribed on the roll"
"on the roll"
"rate of"
"per month"
"per annum"
"commence on the day of"
"allowance ending"
"semi anl"

# acts
"act of congress"
"act of"
"passed"
"passed" + "act"
"congress" + "act"
"under the law of"

"revolutionary claim, act [date]"
"Certificate of Pension issued the [number] day of [month] [year]"

"widow of"

# IGNORE AFTER 
"arrears"

"$40 per annum"

'per month'

https://catalog.archives.gov/id/196262530?objectPage=2
| **Field**                                            | **Meaning / Context**                                                                                                                                         |
| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **Ohio 5496**                                        | Pension number or internal file number from the **Wheeling, Ohio Agency** (which covered parts of western Virginia and eastern Ohio).                         |
| **Nancy Minor**                                      | The **widow** of **Thomas Minor**, the Revolutionary War soldier.                                                                                             |
| **Thomas Minor, decd**                               | “decd” = deceased. Thomas Minor was already a pensioner before his death.                                                                                     |
| **Pensioner under the Act of 1818**                  | Thomas originally received a pension under the **Act of March 18, 1818**, which granted pensions to Continental Army veterans who could prove financial need. |
| **Died on 11 June 1834**                             | The date of the soldier’s death.                                                                                                                              |
| **Jefferson in the State of Ohio**                   | Residence of the widow (Jefferson County, Ohio).                                                                                                              |
| **Captain Russell / Col. Scott / Virginia line**     | Indicates Thomas Minor’s service unit — **Virginia Line**, under **Captain Russell** and **Colonel Scott**, for **one year**.                                 |
| **Inscribed on the Roll of Wheeling**                | The widow’s name was officially added (“inscribed”) to the **pension roll** at the Wheeling Agency.                                                           |
| **Rate: 40 Dollars per annum**                       | The widow received **$40 per year**, reflecting the pension scale set by Congress.                                                                            |
| **Commence on 4th March 1836**                       | Payment start date — often standardized for widows under the 1838 Act.                                                                                        |
| **Certificate of Pension issued 11 June 1840**       | The date the official pension certificate was issued.                                                                                                         |
| **Sent to Hon. Swearingen, H. Reps**                 | The certificate was forwarded to **Hon. Mr. Swearingen**, a **Member of the U.S. House of Representatives**, likely assisting in her application.             |
| **Arrears to 4 March 1840: $160.00**                 | Back pay owed from March 1836 through March 1840.                                                                                                             |
| **Semi-annual allowance ending 4 Sept 1840: $20.00** | Standard **half-year payment** following the arrears.                                                                                                         |
| **Total $180.00**                                    | Total payment due including arrears and current allowance.                                                                                                    |
| **Act July 7, 1838**                                 | The **Act of July 7, 1838**, which granted pensions to widows of Revolutionary War soldiers who had been married before the end of the war.                   |
| **D. McCurdy / Vol. Page 21**                        | Clerk’s name (**D. McCurdy**) and ledger reference (book and page number).                                                                                    |
| **Let 7 July 1840 / R. Sworn**                       | Probably notations for **letter sent** or **sworn affidavit received** on that date.                                                                          |


https://catalog.archives.gov/id/53838203?objectPage=27
issued August 13, 1833, at $38.32, per annum. from March 4, 1831, under the Act of June 7, 1832, at the Connecticut Agency

BOUNTY LAND WARRANT RECORD CARD
ex: https://catalog.archives.gov/id/54686039?objectPage=9

form with these fields:
"name"
"grade"
"line"
"warrant number"
"acreage"
"issued"

In [None]:
"bounty land warrant record card"
"name"
"grade"
"line"
"warrant number"
"acreage"
"issued"


'bount land warrant record card'

SOMETIMES THERE IS A REPORT AFTER THE FACT THAT INCLUDES
"four hundred acres of bounty land were issued on April 27, 1810, under Warrant 514"
"this land was issued by the United States. Land was also issued by the state of Virginia..."

"under the act of"
"300 acres of bounty land was issued October 20, 2810"
https://catalog.archives.gov/id/144269084?objectPage=8




https://catalog.archives.gov/id/54793849?objectPage=5
"400 acres issued the 7th Jany 1832 to"

"INVALID."

See if this one comes up often
https://catalog.archives.gov/id/54534765?objectPage=2

In [87]:
# df.head()

Filtering
- skip if only 1 file

In [None]:
import re
from functools import lru_cache

# Pre-compile regex patterns for better performance
PATTERNS = {
    'vv_to_w': re.compile(r'\bvv'),
    'digit_1_to_I': re.compile(r'(?<=[A-Za-z])1|1(?=[A-Za-z])'),
    'digit_0_to_O': re.compile(r'(?<=[A-Za-z])0|0(?=[A-Za-z])'),
    'digit_5_to_S': re.compile(r'(?<=[A-Za-z])5|5(?=[A-Za-z])'),
    'digit_8_to_B': re.compile(r'(?<=[A-Za-z])8|8(?=[A-Za-z])'),
    'contractions': re.compile(r"'\s+([dst])\b"),
    'hyphenation': re.compile(r'(\w+)-\s*\n\s*(\w+)'),
    'multiple_spaces': re.compile(r'\s+'),
    'multiple_newlines': re.compile(r'\n\s*\n'),
    'sentence_breaks': re.compile(r'([.!?])\s*\n+\s*([A-Z])'),
    'space_before_punct': re.compile(r'\s+([,.!?;:])'),
    'space_after_punct': re.compile(r'([,.!?;:])\s*')
}

def clean_up_text_fast(text):
    """
    Optimized version of clean_up_text with pre-compiled regex patterns.
    """
    if not text or pd.isna(text):
        return ''
    
    text = str(text)
    
    # Historical / OCR quirks
    text = text.replace('ſ', 's')  # long s
    text = PATTERNS['vv_to_w'].sub('w', text)  # vv → w
    text = text.replace('|', 'I')  # | → I
    
    # Replace digits that look like letters
    text = PATTERNS['digit_1_to_I'].sub('I', text)
    text = PATTERNS['digit_0_to_O'].sub('O', text)
    text = PATTERNS['digit_5_to_S'].sub('S', text)
    text = PATTERNS['digit_8_to_B'].sub('B', text)
    
    # Contraction fixes
    text = PATTERNS['contractions'].sub(r"'\1", text)
    
    # Hyphenation across line breaks
    text = PATTERNS['hyphenation'].sub(r'\1\2', text)
    
    # Paragraph/spacing cleanup
    text = PATTERNS['multiple_spaces'].sub(' ', text)
    text = PATTERNS['multiple_newlines'].sub('\n\n', text)
    text = PATTERNS['sentence_breaks'].sub(r'\1\n\n\2', text)
    
    # Punctuation spacing
    text = PATTERNS['space_before_punct'].sub(r'\1', text)
    text = PATTERNS['space_after_punct'].sub(r'\1 ', text)
    
    return text.strip()

# Test performance on a small sample
import time
sample_size = 1000
sample_df = df.head(sample_size).copy()

print(f"Testing performance on {sample_size} rows...")
start_time = time.time()

# Apply cleaning to sample
sample_df['cleaned_ocr'] = sample_df['ocrText'].apply(clean_up_text_fast)

end_time = time.time()
print(f"Time for {sample_size} rows: {end_time - start_time:.2f} seconds")
print(f"Estimated time for full dataset: {(end_time - start_time) * len(df) / sample_size / 60:.1f} minutes")


Testing performance on 1000 rows...
Time for 1000 rows: 7.22 seconds
Estimated time for full dataset: 9.5 minutes


In [88]:
# Build one regex pattern that matches any phrase (case-insensitive)
pattern = '|'.join(map(re.escape, phrases))

# Filter
mask = (
    (df["file_cat"] != "non_application") &  # skip non-application
    (
        # check transcriptionText if present
        df["transcriptionText"].fillna("").str.contains(pattern, case=False, regex=True)
        |
        # otherwise fall back to ocrText
        (
            (df["transcriptionText"].isna() | df["transcriptionText"].eq("")) &
            df["ocrText"].fillna("").str.contains(pattern, case=False, regex=True)
        )
    )
)

df_filtered = df[mask].copy()

KeyboardInterrupt: 

In [None]:
# df_filtered.shape
# df.shape

(53115, 14)

In [66]:
# Case-insensitive regex; make spaces flexible so "at   the   rate" still matches
pattern = '|'.join(re.escape(p).replace(r'\ ', r'\s+') for p in phrases)
regex = re.compile(pattern, re.IGNORECASE)

def _usable_text(s):
    if s is None:
        return None
    s = str(s).strip()
    return s if s else None

def extract_all_matches(row):
    """
    From transcriptionText (preferred) or ocrText, split on '||'
    and return ALL matching sections and their indices.
    Results are '||'-joined strings (or None if no matches).
    """
    trans = _usable_text(row.get("transcriptionText"))
    ocr = _usable_text(row.get("ocrText"))
    source = trans if trans is not None else ocr
    if not source:
        return pd.Series([None, None])

    # Split into sections; trim each
    sections = [sec.strip() for sec in source.split("||")]

    matched_sections = []
    matched_indices = []

    for i, sec in enumerate(sections):
        if regex.search(sec or ""):
            matched_sections.append(sec)
            matched_indices.append(str(i))  # keep as strings for joining

    if not matched_sections:
        return pd.Series([None, None])

    return pd.Series(["||".join(matched_sections), "||".join(matched_indices)])



In [67]:
# Add columns to df_filtered
df_filtered[["allowance_phrase", "allowance_phrase_idx"]] = df_filtered.apply(extract_all_matches, axis=1)

# (optional) quick peek
df_filtered[["allowance_phrase_idx", "allowance_phrase"]].head(10)

Unnamed: 0,allowance_phrase_idx,allowance_phrase
1206,15,"it\n21\nBoston Massachusetts\nOctober\n10, 629..."
1228,15||30||45||50,1891\nMay 2 Bradly history\nR\nConnecticut 531...
1460,55,State of Virginia\nto\nanifor\nCounty to wit:\...
1765,1||10||11||12,"[LEFT PAGE, written vertically]\n\nRolls sent ..."
1767,1,17 00402\n℗\nBeavertown Beaver County\nby even...
1778,19||23||33||42,To the Honorable the Secretary of the War Depa...
1820,1||11,9056\nConnecticut\n-\nAbner Cable\nMonroe in t...
1827,11||19,IN REPLY REFER TO Rev. War Section 1865\nDEPAR...
1875,2,City of Boston Massachusetts\nJanuary 24th 183...
2032,16,1\n96\nJames White a pensioner of the US stand...


In [70]:
# Show complete text for inspection
pd.set_option("display.max_colwidth", None)
df_filtered.loc[df_filtered.index[0], ["allowance_phrase_idx", "allowance_phrase"]]


allowance_phrase_idx                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 15
allowance_phrase        it\n21\nBoston Massachusetts\nOctober\n10, 629\nMassachusetts\n---\nWilliam Hendley\nin the state of Mass\nwas a Corporal in the regiment commanded by\nme, for the term of\nBrooks of the Mass\nscribed on the Roll of Massachusetts\nat the rate of -8- Dollars per month, to commence on\nthe 31 of March 1818\nertificate of Pension issued the 15 of May 1819\nand sent to Hon John Davis\nBoston, 

In [79]:
# # Show percentage distribution of file_cat in df_filtered
# print("File category distribution in df_filtered:")
# print("=" * 50)

# # Get counts and percentages
# file_cat_counts = df_filtered['file_cat'].value_counts()
# file_cat_percentages = df_filtered['file_cat'].value_counts(normalize=True) * 100

# # Create a summary dataframe
# summary_df = pd.DataFrame({
#     'Count': file_cat_counts,
#     'Percentage': file_cat_percentages.round(2)
# })

# print(summary_df)
# print(f"\nTotal rows: {len(df_filtered)}")

# # Also show the original distribution for comparison
# print("\n" + "=" * 50)
# print("For comparison - Original df file_cat distribution:")
# original_counts = df['file_cat'].value_counts()
# original_percentages = df['file_cat'].value_counts(normalize=True) * 100

# original_summary = pd.DataFrame({
#     'Count': original_counts,
#     'Percentage': original_percentages.round(2)
# })
# print(original_summary)
# print(f"Total rows in original: {len(df)}")

In [78]:
# # Show only percent change for each category
# print("Category percent change analysis:")
# print("=" * 50)

# # Get counts for both datasets
# original_counts = df['file_cat'].value_counts()
# filtered_counts = df_filtered['file_cat'].value_counts()

# # Create simple comparison
# comparison_df = pd.DataFrame({
#     'Original_Count': original_counts,
#     'Filtered_Count': filtered_counts.fillna(0)
# })

# # Calculate percent change
# comparison_df['Percent_Change'] = ((comparison_df['Filtered_Count'] - comparison_df['Original_Count']) / comparison_df['Original_Count'] * 100).round(1)

# # Sort by percent change (largest decreases first)
# comparison_df = comparison_df.sort_values('Percent_Change')

# # Show only the percent change
# # print(comparison_df[['Original_Count', 'Filtered_Count', 'Percent_Change']])
# print(comparison_df[[ 'Percent_Change']])

In [85]:
import json
import random

# Set random seed for reproducibility (optional)
random.seed(41)

# Filter to only rows that have allowance_phrase (not null/empty)
valid_phrases = df_filtered[df_filtered['allowance_phrase'].notna() & (df_filtered['allowance_phrase'] != '')]

# Get 30 random samples
sample_size = min(30, len(valid_phrases))
random_samples = valid_phrases.sample(n=sample_size, random_state=42)

# Create the data structure for JSON
samples_data = []
for idx, row in random_samples.iterrows():
    sample = {
        "NAID": int(row['NAID']),
        "allowance_phrase": row['allowance_phrase']
    }
    samples_data.append(sample)

# Save to JSON file
output_file = "allowance_phrase_samples_2.json"
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(samples_data, f, indent=2, ensure_ascii=False)

print(f"Saved {len(samples_data)} random samples to {output_file}")
print(f"Sample NAIDs: {[sample['NAID'] for sample in samples_data[:5]]}...")


Saved 30 random samples to allowance_phrase_samples_2.json
Sample NAIDs: [196448950, 54311592, 54250294, 54779881, 54653690]...
