#### normalize llm extracted values to return valid data points for frontend use / visualization

1. frequency
2. yearly amount 
3. act date
4. place  

In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
import pandas as pd
import ast
# import json

In [3]:
from normalize_frequency import normalize_pension_frequency
from normalize_yearly_amount import normalize_yearly_amount
from normalize_pension_act_date import get_known_act_date
from normalize_state import normalize_place

#### load the input data - to run on a different dataset, change the CSV loaded below; the rest of the notebook can then be applied as is

In [4]:
# Load the pre-normalization sample. The cells below all read the
# `llm_extracted_pension_amount` column of this frame.
df = pd.read_csv(filepath_or_buffer='extracted_amounts_sample_1000_pre_normalization.csv')

In [5]:
# Quick sanity check of the freshly loaded frame.
# NOTE(review): a notebook cell only renders its last expression, so the
# `df.columns` line produces no visible output here; only `df.shape` is shown.
df.columns
df.shape

(872, 9)

##### frequency

In [6]:
def _payment_frequency_from_payload(payload):
    """Return the normalized pension frequency for one raw LLM payload.

    A string payload is parsed with ``ast.literal_eval``; a dict payload is
    used directly. Any other value (for example a missing cell) yields
    'unknown' without calling the normalizer.
    """
    if not isinstance(payload, (str, dict)):
        return 'unknown'
    record = ast.literal_eval(payload) if isinstance(payload, str) else payload
    return normalize_pension_frequency(record.get('pension_frequency'))


# New column: one normalized frequency label per row.
df['normalized_payment_frequency'] = df['llm_extracted_pension_amount'].apply(
    _payment_frequency_from_payload
)

In [7]:
# Row count per normalized frequency label; missing values are counted too.
frequency_counts = df['normalized_payment_frequency'].value_counts(dropna=False)
print(frequency_counts)

normalized_payment_frequency
annual         675
monthly        162
unknown         31
semi-annual      4
Name: count, dtype: int64


##### yearly amount

In [8]:
def _yearly_amount_for_row(row):
    """Return the normalized yearly pension amount for one dataframe row.

    Reads the raw LLM payload (a string parsed with ``ast.literal_eval``, or a
    dict used as-is) and passes its 'pension_amount' together with the row's
    already-computed 'normalized_payment_frequency' to
    ``normalize_yearly_amount``. Rows whose payload is neither a string nor a
    dict yield None.
    """
    payload = row['llm_extracted_pension_amount']
    if not isinstance(payload, (str, dict)):
        return None
    record = ast.literal_eval(payload) if isinstance(payload, str) else payload
    return normalize_yearly_amount(
        record.get('pension_amount'),
        row['normalized_payment_frequency'],
    )


# Row-wise apply because the helper needs two columns of each row.
df['normalized_yearly_amount'] = df.apply(_yearly_amount_for_row, axis=1)

In [9]:
# Summary statistics of the normalized yearly amounts.
yearly_amount_summary = df['normalized_yearly_amount'].describe()
print(yearly_amount_summary)

count      838.000000
mean       133.096021
std       1042.036960
min          2.000000
25%         33.505000
50%         80.000000
75%         96.000000
max      28932.000000
Name: normalized_yearly_amount, dtype: float64


#### get known act dates

In [10]:
def _known_act_date_for_row(row):
    """Return the known act date for the 'pension_act' of one dataframe row.

    The raw LLM payload is parsed with ``ast.literal_eval`` when it is a
    string and used directly when it is a dict; its 'pension_act' value is
    handed to ``get_known_act_date``. Any other payload type yields None.
    """
    payload = row['llm_extracted_pension_amount']
    if not isinstance(payload, (str, dict)):
        return None
    record = ast.literal_eval(payload) if isinstance(payload, str) else payload
    return get_known_act_date(record.get('pension_act'))


df['known_act_date'] = df.apply(_known_act_date_for_row, axis=1)

In [11]:
# Row count per resolved act date; rows with no known date are counted too.
known_act_date_counts = df['known_act_date'].value_counts(dropna=False)
print(known_act_date_counts)

known_act_date
None          296
1832-06-07    251
1818-03-18    126
1838-07-07     94
1843-03-03     69
1836-07-04     29
1855-03-03      4
1828-05-15      2
1820-05-01      1
Name: count, dtype: int64


#### normalize place

In [12]:
from normalize_state import get_non_standard_places
from normalize_state import state_mapping

In [13]:
def _place_from_payload(payload):
    """Return ``normalize_place`` applied to the 'place' of one raw payload.

    String payloads are parsed with ``ast.literal_eval``; dict payloads are
    used directly. Any other value yields None without calling the normalizer.
    """
    if not isinstance(payload, (str, dict)):
        return None
    record = ast.literal_eval(payload) if isinstance(payload, str) else payload
    return normalize_place(record.get('place'))


# extracted_place keeps whatever normalize_place returned for the row.
df['extracted_place'] = df['llm_extracted_pension_amount'].apply(_place_from_payload)

# Keys of state_mapping are treated as the set of accepted place values.
standard_states = set(state_mapping.keys())


def _standard_place_or_none(place):
    """Keep `place` only when it is one of `standard_states`; otherwise None."""
    if place in standard_states:
        return place
    return None


# normalized_place is extracted_place filtered down to the accepted values.
df['normalized_place'] = df['extracted_place'].apply(_standard_place_or_none)

In [14]:
# Row count per normalized place; rows without a standard place are counted too.
normalized_place_counts = df['normalized_place'].value_counts(dropna=False)
display(normalized_place_counts)

normalized_place
new york           152
None               115
massachusetts       70
new hampshire       53
pennsylvania        48
connecticut         48
virginia            47
maine               46
vermont             45
ohio                44
north carolina      39
kentucky            35
tennessee           31
new jersey          20
indiana             15
georgia             14
rhode island        13
alabama              8
maryland             7
illinois             7
michigan             4
missouri             4
mississippi          4
south carolina       1
iowa                 1
washington d.c.      1
Name: count, dtype: int64

In [15]:
# Show whatever get_non_standard_places reports for the current frame.
non_standard_places = get_non_standard_places(df)
display(non_standard_places)

Non-standard normalized place values (not in state_mapping):


normalized_place
None    115
Name: count, dtype: int64

In [16]:
# Preview the first five rows, including the columns added so far.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,NAID,naraURL,title,pageObjectId,pageURL,file_cat,llm_extracted_pension_amount,llm_extracted_pension_amount_dollars,normalized_payment_frequency,normalized_yearly_amount,known_act_date,extracted_place,normalized_place
0,0,111708949,https://catalog.archives.gov/id/111708949,Revolutionary War Pension and Bounty Land Warr...,111708987,https://s3.amazonaws.com/NARAprodstorage/lz/mi...,widow,"{'applicant_name': 'Eunice Wilder', 'applicant...",True,annual,30.0,1838-07-07,ohio,ohio
1,1,196100358,https://catalog.archives.gov/id/196100358,Revolutionary War Pension and Bounty Land Warr...,196100360,https://s3.amazonaws.com/NARAprodstorage/lz/mi...,soldier,"{'applicant_name': 'George Keller', 'applicant...",True,annual,60.0,1832-06-07,Elgin,
2,2,111503547,https://catalog.archives.gov/id/111503547,Revolutionary War Pension and Bounty Land Warr...,111503552,https://s3.amazonaws.com/NARAprodstorage/lz/mi...,widow,"{'applicant_name': 'Mildred Hart', 'applicant_...",True,annual,26.1,1838-07-07,virginia,virginia
3,3,54362674,https://catalog.archives.gov/id/54362674,Revolutionary War Pension and Bounty Land Warr...,54362689,https://s3.amazonaws.com/NARAprodstorage/opast...,widow,"{'applicant_name': 'Mary Date', 'applicant_typ...",True,annual,44.0,1838-07-07,kentucky,kentucky
4,4,54330151,https://catalog.archives.gov/id/54330151,Revolutionary War Pension and Bounty Land Warr...,54330216,https://s3.amazonaws.com/NARAprodstorage/opast...,widow,"{'applicant_name': 'John Crawford', 'applicant...",True,annual,80.0,1832-06-07,ohio,ohio


#### add applicant type as a new column for easier frontend (FE) use

In [17]:
def _applicant_type_from_payload(payload):
    """Return the raw 'applicant_type' value of one LLM payload.

    String payloads are parsed with ``ast.literal_eval``; dict payloads are
    used directly. Any other value yields None.
    NOTE(review): the value is passed through exactly as stored in the
    payload; no normalization is applied in this cell.
    """
    if not isinstance(payload, (str, dict)):
        return None
    record = ast.literal_eval(payload) if isinstance(payload, str) else payload
    return record.get('applicant_type')


df['extracted_applicant_type'] = df['llm_extracted_pension_amount'].apply(
    _applicant_type_from_payload
)

In [21]:
# Row count per extracted applicant type; missing values are counted too.
applicant_type_counts = df['extracted_applicant_type'].value_counts(dropna=False)
print(applicant_type_counts)

extracted_applicant_type
widow      424
soldier    421
None        24
unknown      2
null         1
Name: count, dtype: int64


In [19]:
# Preview the first five rows with every derived column in place.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,NAID,naraURL,title,pageObjectId,pageURL,file_cat,llm_extracted_pension_amount,llm_extracted_pension_amount_dollars,normalized_payment_frequency,normalized_yearly_amount,known_act_date,extracted_place,normalized_place,extracted_applicant_type
0,0,111708949,https://catalog.archives.gov/id/111708949,Revolutionary War Pension and Bounty Land Warr...,111708987,https://s3.amazonaws.com/NARAprodstorage/lz/mi...,widow,"{'applicant_name': 'Eunice Wilder', 'applicant...",True,annual,30.0,1838-07-07,ohio,ohio,widow
1,1,196100358,https://catalog.archives.gov/id/196100358,Revolutionary War Pension and Bounty Land Warr...,196100360,https://s3.amazonaws.com/NARAprodstorage/lz/mi...,soldier,"{'applicant_name': 'George Keller', 'applicant...",True,annual,60.0,1832-06-07,Elgin,,soldier
2,2,111503547,https://catalog.archives.gov/id/111503547,Revolutionary War Pension and Bounty Land Warr...,111503552,https://s3.amazonaws.com/NARAprodstorage/lz/mi...,widow,"{'applicant_name': 'Mildred Hart', 'applicant_...",True,annual,26.1,1838-07-07,virginia,virginia,widow
3,3,54362674,https://catalog.archives.gov/id/54362674,Revolutionary War Pension and Bounty Land Warr...,54362689,https://s3.amazonaws.com/NARAprodstorage/opast...,widow,"{'applicant_name': 'Mary Date', 'applicant_typ...",True,annual,44.0,1838-07-07,kentucky,kentucky,widow
4,4,54330151,https://catalog.archives.gov/id/54330151,Revolutionary War Pension and Bounty Land Warr...,54330216,https://s3.amazonaws.com/NARAprodstorage/opast...,widow,"{'applicant_name': 'John Crawford', 'applicant...",True,annual,80.0,1832-06-07,ohio,ohio,soldier


In [22]:
# Persist the normalized frame for downstream (frontend / visualization) use.
# index=False: the RangeIndex carries no information, and writing it adds one
# more unnamed column on every save/load round trip (the loaded frame already
# contains 'Unnamed: 0' and 'Unnamed: 0.1' from earlier exports).
df.to_csv('extracted_amounts_sample_1000_post_normalization.csv', index=False)