# Feature Extraction from Parsed PDFs

In [None]:
# SETUP

import pandas as pd
import re
from datetime import datetime
from pathlib import Path
import sys

# Add the parent of the current working directory to the import path
sys.path.append(str(Path.cwd().parent))

Sample content loaded!
Length: 1521 characters


In [None]:
# LOAD SAMPLE

# Sample parsed content from record0.pdf
sample_content = """User: PITKIN
# Napa Police Department

04/05/2023 17:34:42

# Case Management Tracking

| Time                | Action     | Description                                                                                                                                                    | Officer          | Hours Spent |
| ------------------- | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------- | ----------- |
| 06/02/2022 07:35:11 | OFFICER    | (320140) PIERSIG, PETER assigned (320126) UPCHURCH, KYLE to Case as LEAD INVESTIGATOR                                                                          | Piersig, Peter   | 0.00        |
| 06/02/2022 11:08:20 | CLEARED BY | Closed by arrest made by (320494) BARRERA, ADAM                                                                                                                | Barrera, Adam    | 0.00        |
| 06/13/2022 12:52:23 | CMD.       | CMD. REVIEW: The Use of Force as described by the involved officers appeared lawful, reasonable, necessary and within policy- CH109                            | Haag, Chase      | 0.50        |
| 07/08/2022 15:31:54 | CMD.       | CMD. REVIEW: Upon reviewing the associated police reports and pertinent BWCs, I believe the use of force was reasonable, necessary and within policy. FR #237. | Rodriguez, Fabio | 1.00        |

R_Track1    Page  1    Printed: April 5, 2023
"""

print(f"Length: {len(sample_content)} characters")

Sample content loaded
Length: 1521 characters


In [None]:
# EXTRACT DATA FROM MARKDOWN TABLES

"""
Markdown table example:
| Header1 | Header2 |
| ------- | ------- |
| Data1   | Data2   |
"""

def _split_markdown_row(line):
    """Return the stripped cell texts of one table line that starts with ``|``."""
    cells = line.split('|')[1:]  # the piece before the leading pipe is always empty
    if line.endswith('|'):
        # A closing pipe leaves one empty trailing piece; a row without a
        # closing pipe does not, so its last cell must be kept.
        cells = cells[:-1]
    return [cell.strip() for cell in cells]


def _is_delimiter_row(cells):
    """Return True when every cell is a header delimiter such as ``---`` or ``:--:``."""
    return bool(cells) and all(
        '-' in cell and set(cell) <= {'-', ':'} for cell in cells
    )


def extract_table_from_markdown(markdown_text):
    """Parse the pipe-delimited markdown table rows of *markdown_text* into a DataFrame.

    Every line that starts with ``|`` (after stripping surrounding whitespace)
    is treated as a table row. The first such row supplies the column names;
    the remaining rows become data, with each cell kept as a stripped string.

    Delimiter rows (``| --- | :---: |``) and verbatim repeats of the header
    row are dropped wherever they occur, not only directly below the header,
    so text that contains the same table split into several chunks yields
    data rows only. A row that has no closing pipe keeps its last cell, and
    rows with fewer cells than the header are padded with ``None``.

    Args:
        markdown_text: Markdown source that may contain table rows.

    Returns:
        A ``pandas.DataFrame`` with one column per header cell, or ``None``
        when fewer than two table lines are present.

    Raises:
        ValueError: Raised by pandas when a row has more cells than the header.
    """
    stripped_lines = (line.strip() for line in markdown_text.split('\n'))
    table_lines = [line for line in stripped_lines if line.startswith('|')]

    if len(table_lines) < 2:
        return None

    headers = _split_markdown_row(table_lines[0])

    data_rows = []
    for line in table_lines[1:]:
        row = _split_markdown_row(line)
        # Skip structural rows by content rather than by position, so a
        # header followed directly by data does not lose its first record.
        if _is_delimiter_row(row) or row == headers:
            continue
        row.extend([None] * (len(headers) - len(row)))
        data_rows.append(row)

    return pd.DataFrame(data_rows, columns=headers)


# Parse the sample report text; the result is None when the text contains
# fewer than two table lines.
df_actions = extract_table_from_markdown(sample_content)
# Bare expression so the notebook displays the parsed frame.
df_actions

Unnamed: 0,Time,Action,Description,Officer,Hours Spent
0,06/02/2022 07:35:11,OFFICER,"(320140) PIERSIG, PETER assigned (320126) UPCH...","Piersig, Peter",0.0
1,06/02/2022 11:08:20,CLEARED BY,"Closed by arrest made by (320494) BARRERA, ADAM","Barrera, Adam",0.0
2,06/13/2022 12:52:23,CMD.,CMD. REVIEW: The Use of Force as described by ...,"Haag, Chase",0.5
3,07/08/2022 15:31:54,CMD.,CMD. REVIEW: Upon reviewing the associated pol...,"Rodriguez, Fabio",1.0


In [20]:
# Parse the 'Time' strings into datetimes using an explicit
# month/day/year hour:minute:second format.
df_actions['Time'] = pd.to_datetime(df_actions['Time'], format='%m/%d/%Y %H:%M:%S')

# Convert the remaining columns to appropriate data types:
# 'Hours Spent' becomes numeric, the text columns are cast to str.
df_actions['Hours Spent'] = pd.to_numeric(df_actions['Hours Spent'])
string_columns = ['Action', 'Description', 'Officer']
df_actions[string_columns] = df_actions[string_columns].astype(str)


# Show the resulting dtypes, then display the converted frame.
print(df_actions.dtypes)
df_actions

Time           datetime64[ns]
Action                 object
Description            object
Officer                object
Hours Spent           float64
dtype: object


Unnamed: 0,Time,Action,Description,Officer,Hours Spent
0,2022-06-02 07:35:11,OFFICER,"(320140) PIERSIG, PETER assigned (320126) UPCH...","Piersig, Peter",0.0
1,2022-06-02 11:08:20,CLEARED BY,"Closed by arrest made by (320494) BARRERA, ADAM","Barrera, Adam",0.0
2,2022-06-13 12:52:23,CMD.,CMD. REVIEW: The Use of Force as described by ...,"Haag, Chase",0.5
3,2022-07-08 15:31:54,CMD.,CMD. REVIEW: Upon reviewing the associated pol...,"Rodriguez, Fabio",1.0


In [None]:
# EXTRACT FEATURES

def extract_features_from_table(df):
    """Summarise a case-tracking action table as a flat feature dictionary.

    The frame must provide the columns ``Time``, ``Action``, ``Description``,
    ``Officer`` and ``Hours Spent``. ``Time`` and ``Hours Spent`` may already
    be typed or may still be raw strings: timestamps are passed through
    ``pd.to_datetime`` (format inferred by pandas for strings) and hours
    through ``pd.to_numeric``, where unparseable hour cells count as missing.

    Args:
        df: DataFrame holding one row per logged action.

    Returns:
        A dict with these keys, in this order:
        ``num_actions``, ``num_officers``, ``start_date``, ``end_date``,
        ``case_duration_days`` (whole days between first and last action, or
        ``None`` when the table has no usable timestamp), ``total_hours``,
        ``action_types`` (action name -> count) and the boolean keyword flags
        ``involves_use_of_force``, ``involves_arrest`` and ``involves_bwc``.

    Raises:
        KeyError: If one of the required columns is missing.
        ValueError: Raised by pandas when a ``Time`` string cannot be parsed.
    """
    times = pd.to_datetime(df['Time'])
    # Summing raw string cells would concatenate them, so coerce to numbers first.
    hours = pd.to_numeric(df['Hours Spent'], errors='coerce')
    # Missing descriptions become empty strings; lower-case once for all flags.
    descriptions = df['Description'].fillna('').astype(str).str.lower()

    features = {}

    features['num_actions'] = len(df)
    features['num_officers'] = df['Officer'].nunique()

    start_date = times.min()
    end_date = times.max()
    features['start_date'] = start_date
    features['end_date'] = end_date
    if pd.isna(start_date) or pd.isna(end_date):
        # Empty table (or no valid timestamps): there is no duration to report.
        features['case_duration_days'] = None
    else:
        features['case_duration_days'] = (end_date - start_date).days
    features['total_hours'] = hours.sum()

    features['action_types'] = df['Action'].value_counts().to_dict()

    # 'force' must start a word so that "enforcement" or "reinforce" do not
    # count as use of force; "force", "forced" and "forces" still match.
    features['involves_use_of_force'] = bool(
        descriptions.str.contains(r'\bforce', regex=True).any()
    )
    features['involves_arrest'] = bool(
        descriptions.str.contains('arrest', regex=False).any()
    )
    features['involves_bwc'] = bool(
        descriptions.str.contains('bwc', regex=False).any()
    )

    return features

# Build the feature dictionary for the parsed sample table.
features = extract_features_from_table(df_actions)

# Print one "name: value" line per feature; each name is padded to
# 30 characters so the values line up in a column.
print("Extracted Features:")
for key, value in features.items():
    print(f"{key:30s}: {value}")

Extracted Features:
num_actions                   : 4
num_officers                  : 4
start_date                    : 2022-06-02 07:35:11
end_date                      : 2022-07-08 15:31:54
case_duration_days            : 36
total_hours                   : 1.5
action_types                  : {'CMD.': 2, 'OFFICER': 1, 'CLEARED BY': 1}
involves_use_of_force         : True
involves_arrest               : True
involves_bwc                  : True
