In [None]:
import re, os, json, time
import ipywidgets as widgets
import pandas as pd

from pdfminer.high_level import extract_text
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from openai import OpenAI
from io import StringIO


# Input/output locations -- replace these placeholders before running the notebook.
pdf_path = "ADD_PATH_TO_FILE"
word_template_path = "ADD_PATH_OF_TEMPLATE"
word_output_path = "ADD_PATH_OF_OUTPUT"

# OpenAI client. The key is read from the OPENAI_API_KEY environment variable;
# passing it explicitly matches the library default and could be omitted.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [None]:
# Smoke test: send one fixed question through the Responses API and print the
# reply, to check that the client and API key work.
# NOTE(review): the instructions string ends mid-sentence ("... from the ") --
# confirm whether part of the text is missing.
response = client.responses.create(
    input="How do I check if a Python object is an instance of a class?",
    instructions="You are helping to extract specific fields from the ",
    model="gpt-4o-mini-2024-07-18",
)
print(response.output_text)

In [15]:
def call_llm_model(instructions, input_text, entry_max_length,
                   window_size=40000, min_response_length=100,
                   max_retries=None, retry_delay=3):
    """Send `input_text` to the LLM in overlapping windows and collect the replies.

    The text is cut into windows of at most `window_size` characters. Every
    window after the first starts `entry_max_length` characters before the end
    of the previous one, so an entry that straddles a window boundary appears
    whole in at least one window (as long as it is no longer than
    `entry_max_length`).

    A window is retried when the request raises or when the reply is not
    longer than `min_response_length` characters.

    Args:
        instructions: System instructions passed to the model for every window.
        input_text: The full text to process.
        entry_max_length: Number of characters of overlap between consecutive
            windows.
        window_size: Maximum number of new characters per window.
        min_response_length: Replies of this length or shorter are treated as
            failures and retried.
        max_retries: Maximum number of retries per window after the first
            attempt. ``None`` (the default) retries indefinitely.
        retry_delay: Seconds to wait between attempts.

    Returns:
        A list with one reply text per window, in window order. Empty when
        `input_text` is empty.

    Raises:
        ValueError: If `window_size` is not positive.
        RuntimeError: If a window still fails after `max_retries` retries.
    """
    if window_size <= 0:
        # A non-positive window would never advance through the text.
        raise ValueError("window_size must be positive")

    print(len(input_text))

    responses_list = []

    # Nothing to process: do not send an empty request to the API.
    if not input_text:
        return responses_list

    req_window = min(len(input_text), window_size)

    index_end = req_window
    while True:

        index_start = max(index_end - req_window - entry_max_length, 0)
        chunk = input_text[index_start:index_end]

        print(f"Processing from index {index_start} to {index_end}...")

        attempt = 0
        while True:
            attempt += 1
            # Reset on every attempt so a failed request can never fall back on
            # the reply of a previous attempt or of a previous window.
            response = None
            try:
                response = client.responses.create(
                    model="gpt-4o-2024-08-06",  # alternative: "gpt-4o-mini-2024-07-18"
                    instructions=instructions,
                    input=chunk,
                )
                failure = response
            except Exception as exc:
                # `Exception` (not a bare except) lets KeyboardInterrupt stop the loop.
                failure = exc

            output_text = response.output_text if response is not None else None
            if output_text and len(output_text) > min_response_length:
                break

            print('error', failure)
            if max_retries is not None and attempt > max_retries:
                raise RuntimeError(
                    f"No usable response for text[{index_start}:{index_end}] "
                    f"after {attempt} attempts"
                )
            print(f'retry in {retry_delay} seconds')
            time.sleep(retry_delay)

        responses_list.append(output_text)

        if len(input_text) <= index_end:
            print("Reached the end of the text.")
            break

        index_end += req_window

    return responses_list

In [49]:
# Extract the text of the PDF at `pdf_path`. While testing, extraction can be
# limited to a page range, e.g. extract_text(pdf_path, page_numbers=range(0, 10)).
pdf_text = extract_text(pdf_path)

In [None]:
# Show the extracted text in an editable text area so it can be cleaned by hand;
# the edited value is read back from `text_widget.value` afterwards.
text_widget = widgets.Textarea(
    value=pdf_text,
    layout={'width': '100%', 'height': '600px'},
)
display(text_widget)

In [51]:
# Read the text back from the widget so that edits made by hand in the text area are kept.
pdf_cleaned = text_widget.value  # Captures human-edited version

# Split on the literal 'FINE SEZIONE' marker; the resulting parts are used as
# separate sections by the cells below.
pdf_sections = pdf_cleaned.split('FINE SEZIONE')

# Stop early if the text does not split into exactly three parts.
assert len(pdf_sections) == 3, "Expected 3 sections in the PDF text"

In [None]:
# Prompt used to extract the list of individuals from the first PDF section.
# The pieces are joined by implicit string concatenation, so every piece must end
# with a space; otherwise consecutive sentences are glued together in the prompt.
instructions = (
    "You are a text-processing assistant. Your task is to parse text. You have a list of individuals [indicated by a running index 1), 2), 3) or 1. 2. 3.] with their anagraphical information; "
    "Extract each individual's information in this format (index, the page of pdf in which the individual data is, name, aliases, place of birth, birth date, residence). "
    "If an entry has the field 'stralcio' or 'posizione stralciata' then make 'posizione stralciata' the name and put NA in the other fields. "
    "The output should be a JSON that I can import in Pandas with: pd.read_json(response.output_text.strip(), orient='records'). "
    "Ignore any irrelevant content such as headers, footers, and introductory text. If any field is missing, use NA for that field. "
    )

input_text = pdf_sections[0]

# 500 characters of overlap between consecutive windows.
responses_names = call_llm_model(instructions, input_text, entry_max_length=500)


In [None]:
# Inspect the raw LLM replies (one string per processed window) before parsing them.
responses_names

In [None]:
def _read_llm_records(raw_reply):
    """Parse one LLM reply into a DataFrame.

    Markdown code fences are stripped first. The prompt asks for a JSON array of
    records, but a reply may also come back as JSON Lines (one object per line).
    `pd.read_json` needs `lines=True` for the latter and must not get it for the
    former, so the parser is chosen from the first character of the payload.
    """
    payload = raw_reply.replace('```json', '').replace('```', '').strip()
    is_json_array = payload.startswith('[')
    return pd.read_json(StringIO(payload), orient='records', lines=not is_json_array)


# One DataFrame per non-empty reply.
dfs = [_read_llm_records(x) for x in responses_names if x and x.strip()]

# Windows overlap, so the same entry can be returned twice: keep the first
# occurrence of every index and order the rows by index.
df_names = (
    pd.concat(dfs, ignore_index=True)
    .drop_duplicates(subset=['index'])
    .sort_values(by='index')
    .reset_index(drop=True)
)

In [None]:
# Inspect the parsed table of individuals.
df_names

In [None]:
# Show the first section for visual inspection. A dedicated variable is used so
# that `text_widget` (the widget holding the hand-edited full text, read back via
# `text_widget.value`) is not overwritten by this preview.
names_section_widget = widgets.Textarea(
    value=pdf_sections[0],
    layout={'width': '100%', 'height': '600px'},
)
display(names_section_widget)

In [None]:
# Show the second section for visual inspection. A dedicated variable is used so
# that `text_widget` (the widget holding the hand-edited full text, read back via
# `text_widget.value`) is not overwritten by this preview.
crimes_section_widget = widgets.Textarea(
    value=pdf_sections[1],
    layout={'width': '100%', 'height': '600px'},
)
display(crimes_section_widget)

In [None]:

# Prompt used to extract the list of crimes from the second PDF section.
# The pieces are joined by implicit string concatenation, so every piece must end
# with a space; otherwise consecutive sentences are glued together in the prompt.
# The '416_bis' and 'individuals' column names are spelled out because the cells
# that post-process the result read exactly those columns.
instructions = (
    "You are a text-processing assistant. Your task is to parse text. The input text is a list of crimes [indicated by a running index 1), 2), 3) or 1. 2. 3., or A), B), C)], "
    "each committed by a subset of the individuals. For a crime I'm interested to know if it's sanctioned by 416.bis law. "
    "Extract each crime's information in this format (index, the page of pdf in which the crime data is, column named '416_bis' to flag if the crime is sanctioned by the 416.bis law "
    "[fill this column with either Yes or No], column named 'individuals' with the list of individuals that committed this crime). "
    "Ensure that all the crimes are parsed out, even if they are not 416.bis. "
    "The output should be a JSON that I can import in Pandas with: pd.read_json(response.output_text.strip(), orient='records'). "
    "Ignore any irrelevant content such as headers, footers, and introductory text. If any field is missing, use NA for that field. "
    )

# NOTE: a possible extension is to append the known names to the prompt, e.g.
# "This is the list of individuals: " + df_names['name'].str.cat(sep=', ')

input_text = pdf_sections[1]

# 10000 characters of overlap between consecutive windows.
responses_crimes = call_llm_model(instructions, input_text, entry_max_length=10000)

In [None]:
# Parse every non-empty LLM reply (Markdown code fences removed) as a records JSON.
dfs2 = [
    pd.read_json(
        StringIO(reply.replace('```json', '').replace('```', '').strip()),
        orient='records',
    )
    for reply in responses_crimes
    if reply and reply.strip()
]

# Merge the per-window tables, keep the first row for every index and order by index.
df_crimes = (
    pd.concat(dfs2, ignore_index=True)
    .drop_duplicates(subset=['index'])
    .sort_values(by='index')
    .reset_index(drop=True)
)
df_crimes

In [None]:
# Show the second section again, next to the parsed crimes table, for a visual
# cross-check. A dedicated variable is used so that `text_widget` (the widget
# holding the hand-edited full text, read back via `text_widget.value`) is not
# overwritten by this preview.
crimes_section_widget = widgets.Textarea(
    value=pdf_sections[1],
    layout={'width': '100%', 'height': '500px'},
)
display(crimes_section_widget)

In [None]:
# Filter only 416.bis crimes. Casting to str keeps the comparison working when
# the column holds missing values or non-string entries.
df_416bis = df_crimes[df_crimes["416_bis"].astype(str).str.strip().str.lower() == "yes"]

# Get a set of all individuals involved in at least one 416.bis crime.
# explode() flattens the per-crime lists in one pass and tolerates cells that are
# not lists; dropna() removes the placeholders produced by empty or missing cells.
individuals_416bis = set(df_416bis["individuals"].explode().dropna())

print(len(individuals_416bis), individuals_416bis)

# Keep only rows that describe an actual person: drop missing names, the 'NA'
# placeholder and the 'posizione stralciata' entries. na=False prevents missing
# names from turning the mask into a non-boolean series.
name_col = df_names["name"]
is_real_person = (
    name_col.notna()
    & (name_col != 'NA')
    & ~name_col.str.contains('stralciata', na=False)
)

# assign() builds a new frame, so re-running this cell never writes a column
# into a filtered slice of an earlier frame.
df_names = df_names[is_real_person].assign(
    committed_416bis=lambda frame: frame["name"].isin(individuals_416bis)
)
df_names

In [None]:
# Load the original document
doc = Document(word_template_path)


def _numbered(names):
    """Return the names joined as a single '1) A, 2) B, ...' string."""
    return ', '.join(f'{i}) {person}' for i, person in enumerate(names, start=1))


# Placeholder text in the template -> text that replaces it. The order matters:
# a paragraph is filled with the first placeholder it contains.
replacements = {
    '############################## ADD PEOPLE ################################':
        _numbered(df_names['name'].tolist()),
    '############################## ADD 416bis PEOPLE ###############################':
        _numbered(df_names.loc[df_names['committed_416bis'], 'name'].tolist()),
}
missing_placeholders = set(replacements)

for paragraph in doc.paragraphs:
    for placeholder, new_text in replacements.items():
        if placeholder in paragraph.text:
            # Replace the whole paragraph content with the left-aligned list
            paragraph.clear()
            paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
            paragraph.add_run(new_text)
            missing_placeholders.discard(placeholder)
            break

# Make a template without the expected placeholders visible instead of silently
# saving a document in which nothing was filled in.
for placeholder in sorted(missing_placeholders):
    print(f"WARNING: placeholder not found in template: {placeholder}")

# Save the document to a new file
doc.save(word_output_path)
