In [11]:
import base64
import mimetypes
import os

from anthropic import Anthropic
from dotenv import load_dotenv

# Load settings from a local .env file (if any) before reading the environment.
load_dotenv()

# Fail fast when the key is missing or empty instead of erroring later on the
# first API call.
api_key = os.environ.get("ANTHROPIC_API_KEY")
if not api_key:
    raise ValueError("No ANTHROPIC_API_KEY found in environment variables.")

# Shared client used by the request cell further down.
client = Anthropic(api_key=api_key)

In [12]:
# The data directory of png files to transcribe
# Download national archive files here:
# https://evidence-hou.se/events/big-llm-hack-24/data/correspondence.html
# dir = r"C:\Users\pobey\My Drive\Adam\Hackathon\data"

# Define the path to the image.
# NOTE: this is a machine-specific absolute path; point it at your local copy
# of the downloaded data before running.
image_path = '/Users/oscarmoxon/Desktop/AI Projects/hackathon/data_files/PURSGLOVE/2024-04-10_Defence_e8d.png'

# Derive the media type from the file extension so it cannot drift out of sync
# with image_path when a different file is selected.
image_media_type = mimetypes.guess_type(image_path)[0]
if image_media_type is None or not image_media_type.startswith('image/'):
    raise ValueError(f"Cannot determine an image media type for {image_path!r}")

In [13]:
def prompt(field_specs=None):
    """Build the transcription-and-extraction prompt sent alongside the image.

    Args:
        field_specs: Optional iterable of dicts, each providing 'title', 'tag'
            and 'description' keys. When omitted, the module-level ``fields``
            list is used, so existing zero-argument calls keep working.

    Returns:
        The prompt text: a request for a full transcription wrapped in
        <transcription> tags, followed by one bullet per field naming the tag
        the model should wrap that field in.
    """
    specs = fields if field_specs is None else field_specs

    bullet_lines = []
    for spec in specs:
        description = spec['description'].rstrip()
        # Some descriptions already end with a full stop; only append one when
        # it is missing so the prompt never contains "..".
        if not description.endswith(('.', '!', '?')):
            description += '.'
        bullet_lines.append(
            f"- {spec['title']}: {description} Wrap this field in <{spec['tag']}> tags."
        )
    fields_string = "\n".join(bullet_lines)

    # .strip() removes the indentation and newlines around the template body.
    return f"""
        Transcribe the text in this image in full, wrapped in <transcription> tags.

Please also extract the following fields:

{fields_string}
        """.strip()

# fields = [
#   {
#     'title': "Department",
#     'tag': 'department',
#     'description': 'The government department which has written the response to the query (in Capital Letters), or N/A if unknown'
#   },
#   {
#     'title': "Subject",
#     'tag': 'subject',
#     'description': 'The specifics of the nature of the contact e.g. Arms Trade: Israel, otherwise infer this yourself from the context'
#   },
#   {
#     'title': "Constituent",
#     'tag': 'constituent',
#     'description': 'The name of the constituent who has written the question'
#   },
#   {
#     'title': "Member of Parliament",
#     'tag': 'mp',
#     'description': "The name of the Member of Parliament who has responded to written the constituent's question"
#   }
# ]

# Fields the model is asked to extract in addition to the full transcription.
# Each entry supplies the bullet title, the XML-style tag the model should wrap
# its answer in, and the instruction text inserted into the prompt by prompt().
fields = [
  {
    'title': "Department",
    'tag': 'department',
    'description': 'This is always capitalised. The government department which has written the response to the query.'
  },
  {
    'title': "Subject",
    'tag': 'subject',
    'description': 'The specifics of the nature of the contact e.g. Arms Trade: Israel, otherwise infer this yourself from the context. Usually a colon separates the subject from a short description. Include this.'
  },
  {
    'title': "Questioner",
    'tag': 'questioner',
    'description': 'The name of the person asking the question. There may be multiple names of questioners, they are identified by not being indented.'
  },
  {
    'title': "ID",
    'tag': 'id',
    'description': 'The ID of the response in square brackets. Also return the URL associated with the hyperlink.'
  },
  {
    'title': "Question",
    'tag': 'question',
    'description': 'The question being asked. There may be multiple questions being asked, they are identified by not being indented.'
  },
  {
    'title': "Respondent",
    'tag': 'respondent',
    # Previously read "responded to written the question", which was garbled.
    'description': "Identify this by the indent. The name of the Member of Parliament who has responded to the written question"
  },
  {
    'title': "Answer",
    'tag': 'answer',
    'description': "Identify this by the indent. The answer provided by Parliament to the question."
  }
]

# Preview the prompt text built from the current `fields` list before sending it.
print(prompt())

Transcribe the text in this image in full, wrapped in <transcription> tags.

Please also extract the following fields:

- Department: The government department which has written the response to the query (in Capital Letters), or N/A if unknown. Wrap this field in <department> tags.
- Subject: The specifics of the nature of the contact e.g. Arms Trade: Israel, otherwise infer this yourself from the context. Wrap this field in <subject> tags.
- Constituent: The name of the constituent who has written the question. Wrap this field in <constituent> tags.
- Member of Parliament: The name of the Member of Parliament who has responded to written the constituent's question. Wrap this field in <mp> tags.


In [14]:
# os.makedirs(os.path.join(dir, transcripts_dir_name), exist_ok=True)

# Load the image and encode it as base64 text, the form used for the inline
# image source in the request below.
with open(image_path, "rb") as image_file:
    image_data = base64.b64encode(image_file.read()).decode("utf-8")

# Send one user message containing the image followed by the text instructions.
response = client.messages.create(
    model="claude-3-haiku-20240307",
    max_tokens=1024,  # upper bound on the reply length; long pages may hit it
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": image_media_type,
                        "data": image_data,
                    },
                },
                {"type": "text", "text": prompt()}
            ],
        }
    ],
)

# The reply content is a list of blocks; collect every text block rather than
# assuming the first block is the (only) text one.
parsed_text = "".join(
    block.text for block in response.content if getattr(block, "type", None) == "text"
)

# Output the transcribed text
print("Transcribed Text:")
print(parsed_text)

# Surface truncation explicitly: a reply cut off at max_tokens would otherwise
# look like a complete transcription with missing fields.
if response.stop_reason == "max_tokens":
    print("Warning: response stopped at max_tokens; the transcription may be incomplete.")

Transcribed Text:
<transcription>
Visas: Skilled Workers

Deidre Brock:

To ask the Secretary of State for the Home Department, with reference to the Migration Advisory Committee's Rapid review of the Immigration Salary List, published on 23 February 2024, what reason his Department has not implemented the recommendation on the use of the Immigration Salary List beyond the skilled worker route for asylum seekers.

Tom Pursglove:
Replacing the Shortage Occupation List with the new Immigration Salary List will maintain the important principles that underpin our approach to permission to work and is in line with wider changes to the Immigration Rules.

Unrestricted access to employment could act as an incentive for more migrants to choose to come here illegally, with many making dangerous journeys across the Channel and supporting the business model of evil people smugglers, rather than taking the first safe country they reach.

The Government considers it important to distinguish between