In [None]:
# Install third-party packages into the kernel's environment.
# `anthropic` is imported by the analysis cell below; `pdf2image` is installed
# here but is not imported by any cell visible in this view of the notebook.
%pip install anthropic pdf2image

In [None]:
#!pip install docx2pdf
#from docx2pdf import convert

#convert("/content/attachments_2025_03_12/attachments/01b71cafbde54c30a28dbbb0b19961d1)36C26225P0768.docx", "content/output.pdf")

In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable
from google.colab import drive
drive.mount('/content/drive')

# (disabled) Create a folder in the root directory
#!mkdir -p "/content/drive/My Drive/attachments_2025_03_12"

# (disabled) Write a sample file to Google Drive
#with open('/content/drive/My Drive/My Folder/sample.txt', 'w') as f:
#  f.write('Hello, World!')
# (disabled) Remove a previously copied attachments folder
#!rm -rf "/content/attachments_2025_03_12/"

#!rm -rf "/content/attachments/"
# Copy the attachments folder from Drive into /content/.
# NOTE(review): process_opportunities builds attachment paths under
# drive/MyDrive/attachments_2026_01_21_2/attachments (relative to the input
# file), not under the folder copied here — confirm which one is intended.
!cp -r "/content/drive/My Drive/attachments_2026_01_21" "/content/"

In [None]:
import json
import os
import anthropic
import base64
from pathlib import Path
from google.colab import userdata
import time

def read_file_as_base64(file_path):
    """Return the contents of ``file_path`` as a base64-encoded UTF-8 string.

    Best-effort helper: any failure while opening, reading or encoding the
    file is reported on stdout and signalled to the caller by returning
    ``None`` instead of raising.
    """
    encoded_text = None
    try:
        with open(file_path, 'rb') as handle:
            raw_bytes = handle.read()
        encoded_text = base64.b64encode(raw_bytes).decode('utf-8')
    except Exception as exc:
        print(f"Error reading file {file_path}: {exc}")
    return encoded_text

def build_cached_system_prompt():
    """
    Build the system prompt with cached teaching materials.

    Returns:
        list[dict]: three ``{"type": "text", ...}`` system blocks, in order:
        teaching articles, annotated examples, extraction instructions.
        Each block carries ``cache_control: {"type": "ephemeral"}`` so it is
        marked as a prompt-cache breakpoint. The blocks are ordered from the
        text expected to change least to the text expected to change most.

    Prompt-text fixes relative to earlier revisions:
      * Example 3 now lists its quoted findings under
        ``compliance_over_outcomes_examples`` (they were under
        ``requirement_sprawl_examples``, contradicting the extraction rules).
      * Example 4's JSON no longer has a trailing comma (it was invalid JSON).
      * Field 4's "return empty array" rule now refers to question 3
        (compliance_over_outcomes) instead of question 1.
    """

    # Block 1: Teaching Articles (largest, changes least often)
    teaching_articles = {
        "type": "text",
        "text": """REFERENCE MATERIALS FOR SOLICITATION ANALYSIS

=== ARTICLE 1: Understanding Solicitation Sins ===

Solicitation Sin #1: Requirement Sprawl

One trap snarling solicitations is requirement sprawl, when interested
vendors have to address long lists of niche, ambiguous, and misguided
requirements. The basic mistake here is prejudging how a task should
be accomplished.

Example of requirements sprawl:
A performance work statement for an Army call center lists 11 bullets
detailing exactly how Application Liaisons "shall provide ongoing support,
then another paragraph dictating that liaisons should monitor call center reps'
service by "comparing the solution provided in the Incident
entered by the CSR and the solution provided in the Knowledge Database."

Excerpt from solicitiation showing requirement sprawl:
1.3.3.2 Subtask 2 – Application Liaison
The contractor shall provide liaison support for each application.
Liaisons shall coordinate issues, problems, resolutions, and specific system support with the COR and Area
Manager (AM). An AM is a government application manager who works in conjunction with that application’s
Program Manager. A list of AMs will be provided by the Government at contract award. Liaisons shall review
and analyze process rules based on information gathered and analyzed from SOP; ServiceNow, ICMS
statistics; and common customer problems. The Liaison shall provide ongoing support to include:
• Generate reports to include routine system reports
• Notify the AM to monitor high priority Incidents
• Update the AM on status of high priority Incidents received by the E2E SMC
• Ensure Incidents are kept updated when providing information on high priority servicesrequests
• Review, research and ensure complex customer Incidents that require escalation meet the stated criteria
• Provide system(s) checks on an approved schedule
• Identify and correct specific problems with individual CSRs that could impact customer service
• Develop scripts when required
• Ensure all CSRs have the appropriate training to provide a professional, quality, and knowledgeable
response to the customer
• Review E2E SMC public information and CSR/customer discussions to ensure security guidelines
are followed
• Identify and document Tier 2 requirements for each application and provide a listing and summary
of scope in the E2E SMC SOP.

Example of NOT requirement sprawl:
"The contractor shall provide a secure method to submit monthly reports in PDF or Excel format."
This is focused and mission-essential.

Example of NOT requirement sprawl:
"The interface should comply with Section 508 accessibility standards"
Legally mandates or safety specifications are not requirement sprawl.

The problem with requirement sprawl isn't just length, although long RFPs do
scare away potential responses. Sprawling requirements prejudge the
solution rather than invite vendors' best ideas.
Instead of Application Liaisons manually reviewing files,
you could automate data validation, or have reps provide peer
reviews to their coworkers. Here, the government will only allow one "how."
This "how" will add cost: vendors are incentivized to price a lot
of application liaisons into their
proposal to be responsive to the requirements.

Even worse, a "how" that makes sense now might not hold up to
policy changes, new tech needs, or a million other "unexpected"
(entirely predictable) implementation challenges.

Specificity isn’t always requirements sprawl — for example if you’re
building a safety-critical system or there are legal obligations
like accessibility or environmental protections.

=== ARTICLE 2: Compliance Over Outcomes ===

Solicitations err when they mandate QA (quality assurance) processes that
assess compliance instead of outcomes. When a quality assurance process assures
low quality, it’s time for some Q&A: question whether the "how" is
achieving an agency’s goals, and answer by changing course if necessary.

QA without Q&A generates copious documentation that a vendor stuck to
the plan, but doesn’t ask whether the key outcomes would be achieved.
There might not even be outcomes to target. Take "Task Area 1" (of 9) for
modernization of the Defense Transportation System's Integrated Booking System.
There are regular weekly, monthly, and quarterly meetings, each demanding
specific reports. The weekly reports by themselves need to talk about
personnel, development, maintenance, audit readiness, and cyber activities.

Example of compliance over outcomes:

"WEEKLY STATUS REPORT
The contractor shall provide a Weekly Status Report (WSR), shall outline the status of all new
application development, enhancement, and maintenance activities and shall contain issues for
discussion during the weekly development meeting attended by the IBS Program Manager (PM),
COR/ACOR, and selected contractor personnel. The weekly development meeting is traditionally
held on Wednesdays via teleconference; however, the schedule is subject to change. The contractor
shall provide the WSR no later than (NLT) the close of business (COB) on the day of the weekly
development meeting.
The WSR shall contain the following:
• Development in process
• Financial Improvement and Audit Readiness (FIAR) activities
• Cyber activities
• Releases
• Development planned
• Maintenance in process
• Maintenance planned
• Personnel leave"

Management-by-status reports can make sense if the tasks that need doing
can all be specified in advance, and one phase neatly proceeds from another.
Projects structured this way follow the “waterfall” model. Waterfall has
plunged out of use, and the popular alternative Agile prioritizes "responding
to change over following a plan." QA that centers compliance over outcomes is
a remnant of the waterfall era, when "responding to change" was a
transgression against the sacred, sequential plan.

NOT an example of compliance over outcomes:
"Program Management Review (PMR) Meeting. A management level
review "held by a System Contracting Office or Systems Program Manager (PM) for
the purpose of determining the status of program requirements and operational
health of the MMO. PMRs are designed as tools to communicate program
successes, identify problems, concerns, or areas for improvement,
if any, and to develop appropriate follow-up actions as required."
A quarterly meeting to review successes and identify problems is not excessive.
If the PMR is just one of many required meetings, though, this can be an
example of compliance over outcomes.


END OF REFERENCE MATERIALS
""",
        "cache_control": {"type": "ephemeral"}
    }

    # Block 2: Annotated Examples (medium size, occasional updates)
    annotated_examples = {
        "type": "text",
        "text": """ANNOTATED EXAMPLES OF ANALYSIS

=== EXAMPLE 1: Document with Requirement Sprawl ===

Title: "Cloud Storage Services"
Description: "The contractor shall provide cloud storage with: (1) 99.999% uptime SLA,
(2) support for 47 different file formats, (3) real-time collaboration for 10,000
concurrent users, (4) AI-powered search with natural language processing, (5) blockchain
audit trail, (6) integration with 25 legacy systems, (7) mobile apps for iOS, Android,
and Windows Phone, (8) API supporting REST, SOAP, GraphQL, and gRPC..."

CORRECT ANALYSIS:
{
  "has_requirement_sprawl": "yes",
  "requirement_sprawl_examples": [
    "Support for 47 different file formats - excessive for typical storage needs",
    "Windows Phone mobile app - platform is discontinued, not mission-essential",
    "Blockchain audit trail - adds complexity without clear mission need",
    "Multiple API protocols (REST, SOAP, GraphQL, gRPC) - one or two would suffice"
  ],
  "compliance_over_outcomes": null,
  "compliance_over_outcomes_examples": [],
  "contract_type": null,
  "performance_period": null
}

WHY: The solicitation includes many requirements that go beyond
basic cloud storage needs. The examples highlight specific requirements
that are excessive or unnecessary. There is not enough information in this
excerpt to evaluative compliance_over_outcomes or find contract_type,
performance_period, and evaluation_criteria.

=== Example 2: Document with Compliance Over Outcomes ===

Title: "Integrated Booking System (IBS)"
Description: "The contractor shall prepare a monthly
progress, status, and management
report summarizing work performed in
the previous month, cost status,
schedule status, current efforts,
completion/expiration dates, any
problems encountered, any travel
reports, and plans for the following
month."

WHY: The solicitation requires a monthly report which focuses on keeping
to a plan previously established (like "schedule status").

=== Example 3: Document with Compliance Over Outcomes ===

Title: "Enterprise Server Endpoint Management"
"The Contractor shall provide the COR with Monthly Activity Reports in
electronic form in Microsoft Word and Project formats.
The report shall include detailed instructions/explanations
for each required data element, to ensure that data is accurate and consistent.
 These reports shall reflect data as of the last day of the preceding Month.
The Monthly Activity Reports shall cover all work completed during the reporting
 period and work planned for the subsequent reporting period broken down by each
  task described in Section 5 of this PWS.  The report shall also identify any
  problems that arose and a description of how the problems were resolved.
  If problems have not been completely resolved, the Contractor shall provide
  an explanation including their plan and timeframe for resolving the issue.
  The report shall also include an itemized list of all Information and
  Communication Technology (ICT) deliverables and their current Section 508
  conformance status. The Contractor shall monitor performance against the CPMP
  and report any deviations.  It is expected that the Contractor will keep in
  communication with VA accordingly so that issues that arise are transparent to
  both parties to prevent escalation of outstanding issues.
In addition, the Contractor shall attend teleconference meetings with the
VA Task Leads to discuss and document any issues, pending deliverables, or
other pertinent topics concerning the task areas on a cadence matching the
individual Task Lead requirements, no less than weekly, but no more
 than twice daily."

CORRECT ANALYSIS:
{
  "has_requirement_sprawl": null,
  "requirement_sprawl_examples": [],
  "compliance_over_outcomes": "yes",
  "compliance_over_outcomes_examples": [
    "'The report shall also include an itemized list of all Information and
  Communication Technology (ICT) deliverables and their current Section 508
  conformance status.' This focuses on compliance with regulations instead of performance",
  "'The Contractor shall provide the COR with Monthly Activity Reports in
  electronic form in Microsoft Word and Project formats'. Reports required in multiple formats.",
  "'In addition, the Contractor shall attend teleconference meetings ... no less
  than weekly, but no more than twice daily.' Required meetings weekly, and as
  often as twice daily."
  ],
  "contract_type": null,
  "performance_period": null,
  "evaluation_criteria": []
}

WHY:
The solicitation requires a monthly report in multiple formats, with the report
focusing on progress towards the project plan (CPMP) rather than performance metrics.


=== EXAMPLE 4: Normal Solicitation ===

Title: "Janitorial Services"
Description: "The contractor shall provide daily janitorial services including trash
removal, restroom cleaning, and floor maintenance for a 50,000 sq ft office building."

CORRECT ANALYSIS:
{
  "has_requirement_sprawl": "no",
  "requirement_sprawl_examples": []
}

WHY: Simple, focused requirements appropriate for the scope.

END OF ANNOTATED EXAMPLES
""",
        "cache_control": {"type": "ephemeral"}
    }

    # Block 3: Extraction Instructions (smallest, might iterate)
    extraction_instructions = {
        "type": "text",
        "text": """EXTRACTION TASK

Analyze each government solicitation document and extract the following information:

1. has_requirement_sprawl (string: "yes", "no", or "uncertain")
   - Answer "yes" if the solicitation includes excessive or unnecessary requirements
   - Answer "no" if requirements are focused and mission-appropriate
   - Answer "uncertain" if you cannot determine from available information

2. requirement_sprawl_examples (array of strings)
   - If question 1 is "yes", provide 3-5 specific examples of excessive requirements
   - Each example should quote or paraphrase the requirement and briefly explain why it's excessive
   - If question 1 is "no" or "uncertain", return empty array []

3. compliance_over_outcomes (string: "yes", "no", or "uncertain")
   - Answer "yes" if the solicitation uses compliance over outcomes or the QA process involves excessive paperwork
   - Answer "no" if QA process is appropriate and focused on outcomes instead of compliance
   - Answer "uncertain" if you cannot determine from available information

4. compliance_over_outcomes_examples (array of strings)
   - If question 3 is "yes", provide 1-2 specific examples of compliance paperwork being asked for
   - Each example should quote or paraphrase the requirement and briefly explain why it's excessive
   - If question 3 is "no" or "uncertain", return empty array []

5. contract_type (string or null)
   - Identify if mentioned: "FFP" (Firm Fixed Price), "T&M" (Time and Materials),
     "CPFF" (Cost Plus Fixed Fee), "IDIQ", etc.
   - Return null if not specified

6. performance_period (string or null)
    - Extract the contract performance period/period of performance
    - Return as found in document (e.g., "12 months", "Base year plus 4 option years")
    - Return null if not mentioned

7. evaluation_criteria (array of strings)
    - Extract the evaluation criteria if any are listed in the solicitation,
        such as: "PROFESSIONAL QUALIFICATIONS", "Price", "Past Performance", "Technical Proposal"
    - Return them in the order of importance from the solicitation, with each criterion as one string in the array/
    - Be specific and quote the creiteria in the solicitation exactly, if there are any.

CRITICAL INSTRUCTIONS:
- Return your response as valid JSON only, with no additional text
- Use the exact field names specified above
- Be conservative: if uncertain, say "uncertain" or use null rather than guessing
- For requirement_sprawl_examples and compliance_over_outcomes_examples, be specific and quote actual requirements

RESPONSE FORMAT:
{
  "has_requirement_sprawl": "yes|no|uncertain",
  "requirement_sprawl_examples": ["example 1", "example 2", ...],
  "compliance_over_outcomes": "yes|no|uncertain",
  "compliance_over_outcomes_examples": ["example 1", "example 2", ...],
  "contract_type": "type or null",
  "performance_period": "period or null",
  "evaluation_criteria": ["criteria 1", "criteria 2", ...]
}
""",
        "cache_control": {"type": "ephemeral"}
    }

    return [teaching_articles, annotated_examples, extraction_instructions]

# File extension -> MIME type for attachments sent as "image" content blocks.
_IMAGE_MEDIA_TYPES = {
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    'png': 'image/png',
    'gif': 'image/gif',
    'webp': 'image/webp',
}


def _fallback_analysis(**extra):
    """Return placeholder analysis fields for a record that could not be analysed.

    Keyword arguments (e.g. ``error=...`` or ``parse_error=...``) are merged
    into the returned dict so the failure reason is stored with the record.
    """
    fallback = {
        "has_requirement_sprawl": "uncertain",
        "requirement_sprawl_examples": [],
        "compliance_over_outcomes": "uncertain",
        "compliance_over_outcomes_examples": [],
        "contract_type": None,
        "performance_period": None,
        "evaluation_criteria": [],
    }
    fallback.update(extra)
    return fallback


def _attachment_content_part(file_name, file_data):
    """Build the message content block for one attachment, or None if unsupported.

    Images become base64 "image" blocks and PDFs become base64 "document"
    blocks. Everything else (including .doc/.docx) returns None: base64
    document sources are only sent as application/pdf, so other formats must
    be converted to PDF before they can be attached.
    """
    ext = file_name.rsplit('.', 1)[-1].lower() if '.' in file_name else ''
    if ext in _IMAGE_MEDIA_TYPES:
        return {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": _IMAGE_MEDIA_TYPES[ext],
                "data": file_data
            }
        }
    if ext == 'pdf':
        return {
            "type": "document",
            "source": {
                "type": "base64",
                "media_type": "application/pdf",
                "data": file_data
            }
        }
    return None


def _strip_code_fence(text):
    """Remove a surrounding markdown code fence (optionally tagged ``json``) from text."""
    cleaned = text.strip()
    if cleaned.startswith('```'):
        cleaned = cleaned.split('```')[1]
        if cleaned.startswith('json'):
            cleaned = cleaned[4:]
        cleaned = cleaned.strip()
    return cleaned


def process_opportunities(input_file, output_file, attachments_dir=None):
    """Process each opportunity record, query Claude API with caching, and update records.

    Args:
        input_file: Path to a JSON file holding a list of opportunity dicts.
            Each record is read for 'title', 'postedDate', 'office' and
            'description', and optionally 'attachments' / 'downloadedFiles'.
        output_file: Path the updated list is written to as JSON. It is
            rewritten after every 10 records and once more at the end.
        attachments_dir: Directory containing the files named in
            'downloadedFiles'. Defaults to
            ``<input_file dir>/drive/MyDrive/attachments_2026_01_21_2/attachments``.

    Each record is updated in place with the fields of the model's JSON reply.
    When the reply cannot be parsed, the rate limit is hit more than
    ``max_retries`` times, or any other error occurs during the request, the
    record instead receives placeholder values plus a 'parse_error' or
    'error' entry. Errors reading ``input_file`` or missing required record
    keys are not caught and propagate to the caller.
    """

    # Initialize the Claude client
    client = anthropic.Anthropic(
        api_key=userdata.get('ANTHROPIC_API_KEY')
    )

    # Build the cached system prompt (reused across all API calls)
    cached_system_prompt = build_cached_system_prompt()

    # Load the opportunities data
    with open(input_file, 'r') as f:
        opportunities = json.load(f)

    if attachments_dir is None:
        attachments_dir = (
            Path(input_file).parent / 'drive' / 'MyDrive'
            / 'attachments_2026_01_21_2' / 'attachments'
        )
    else:
        attachments_dir = Path(attachments_dir)

    # Track cache performance
    cache_stats = {
        "cache_creation_tokens": 0,
        "cache_read_tokens": 0,
        "input_tokens": 0
    }

    max_retries = 3      # retries after the first attempt, rate-limit errors only
    rate_limit_wait = 60  # seconds to sleep before retrying

    for idx, opportunity in enumerate(opportunities):
        print(f"\n{'='*80}")
        print(f"Processing {idx + 1}/{len(opportunities)}: {opportunity['title']}")
        print(f"{'='*80}")

        # Extract description
        description_html = opportunity['description']

        # Build the user message content
        content_parts = [
            {
                "type": "text",
                "text": f"""Analyze this government solicitation:

TITLE: {opportunity['title']}

POSTED DATE: {opportunity['postedDate']}

OFFICE: {opportunity['office']}

DESCRIPTION:
{description_html}
"""
            }
        ]

        # Process attachments if they exist. `or 0` keeps a null attachment
        # count from raising TypeError in the comparison.
        if (opportunity.get('attachments') or 0) > 0 and opportunity.get('downloadedFiles'):
            for file in opportunity['downloadedFiles']:
                file_data = read_file_as_base64(attachments_dir / file)
                if not file_data:
                    # Unreadable or empty file; the reader already reported any error.
                    continue

                attachment_part = _attachment_content_part(file, file_data)
                if attachment_part is None:
                    # Skip unsupported file types (Word, Excel, CSV, ... need
                    # converting to PDF first) instead of failing the request.
                    print(f"    ⚠ Skipping unsupported file type: {file}")
                    continue

                print(f"  - Attached file: {file}")
                content_parts.append(attachment_part)

        # Prepare messages
        messages = [
            {
                "role": "user",
                "content": content_parts
            }
        ]

        for attempt in range(max_retries + 1):
            try:
                # Send request to Claude API with cached system prompt
                response = client.messages.create(
                    model="claude-sonnet-4-5-20250929",
                    max_tokens=2048,
                    system=cached_system_prompt,  # This gets cached!
                    messages=messages
                )

                # Track token usage; the cache counters may be absent or None
                usage = response.usage
                cache_created = getattr(usage, 'cache_creation_input_tokens', None) or 0
                cache_read = getattr(usage, 'cache_read_input_tokens', None) or 0
                cache_stats["input_tokens"] += usage.input_tokens
                cache_stats["cache_creation_tokens"] += cache_created
                cache_stats["cache_read_tokens"] += cache_read

                print(f"  - Tokens: {usage.input_tokens} input, {usage.output_tokens} output")
                if cache_read:
                    print(f"  - Cache hit! Read {cache_read} tokens from cache")
                if cache_created:
                    print(f"  - Cache miss. Created cache with {cache_created} tokens")

                # Extract Claude's response
                claude_response = response.content[0].text

                # Parse JSON response
                try:
                    analysis_result = json.loads(_strip_code_fence(claude_response))
                    if not isinstance(analysis_result, dict):
                        raise ValueError(
                            f"expected a JSON object, got {type(analysis_result).__name__}"
                        )
                except ValueError as e:  # json.JSONDecodeError is a ValueError
                    print(f"  ✗ Failed to parse JSON response: {e}")
                    print(f"  Raw response: {claude_response[:200]}...")
                    opportunity.update(_fallback_analysis(parse_error=str(e)))
                else:
                    # Add all extracted fields to the opportunity
                    opportunity.update(analysis_result)

                    print(f"  ✓ Successfully analyzed")
                    # .get: a reply missing this key must not discard the other fields
                    print(f"    - Requirement sprawl: {analysis_result.get('has_requirement_sprawl')}")

                # Request completed - leave the retry loop
                break

            except anthropic.RateLimitError:
                if attempt < max_retries:
                    print(f"  ⚠ Rate limit reached. Waiting {rate_limit_wait} seconds (retry {attempt + 1}/{max_retries})...")
                    time.sleep(rate_limit_wait)
                else:
                    print(f"  ✗ Max retries reached after rate limit errors")
                    opportunity.update(_fallback_analysis(error="rate_limit"))

            except Exception as e:
                # Best effort: record the failure on this record and move on
                print(f"  ✗ Error: {e}")
                opportunity.update(_fallback_analysis(error=str(e)))
                break

        # Save intermediate results every 10 opportunities
        if (idx + 1) % 10 == 0:
            with open(output_file, 'w') as f:
                json.dump(opportunities, f, indent=2)
            print(f"\n  → Intermediate save completed ({idx + 1} processed)")

    # Write final results
    with open(output_file, 'w') as f:
        json.dump(opportunities, f, indent=2)

    # Print cache statistics
    print(f"\n{'='*80}")
    print("PROCESSING COMPLETE")
    print(f"{'='*80}")
    print(f"Results saved to: {output_file}")
    print(f"\nCache Performance:")
    print(f"  - Cache creation tokens: {cache_stats['cache_creation_tokens']:,}")
    print(f"  - Cache read tokens: {cache_stats['cache_read_tokens']:,}")
    print(f"  - Regular input tokens: {cache_stats['input_tokens']:,}")

    if cache_stats['cache_read_tokens'] > 0:
        savings = cache_stats['cache_read_tokens'] * 0.9  # 90% savings on cache reads
        print(f"  - Estimated savings: ~{savings:,.0f} tokens (90% off cached reads)")

if __name__ == "__main__":
    # Source JSON of opportunities and the destination for the analysed copy.
    input_file, output_file = (
        "updated_opportunities_2.json",
        "analyzed_opportunities_2026_01-21.json",
    )
    process_opportunities(input_file=input_file, output_file=output_file)

In [None]:
import json

# Load the JSON data from the file.
# Both names are placeholders: fill them in before running this cell.
input_filename = ''
output_filename = ''

# open('') raises FileNotFoundError and would abort "Run All", so the load is
# skipped (leaving `data` as None) until a filename has been provided.
data = None
if input_filename:
    with open(input_filename, 'r') as file:
        data = json.load(file)
else:
    print("input_filename is not set; skipping load.")

# # Filter records where is_sole_source is null
# filtered_records = []
# for record in data:
#     if record.get('is_sole_source') is None:
#         # Create a new record without is_sole_source and awardee fields
#         new_record = {k: v for k, v in record.items() if k not in ['is_sole_source', 'awardee']}
#         filtered_records.append(new_record)

# # Save the filtered records to the new file
# with open(output_filename, 'w') as file:
#     json.dump(filtered_records, file, indent=2)

# print(f"Processing complete. {len(filtered_records)} records saved to {output_filename}")

In [None]:
#!cp -r "content/opportunities_processed_2025_03_12.json" "/content/drive/My Drive/opportunities_processed_2025_03_12.json"