In [None]:
import requests
import json
import os
import re
from urllib.parse import urlparse, unquote
from pathlib import Path

def process_opportunities(api_key, posted_from, posted_to, output_dir='attachments'):
    """
    Fetch opportunities from the SAM.gov search API and download their attachments.

    A single search request is made (limit 1000, organization code "070",
    every notice type except surplus-property sales). For each opportunity
    returned, at most the first 5 resource links are downloaded into
    ``output_dir``. A failed attachment download is reported and skipped; it
    does not abort the run.

    Args:
        api_key: SAM.gov API key, sent as the ``api_key`` query parameter.
        posted_from: Start of the posted-date range (sent as ``postedFrom``).
        posted_to: End of the posted-date range (sent as ``postedTo``).
        output_dir: Directory that receives the downloaded attachments; it is
            created, including parents, if it does not exist.

    Returns:
        A list of dicts with the keys ``title``, ``noticeId``, ``postedDate``,
        ``office``, ``attachments`` (number of resource links listed) and
        ``downloadedFiles`` (names of the files actually saved). If the search
        request or its processing fails, the error is printed and an empty
        list is returned.
    """
    opportunities = []
    base_url = "https://api.sam.gov/prod/opportunities/v2/search"
    max_attachments = 5
    # (connect, read) timeouts in seconds so a stalled server cannot hang the run.
    request_timeout = (10, 120)

    params = {
        'api_key': api_key,
        'limit': 1000,
        'postedFrom': posted_from,
        'postedTo': posted_to,
        # Notice types requested
        # (https://open.gsa.gov/api/get-opportunities-public-api/):
        #   u = Justification (J&A)
        #   p = Pre solicitation
        #   a = Award Notice
        #   r = Sources Sought
        #   s = Special Notice
        #   o = Solicitation
        #   k = Combined Synopsis/Solicitation
        #   i = Intent to Bundle Requirements (DoD-Funded)
        # Everything except g = Sale of Surplus Property.
        'ptype': 'u,p,a,r,s,o,k,i',
        'organizationCode': "070",
    }

    try:
        search_response = requests.get(base_url, params=params, timeout=request_timeout)
        search_response.raise_for_status()
        results = search_response.json()

        # Create output directory (and any missing parents) if it doesn't exist
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        # `or []` also covers a key that is present but null in the JSON.
        for opp in results.get('opportunitiesData') or []:
            notice_id = opp.get('noticeId', 'N/A')
            resource_links = opp.get('resourceLinks') or []
            downloaded_files = []

            for index, url in enumerate(resource_links[:max_attachments]):
                try:
                    # The context manager releases the streamed connection
                    # even when the download fails part-way through.
                    with requests.get(url, stream=True, timeout=request_timeout) as attachment:
                        attachment.raise_for_status()

                        # Prefer the server-supplied name; fall back to a
                        # positional name when the header is missing or empty.
                        original_filename = (
                            _filename_from_content_disposition(
                                attachment.headers.get('Content-Disposition')
                            )
                            or f"attachment_{index + 1}"
                        )
                        # NOTE(review): the ')' between the notice id and the
                        # file name looks like a typo for '_'; it is kept so
                        # that existing file names stay stable - confirm
                        # before changing.
                        filename = f"{notice_id}){sanitize_filename(original_filename)}"

                        filepath = Path(output_dir) / filename
                        with open(filepath, 'wb') as f:
                            for chunk in attachment.iter_content(chunk_size=8192):
                                if chunk:
                                    f.write(chunk)

                    downloaded_files.append(filename)

                except Exception as e:
                    # Best effort: one bad attachment must not lose the rest.
                    print(f"Error downloading attachment {index+1}: {e}")

            # Structure the opportunity data
            opportunities.append({
                'title': opp.get('title', 'Untitled'),
                'noticeId': notice_id,
                'postedDate': opp.get('postedDate', 'N/A'),
                'office': opp.get('fullParentPathName', 'Not specified'),
                'attachments': len(resource_links),
                'downloadedFiles': downloaded_files,
            })

        return opportunities

    except Exception as e:
        print(f"Error processing opportunities: {e}")
        return []


def _filename_from_content_disposition(header_value):
    """Return the filename parameter of a Content-Disposition header, or None."""
    # Local import: only this helper needs the header parser.
    from email.message import Message

    if not header_value:
        return None
    message = Message()
    message['Content-Disposition'] = header_value
    # get_filename() handles quoted values and RFC 2231 (filename*=) encoding.
    return message.get_filename() or None

# Matches any single character that is replaced when building a file name.
_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*]')


def sanitize_filename(filename):
    """Return filename with every character matched by the unsafe set replaced by '_'."""
    return _UNSAFE_FILENAME_CHARS.sub('_', filename)

# def get_file_extension(response):
#     """Determine file extension from response headers"""
#     content_type = response.headers.get('content-type', '')
#     if 'application/pdf' in content_type:
#         return '.pdf'
#     elif 'application/msword' in content_type:
#         return '.doc'
#     elif 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' in content_type:
#         return '.docx'
#     elif 'text/plain' in content_type:
#         return '.txt'
#     return '.pdf'  # Default to PDF if unknown

def main():
    """
    Fetch opportunities for the configured date range and save them as JSON.

    The API key is read from the Colab secret ``SAM_API_KEY``; when the
    ``google.colab`` package is not importable, the ``SAM_API_KEY``
    environment variable is used instead. The structured results are written
    to ``opportunities_2.json`` in the current directory.

    Raises:
        RuntimeError: If no API key could be found. This is raised before any
            request is made, so the output file is not overwritten.
    """
    try:
        # Imported here because the module only exists inside Google Colab.
        from google.colab import userdata
    except ImportError:
        api_key = os.environ.get('SAM_API_KEY')
    else:
        api_key = userdata.get('SAM_API_KEY')

    if not api_key:
        raise RuntimeError("SAM_API_KEY is not set")

    posted_from = "05/01/2024"
    posted_to = "05/30/2024"

    opportunities = process_opportunities(api_key, posted_from, posted_to)

    # Save structured data to JSON
    with open('opportunities_2.json', 'w', encoding='utf-8') as f:
        json.dump(opportunities, f, indent=2)

    print(f"Processed {len(opportunities)} opportunities")
    print("Data saved to opportunities_2.json")


if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import json
import csv
import requests
from io import StringIO

# Enrich the saved opportunities with the Description column of the public
# Contract Opportunities CSV, matched on noticeId, and write the result to
# updated_opportunities_2.json.

# Load the opportunities JSON data
with open('opportunities_2.json', 'r', encoding='utf-8') as json_file:
    opportunities = json.load(json_file)

# Extract all noticeIds from opportunities_2.json (a set gives O(1) lookups per CSV row)
notice_ids_to_find = {opportunity['noticeId'] for opportunity in opportunities}

# Dictionary to store found descriptions
notice_descriptions = {}

# URL of the CSV file
csv_url = "https://sam.gov/api/prod/fileextractservices/v1/api/download/Contract%20Opportunities/datagov/ContractOpportunitiesFullCSV.csv?privacy=Public"

# Skip the download entirely when there is nothing to look up.
if notice_ids_to_find:
    try:
        # Download the CSV data; (connect, read) timeouts keep a stalled
        # server from hanging the cell forever.
        # NOTE(review): the whole body is held in memory via response.text -
        # confirm this is acceptable for the size of this file.
        response = requests.get(csv_url, timeout=(10, 300))
        response.raise_for_status()  # Raise exception for 4XX/5XX responses

        # Read the CSV content from the response
        csv_content = StringIO(response.text)

        # Process the CSV data
        csv_reader = csv.DictReader(csv_content)
        for row in csv_reader:
            # .get() tolerates a missing column or a short row instead of raising KeyError
            row_notice_id = row.get('NoticeId')
            if row_notice_id in notice_ids_to_find:
                notice_descriptions[row_notice_id] = row.get('Description') or ""
                # If we've found all the noticeIds we need, we can stop reading the CSV
                if len(notice_descriptions) == len(notice_ids_to_find):
                    break

    except requests.exceptions.RequestException as e:
        print(f"Error downloading CSV: {e}")
        # exit() is the interactive-shell helper; raising SystemExit stops
        # execution here in scripts and notebook cells alike.
        raise SystemExit(1)

# Update opportunities with descriptions (empty string if no match found)
for opportunity in opportunities:
    opportunity['description'] = notice_descriptions.get(opportunity['noticeId'], "")

# Save the updated opportunities
with open('updated_opportunities_2.json', 'w', encoding='utf-8') as json_file:
    json.dump(opportunities, json_file, indent=2)

print(f"Successfully updated {len(notice_descriptions)} out of {len(notice_ids_to_find)} opportunities with descriptions")

In [None]:
import os

# Mount Google Drive under /content/drive so the files below can be copied into it
from google.colab import drive
drive.mount('/content/drive')

# Create the destination folder in Drive; -p makes this a no-op if it already exists
!mkdir -p "/content/drive/My Drive/attachments_2026_01_21_2"

# Recursively copy the local attachments directory into that Drive folder
!cp -r "/content/attachments/" "/content/drive/My Drive/attachments_2026_01_21_2"

# directory = "/content/attachments/"
# for filename in os.listdir(directory):
#     f = os.path.join(directory, filename)
#     # check if current path is a file
#     if os.path.isfile(f):
#         print(f)

In [None]:
# !rm -rf "/content/attachments/"
# !rm -rf "/content/opportunities.json"
# !rm -rf "/content/updated_opportunities.json"

In [None]:
#!cp -r "/content/drive/My Drive/attachments_2025_03_03" "/content/attachments/"
