In [5]:
import pdfplumber
import re
from dataclasses import dataclass
from typing import List, Dict, Optional, Generator
from decimal import Decimal
import pandas as pd
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class ARCOSRecord:
    drug: str
    state_code: str
    state_name: str
    zip_code: str
    q1: Decimal
    q2: Decimal
    q3: Decimal
    q4: Decimal
    total: Decimal

class ARCOSReportParser:
    def __init__(self, pdf_path: str):
        self.pdf_path = pdf_path
        self.current_drug: Optional[str] = None
        self.current_state: Optional[tuple] = None
        self.records: List[ARCOSRecord] = []
        
    def extract_text_from_pdf(self) -> Generator[str, None, None]:
        """Extract text from PDF file page by page."""
        try:
            with pdfplumber.open(self.pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    text = page.extract_text()
                    if text:
                        logger.info(f"Processing page {page_num}")
                        yield text
                    else:
                        logger.warning(f"Empty text on page {page_num}")
        except Exception as e:
            logger.error(f"Error extracting PDF: {str(e)}")
            raise

    def parse_numbers(self, line: str) -> List[Decimal]:
        """Extract all numbers from a line and convert to Decimal."""
        numbers = re.findall(r'[\d,\.]+', line)
        return [Decimal(num.replace(',', '')) for num in numbers]

    def parse_drug_line(self, line: str) -> Optional[str]:
        """Parse a line containing drug information."""
        drug_match = re.search(r'DRUG:\s*(\d+)\s*-\s*(.+)$', line)
        if drug_match:
            drug_code, drug_name = drug_match.groups()
            return f"{drug_code} - {drug_name.strip()}"
        return None

    def parse_state_line(self, line: str) -> Optional[tuple]:
        """Parse a line containing state information."""
        state_match = re.search(r'STATE:\s*(\w+)\s*-\s*(.+)$', line)
        if state_match:
            return (state_match.group(1), state_match.group(2).strip())
        return None

    def parse_data_line(self, line: str) -> Optional[ARCOSRecord]:
        """Parse a data line containing ZIP code and quarterly data."""
        try:
            # Skip total lines and header lines
            if ' Total' in line or 'REGISTRANT ZIP CODE' in line or 'QUARTER' in line:
                return None

            parts = line.split()
            if len(parts) >= 5 and parts[0].isdigit():
                numbers = self.parse_numbers(line)
                if len(numbers) >= 5 and self.current_drug and self.current_state:
                    return ARCOSRecord(
                        drug=self.current_drug,
                        state_code=self.current_state[0],
                        state_name=self.current_state[1],
                        zip_code=parts[0],
                        q1=numbers[0],
                        q2=numbers[1],
                        q3=numbers[2],
                        q4=numbers[3],
                        total=numbers[4]
                    )
        except Exception as e:
            logger.warning(f"Error parsing data line: {line}\nError: {str(e)}")
        return None

    def parse_page(self, text: str):
        """Parse a single page of the report."""
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        
        for line in lines:
            # Check for new drug
            drug = self.parse_drug_line(line)
            if drug:
                self.current_drug = drug
                logger.info(f"Found new drug: {drug}")
                continue

            # Check for new state
            state_info = self.parse_state_line(line)
            if state_info:
                self.current_state = state_info
                logger.info(f"Found new state: {state_info[0]} - {state_info[1]}")
                continue

            # Parse data line
            record = self.parse_data_line(line)
            if record:
                self.records.append(record)

    def parse_pdf(self) -> pd.DataFrame:
        """Parse the entire PDF report and return a pandas DataFrame."""
        try:
            for page_text in self.extract_text_from_pdf():
                self.parse_page(page_text)
            
            # Convert records to DataFrame
            df = pd.DataFrame([vars(record) for record in self.records])
            
            # Reorder columns for clarity
            column_order = ['drug', 'state_code', 'state_name', 'zip_code', 
                          'q1', 'q2', 'q3', 'q4', 'total']
            df = df[column_order]
            
            return df
            
        except Exception as e:
            logger.error(f"Error parsing PDF: {str(e)}")
            raise

def process_arcos_report(pdf_path: str, output_path: Optional[str] = None) -> pd.DataFrame:
    """
    Process an ARCOS report and optionally save to file.
    
    Args:
        pdf_path: Path to the PDF file
        output_path: Optional path to save CSV output
        
    Returns:
        pandas.DataFrame containing the parsed data
    """
    try:
        # Parse the PDF
        parser = ARCOSReportParser(pdf_path)
        df = parser.parse_pdf()
        
        # Print summary
        print("\nData Summary:")
        print(f"Total records: {len(df)}")
        print(f"Unique drugs: {df['drug'].nunique()}")
        print(f"Unique states: {df['state_code'].nunique()}")
        print("\nSample of the data:")
        print(df.head())
        
        # Save to file if requested
        if output_path:
            df.to_csv(output_path, index=False)
            print(f"\nData saved to: {output_path}")
            
        return df
        
    except Exception as e:
        logger.error(f"Error processing report: {str(e)}")
        raise


In [7]:
# Initialize parser with your PDF file
# parser = ARCOSReportParser("TEST_REPORT.pdf")

# # Parse the PDF
# states_data = parser.parse_pdf()
process_arcos_report("TEST_REPORT.pdf", "data/scraped/2020-dea.csv")

INFO:__main__:Processing page 1
INFO:__main__:Found new drug: 1100 - AMPHETAMINE
INFO:__main__:Found new state: AK - ALASKA
INFO:__main__:Found new state: AL - ALABAMA
INFO:__main__:Found new state: AR - ARKANSAS
INFO:__main__:Found new state: AS - AMERICAN SAMOA
INFO:__main__:Found new state: AZ - ARIZONA
INFO:__main__:Processing page 2
INFO:__main__:Found new state: CA - CALIFORNIA
INFO:__main__:Processing page 3
INFO:__main__:Found new state: CO - COLORADO
INFO:__main__:Found new state: CT - CONNECTICUT
INFO:__main__:Found new state: DC - DISTRICT OF COLUMBIA
INFO:__main__:Found new state: DE - DELAWARE
INFO:__main__:Found new state: FL - FLORIDA
INFO:__main__:Processing page 4
INFO:__main__:Found new state: GA - GEORGIA
INFO:__main__:Found new state: GU - GUAM
INFO:__main__:Found new state: HI - HAWAII
INFO:__main__:Found new state: IA - IOWA
INFO:__main__:Processing page 5
INFO:__main__:Found new state: ID - IDAHO
INFO:__main__:Found new state: IL - ILLINOIS
INFO:__main__:Processi


Data Summary:
Total records: 12875
Unique drugs: 14
Unique states: 55

Sample of the data:
                 drug state_code state_name zip_code   q1       q2       q3  \
0  1100 - AMPHETAMINE         AK     ALASKA      995  995  5097.87  4776.84   
1  1100 - AMPHETAMINE         AK     ALASKA      996  996  1787.55  1811.43   
2  1100 - AMPHETAMINE         AK     ALASKA      997  997  1189.97  1245.75   
3  1100 - AMPHETAMINE         AK     ALASKA      998  998   515.12   518.47   
4  1100 - AMPHETAMINE         AK     ALASKA      999  999   185.87   171.04   

        q4    total  
0  5091.24  5046.39  
1  2013.64  2011.10  
2  1317.81  1204.19  
3   610.39   590.51  
4   184.97   181.77  

Data saved to: data/scraped/2020-dea.csv


Unnamed: 0,drug,state_code,state_name,zip_code,q1,q2,q3,q4,total
0,1100 - AMPHETAMINE,AK,ALASKA,995,995,5097.87,4776.84,5091.24,5046.39
1,1100 - AMPHETAMINE,AK,ALASKA,996,996,1787.55,1811.43,2013.64,2011.10
2,1100 - AMPHETAMINE,AK,ALASKA,997,997,1189.97,1245.75,1317.81,1204.19
3,1100 - AMPHETAMINE,AK,ALASKA,998,998,515.12,518.47,610.39,590.51
4,1100 - AMPHETAMINE,AK,ALASKA,999,999,185.87,171.04,184.97,181.77
...,...,...,...,...,...,...,...,...,...
12870,9801 - FENTANYL BASE,WY,WYOMING,827,827,3.62,3.25,3.87,3.31
12871,9801 - FENTANYL BASE,WY,WYOMING,828,828,4.70,5.61,7.02,8.04
12872,9801 - FENTANYL BASE,WY,WYOMING,829,829,21.85,20.06,19.52,20.20
12873,9801 - FENTANYL BASE,WY,WYOMING,830,830,2.71,3.21,2.86,2.63
