In [11]:
# file handling
import glob
import os
from dotenv import load_dotenv
import re
import unidecode
import sys

# image handling
import base64

# AI API
import anthropic

# parsing
import csv
from io import StringIO

# tabular data
import numpy as np
import pandas as pd

# date handling
import locale
import datetime as dt

In [12]:
# User defined functions
# Make the parent of the current working directory importable so the
# project's `src` package can be found from this notebook.
path_scripts = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Only append once: re-running this cell would otherwise keep adding the same
# entry to sys.path for the lifetime of the kernel.
if path_scripts not in sys.path:
    sys.path.append(path_scripts)
from src import custom_logging as clogs, config, processing, postprocessing

In [13]:
# Load environment variables from .env file
# The call is the cell's last expression, so its boolean return value is
# echoed as the cell output.
# NOTE(review): presumably this provides the API credentials used later by
# processing.extract_img2text — confirm against src/processing.
load_dotenv()

True

In [14]:
# Logger obtained from the project's custom_logging module (imported as clogs);
# it is passed to the clogs.log_* helpers and used directly in the image
# processing loop.
logger = clogs.get_logger()

In [15]:
# Identify the batch being processed and ask the project config for its
# folder mapping.
# NOTE(review): ensure_batch_paths presumably also creates missing folders —
# confirm in src/config.
batch_id = "01_2025_03"
batch_paths = config.ensure_batch_paths(batch_id)

# Folders used by the rest of the notebook: scanned images, the Excel
# workbook to merge against, and the export destination.
folder_img_path, folder_sg_path, folder_output = (
    batch_paths['img'],
    batch_paths['sg_excel'],
    batch_paths['output'],
)

In [16]:
# Get configuration: file-name patterns, column settings and data settings
patterns = config.get_file_patterns()
columns = config.get_column_settings()
settings = config.get_data_settings()

# Find the workbook matching the configured pattern.
# glob returns matches in arbitrary order, so sort them to make the choice
# reproducible, and fail with an explicit message instead of a bare
# IndexError when nothing matches.
sg_matches = sorted(glob.glob(os.path.join(folder_sg_path, patterns['sg_file'])))
if not sg_matches:
    raise FileNotFoundError(
        f"No file matching {patterns['sg_file']!r} found in {folder_sg_path}")
file_path = sg_matches[0]

# Read the matched file, using the configured header row
data_sg = pd.read_excel(
    file_path,
    header=settings['excel_settings']['header_row']
)

# Rename columns using config (rename already returns a new frame)
data_sg = data_sg.rename(columns=columns['rename_map'])

# Keep only the configured columns and drop rows with any missing value
data_sg = data_sg[columns['sg_columns']].dropna().copy()

# Format the date column as text using the configured output format.
# The .dt accessor requires "Fecha Parto" to have been read as datetime.
data_sg["Fecha Parto"] = data_sg["Fecha Parto"].dt.strftime(
    settings['date_formats']['output'])
data_sg.info()
data_sg.head(2)


<class 'pandas.core.frame.DataFrame'>
Index: 73 entries, 0 to 72
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Número animal  73 non-null     object
 1   Fecha Parto    73 non-null     object
dtypes: object(2)
memory usage: 1.7+ KB


Unnamed: 0,Número animal,Fecha Parto
0,321/5,1/03/2024
1,0836/3,28/03/2024


In [17]:
# Confidence threshold, in percent, interpolated into Instruction 2 below.
conf_level = 99.75
# Prompt passed together with each image to processing.extract_img2text.
# - Instruction 2 asks for an asterisk on low-confidence cell values.
# - Instructions 4 and 5 state the same rule twice.
# - Instruction 8 asks for the output to be enclosed in brackets; the image
#   processing loop relies on this when it splits the response on "[".
prompt_input = f"""Instruction 1: Convert the text in the image to csv.
Instruction 2: Employ a strict approach: add 1 asterisk next to the estimated values for those cells whose text-to-digit conversion are below a {conf_level} percent confidence threshold; it does not matter if data is over-flagged.
Instruction 3: Include in comments the confidence threshold used.
Instruction 4: Do not use outlier-detection as criteria to flag the data.
Instruction 5: Make sure to not use outlier-detection as criteria to flag the data.
Instruction 6: If headers are present, include them. If no headers are found, do not include any.
Instruction 7: Include any comments before returning output. Limit verbosity.
Instruction 8: Return output enclosed in brackets to facilitate parsing.
Instruction 9: Do not include any additional comments after final output.
"""

In [19]:
# Sanity check of the inputs: the image folder's entries (sorted) on the
# first line, then the workbook paths matching the configured pattern.
print(
    sorted(os.listdir(folder_img_path)),
    glob.glob(os.path.join(folder_sg_path, patterns['sg_file'])),
    sep="\n",
)

['.DS_Store', 'Escaner_20250119_1.jpg', 'Escaner_20250119_2.jpg', 'Escaner_20250119_3.jpg']
['/Users/manuel/gdrive/prgrmmng/fundo_vt/form2tab/_data/01_2025_03/2_sg_excel/Fecha parto_ene 13-19.xlsx']


In [9]:
data_list = []  # one merged DataFrame per successfully processed image
cols_list = []  # most recent header row seen; reused for images without one
year = settings['year']

class NoColsError(Exception):
    """Raised when no header has been seen yet and the current image has none."""

# Process every JPEG in the image folder in filename order: send it to the
# API, parse the bracketed CSV in the reply, reshape the daily columns into
# date/value pairs, merge with data_sg and collect the result in data_list.
for filename in sorted(os.listdir(folder_img_path)):
    # Compare the extension case-insensitively so scans saved as .JPG/.JPEG
    # are not silently skipped.
    if not filename.lower().endswith(('.jpeg', '.jpg')):
        continue

    clogs.log_file_processing(logger, filename)
    clogs.log_column_status(logger, "initialized", cols_list)

    image_path = os.path.join(folder_img_path, filename)

    try:
        result = processing.extract_img2text(image_path, prompt_input)
        response_text = result.content[0].text
        clogs.log_api_comment(logger, response_text)
    except Exception as e:
        # Report through the logger like every other message in this loop
        logger.error(f"An error occurred: {e}")
        continue  # Skip to next file if there's an error

    # The prompt asks for the CSV to be enclosed in brackets. Take the text
    # after the first "[" and strip closing brackets. If the reply has no
    # bracket at all, skip this file instead of raising IndexError.
    response_parts = response_text.split("[")
    if len(response_parts) < 2:
        logger.error(
            f"No bracketed output found in API response for {filename}; skipping file")
        continue

    data_string = response_parts[1].replace("]", "")
    parsed_data = postprocessing.parse_csv_string(data_string)
    if not parsed_data:
        logger.error(
            f"No rows parsed from API response for {filename}; skipping file")
        continue

    # A first row is treated as a header when any of its cells contains "vaca"
    has_header = any("vaca" in s.lower() for s in parsed_data[0])

    if not cols_list:
        if has_header:
            cols_list = parsed_data[0]
            clogs.log_column_status(logger, "initialized", cols_list)
        else:
            clogs.log_validation_error(logger, parsed_data[0], "No 'vaca' found")
            raise NoColsError("No columns found: Check image folder")
    elif has_header and cols_list != parsed_data[0]:
        cols_list = parsed_data[0]
        clogs.log_column_status(logger, "updated", cols_list)
    else:
        clogs.log_column_status(logger, "current (no update)", cols_list)

    # Only drop the first row when it really is a header. For an image without
    # one, the first row is data and is kept under the carried-over cols_list.
    data_rows = parsed_data[1:] if has_header else parsed_data

    # Create the DataFrame
    try:
        data_df = pd.DataFrame(data_rows, columns=cols_list)
        clogs.log_dataframe_creation(logger, success=True)
    except Exception as e:
        clogs.log_dataframe_creation(logger, success=False, error=str(e))
        clogs.log_column_list(logger, cols_list, "Current columns")

        # Log the first row that has more cells than there are columns
        for c in data_rows:
            if len(c) > len(cols_list):
                logger.error(f"Row longer than columns: {c}")
                break

        logger.warning("Breaking processing loop due to DataFrame creation error")
        break

    clogs.log_column_status(logger, "at end of iteration", cols_list)

    # NOTE(review): presumably counts the asterisk flags per row — confirm in
    # src/postprocessing.
    data_df['flag_count'] = postprocessing.calculate_flag_counts(data_df)

    # Columns at positions 3-9 are treated as the daily columns: each gets a
    # "Fecha N" column inserted in front of it, built from its own header
    # text, and is then renamed to "Kg/Leche".
    day_columns = data_df.iloc[:, 3:10].columns.tolist()
    for col_label_num, col in enumerate(day_columns, start=1):

        data_df[col] = postprocessing.clean_column_values(data_df[col])

        # Get the index of the specified column
        col_index = data_df.columns.get_loc(col)

        col_label_str = postprocessing.normalize_month(
            postprocessing.normalize_day(col.replace(".", "")))

        # Insert a new column holding the date derived from the header text
        data_df.insert(col_index, f'Fecha {col_label_num}',
                       postprocessing.convert_to_date(
                           col_label_str, year=year))
        data_df = data_df.rename(columns={col: "Kg/Leche"}).copy()

    data_df = data_df.drop(
        columns=["Nombre", "Becerro", "Fecha PP", "#"],
        errors="ignore").copy()

    # The first remaining column becomes the merge key
    data_df = data_df.rename(columns={
        data_df.columns[0]: "Número animal"
    }).copy()

    # Use "/" as the separator in the key before merging on it
    data_df["Número animal"] = data_df["Número animal"].str.replace("-", "/")

    clogs.log_dataframe_columns(logger, "data_df", data_df.columns.tolist())
    data_final = data_df.merge(data_sg, on="Número animal", how="left")
    clogs.log_dataframe_columns(logger, "data_final", data_final.columns.tolist())
    # Rows without a match in data_sg get the placeholder "X*"
    data_final["Fecha Parto"] = data_final["Fecha Parto"].fillna("X*")

    # Reorder columns
    cols_to_move = ["Número animal", "Fecha Parto"]
    data_final = processing.reorder_columns(data_final, cols_to_move)

    clogs.log_column_list(logger, data_final.columns.tolist())
    data_list.append(data_final)
    clogs.log_process_separator(logger)

2025-01-31 15:17:00 - INFO - Processing file: Escaner_20250119_1.jpg
2025-01-31 15:17:00 - DEBUG - cols_list initialized: []
2025-01-31 15:17:24 - INFO - API Comment: Comments: Confidence threshold of 99.75% used for flagging uncertain values with an asterisk (*).
2025-01-31 15:17:24 - DEBUG - cols_list initialized: ['Número vaca', 'Nombre', 'Becerro', 'Ene. Lunes 13', 'Ene. Martes 14', 'Ene. Miérc. 15', 'Ene. Jueves 16', 'Ene. Vierne 17', 'Ene. Sáb. 18', 'Ene. Dom. 19', '#']
2025-01-31 15:17:24 - INFO - Dataframe successfully created
2025-01-31 15:17:24 - DEBUG - cols_list at end of iteration: ['Número vaca', 'Nombre', 'Becerro', 'Ene. Lunes 13', 'Ene. Martes 14', 'Ene. Miérc. 15', 'Ene. Jueves 16', 'Ene. Vierne 17', 'Ene. Sáb. 18', 'Ene. Dom. 19', '#']
2025-01-31 15:17:24 - DEBUG - data_df columns: ['Número animal', 'Fecha 1', 'Kg/Leche', 'Fecha 2', 'Kg/Leche', 'Fecha 3', 'Kg/Leche', 'Fecha 4', 'Kg/Leche', 'Fecha 5', 'Kg/Leche', 'Fecha 6', 'Kg/Leche', 'Fecha 7', 'Kg/Leche', 'flag_cou

In [10]:
# Hand the collected per-image tables to the project's export helper for this
# batch; it returns two values, kept here as regular_file and final_file.
# NOTE(review): presumably these are the paths of the written files — confirm
# in src/postprocessing.
regular_file, final_file = postprocessing.export_data(
    batch_id=batch_id,
    folder_output=folder_output,
    data_list=data_list,
)