In [None]:
!pip install pandas==1.5.3
!pip install openai==0.28.0

This cell sends the text of each CSV row to the OpenAI chat completions API and stores the model's answer back in the same file. It reads the CSV with `pandas` and uses the first row whose output column is empty as the starting line. For up to `total_lines_to_process` rows from that point, it fills the prompt template with the content of a designated column, calls the model configured in `callGPT4` (`gpt-3.5-turbo-0125` in this cell, despite the function's name), and writes the response into the output column. Progress is saved to the CSV every 10 rows and once more when the run finishes. The main tools are `pandas` for data manipulation and `openai` for the API calls. The objective is to automate the generation of text responses for rows in a CSV file based on input data, facilitating tasks such as text analysis or content generation.

In [None]:
import pandas as pd
import openai
from openai.error import APIError

# Set your OpenAI API key.
# The OPENAI_API_KEY environment variable takes precedence so the real secret does not
# have to be written into (and shared with) the notebook; the 'API_KEY' placeholder is
# only used as a fallback when the variable is not set.
import os
openai.api_key = os.environ.get('OPENAI_API_KEY', 'API_KEY')

# Function to call an OpenAI chat model (the name is historical; see `model`)
def callGPT4(message, model="gpt-3.5-turbo-0125"):
    """Send a single user message to the OpenAI chat completions API.

    Args:
        message: Text sent as the only "user" message of the conversation.
        model: Name of the chat model to call. Defaults to the model this cell
            originally hard-coded.

    Returns:
        The content of the first choice in the response, or None when the
        OpenAI SDK raises an error (the error is printed, not re-raised).
    """
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[{"role": "user", "content": message}],
            temperature=0.0
        )
        return response['choices'][0]['message']['content']
    except openai.error.OpenAIError as error:
        # OpenAIError is the base class of the SDK's exceptions, so rate-limit,
        # timeout and connection failures are reported here as well instead of
        # aborting the caller's loop and losing the rows not yet saved.
        print(f"API Error: {error}")
        return None

# Function to process CSV using the OpenAI chat API within a specified line range
def process_csv_with_gpt(csv_file_name, prompt_column, output_column, prompt_template, total_lines_to_process):
    """Fill empty cells of `output_column` with model answers and save the CSV in place.

    The window of rows handled in one call starts at the first row whose output
    cell is empty and spans at most `total_lines_to_process` rows. Rows inside the
    window that already hold an answer are left untouched.

    Args:
        csv_file_name: Path of the CSV file; it is read and then overwritten.
        prompt_column: Column whose value replaces `{content}` in the template.
        output_column: Column that receives the model's answer.
        prompt_template: `str.format` template containing a `{content}` field.
        total_lines_to_process: Maximum number of rows in the window for this run.

    Returns:
        None. Progress is written to `csv_file_name` every 10 rows of the window
        and once more at the end.
    """
    # Read the CSV file
    df = pd.read_csv(csv_file_name)

    # Rows still waiting for an answer
    pending = df[output_column].isna().to_numpy()
    if not pending.any():
        # idxmax() on an all-False mask would point at row 0 and every finished row
        # would be sent to the API again; an empty frame would raise. Stop instead.
        print("No empty rows left in the output column; nothing to process.")
        return

    # A completely empty column is parsed as float64; make room for text answers
    df[output_column] = df[output_column].astype(object)

    # Positional bounds of the window, starting at the first empty output cell
    start_line = int(pending.argmax())
    end_line = min(start_line + total_lines_to_process, len(df))

    for offset, (index, row) in enumerate(df.iloc[start_line:end_line].iterrows(), start=1):
        # Skip rows answered in a previous run so they are not overwritten
        if pd.isna(row[output_column]):
            # Combine the prompt template with the content of the specified column
            combined_message = prompt_template.format(content=row[prompt_column])

            chat_response = callGPT4(combined_message)

            # A failed call returns None; the cell stays empty and is retried next run
            if chat_response is not None:
                df.at[index, output_column] = chat_response

        # Save periodically after processing every 10 lines
        if offset % 10 == 0:
            df.to_csv(csv_file_name, index=False)
            print(f"Saved progress at line {index}")

    # Save the updated DataFrame back to the same CSV file after processing all lines
    df.to_csv(csv_file_name, index=False)
    print("Processing completed and saved.")

# Example usage: first pass over FILE_NAME.csv.
# Each row's 'anamnesis' text replaces {content} in the template and the answer is
# stored in 'AITextoRetornado'. PART1 / PART2 are placeholders for the instructions
# placed before and after the record text.
process_csv_with_gpt(
    csv_file_name='FILE_NAME.csv',
    prompt_column='anamnesis',
    output_column='AITextoRetornado',
    prompt_template="""
    PART1

    {content}

    PART2
    """,
    total_lines_to_process=20 # Total lines to process this run
)


Saved progress at line 9
Saved progress at line 19
Processing completed and saved.


In [None]:
# Mapping and extract json 1

import csv
import re

# Input CSV with the raw model answers, and the CSV that receives the parsed fields
csv_file_path = 'FILE_NAME.csv'
new_csv_file_path = 'FILE_NAME_JSON1.csv'

# Load every row up front as a dict keyed by column name
with open(csv_file_path, mode='r', encoding='utf-8') as file:
    csv_reader = csv.DictReader(file)
    lines = list(csv_reader)

# JSON key found in the model answer -> CSV column that stores its value
json_to_csv_mapping = {
    'suspeita': 'AI Anafilaxia',
    'alergenico': 'AI Alérgeno',
    'raciocinio': 'AI Raciocínio',
    'probabilidade': 'AI Probabilidade'
}

# Matches "key": value where value is a bare boolean, a quoted string or a number
pair_pattern = re.compile(r'"(\w+)":\s*(true|false|".+?"|[\d,]+(\.\d+)?)')

# Copy the recognised key/value pairs of each answer into the mapped columns
updated_lines = []
for line in lines:
    for found in pair_pattern.finditer(line['AITextoRetornado']):
        json_key = found.group(1)
        raw_value = found.group(2)

        # Quoted strings are stored without their surrounding quotes
        is_quoted = raw_value.startswith('"') and raw_value.endswith('"')
        cleaned_value = raw_value[1:-1] if is_quoted else raw_value

        # Unknown JSON keys map to None and columns missing from the CSV are skipped
        target_column = json_to_csv_mapping.get(json_key)
        if target_column in line:
            line[target_column] = cleaned_value

    updated_lines.append(line)

# Write the rows, with the same columns as the input, to the new CSV file
with open(new_csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=csv_reader.fieldnames)
    writer.writeheader()
    for updated in updated_lines:
        writer.writerow(updated)

This cell complements the first pass by adding a second validation step: it sends rows to the OpenAI chat completions API again, this time with the `gpt-4o` model and only when a condition in another column is met. It reads the CSV file using the `pandas` library and identifies the first empty row in the output column to determine the starting line for processing. It then iterates over a specified range of rows and, for each row whose output column is empty and whose condition column is met, combines a prompt template with content from a designated column and calls the model to generate a response. The response is inserted into the output column of the CSV file. The code saves progress every 10 rows and writes the updated DataFrame back to the CSV file at the end. The main tools used are `pandas` for data manipulation and `openai` for the API calls. The objective is to automate the generation of text responses for rows in a CSV file based on input data, facilitating tasks such as text analysis or content generation while considering specific conditions and performing double validation.

In [None]:
import pandas as pd
import openai
from openai.error import APIError

# Set your OpenAI API key.
# The OPENAI_API_KEY environment variable takes precedence so the real secret does not
# have to be written into (and shared with) the notebook; the 'API_KEY' placeholder is
# only used as a fallback when the variable is not set.
import os
openai.api_key = os.environ.get('OPENAI_API_KEY', 'API_KEY')

# Function to call an OpenAI chat model (GPT-4o unless `model` says otherwise)
def callGPT4(message, model="gpt-4o"):
    """Send a single user message to the OpenAI chat completions API.

    Args:
        message: Text sent as the only "user" message of the conversation.
        model: Name of the chat model to call. Defaults to the model this cell
            originally hard-coded.

    Returns:
        The content of the first choice in the response, or None when the
        OpenAI SDK raises an error (the error is printed, not re-raised).
    """
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[{"role": "user", "content": message}],
            temperature=0.0
        )
        return response['choices'][0]['message']['content']
    except openai.error.OpenAIError as error:
        # OpenAIError is the base class of the SDK's exceptions, so rate-limit,
        # timeout and connection failures are reported here as well instead of
        # aborting the caller's loop and losing the rows not yet saved.
        print(f"API Error: {error}")
        return None

# Function to process CSV using the OpenAI chat API within a specified line range
def process_csv_with_gpt(csv_file_name, prompt_column, output_column, condition_column, prompt_template, total_lines_to_process):
    """Fill empty `output_column` cells for rows whose condition holds; save in place.

    A row is eligible when its output cell is empty and its condition cell is
    true (see `_condition_is_met`). The window handled in one call starts at the
    first eligible row and spans at most `total_lines_to_process` rows; rows in
    the window that are not eligible are left untouched.

    Args:
        csv_file_name: Path of the CSV file; it is read and then overwritten.
        prompt_column: Column whose value replaces `{content}` in the template.
        output_column: Column that receives the model's answer.
        condition_column: Column that gates whether a row is sent to the model.
        prompt_template: `str.format` template containing a `{content}` field.
        total_lines_to_process: Maximum number of rows in the window for this run.

    Returns:
        None. Progress is written to `csv_file_name` every 10 rows of the window
        and once more at the end.
    """
    # Read the CSV file
    df = pd.read_csv(csv_file_name)

    # Rows that still need an answer AND satisfy the condition
    output_is_empty = df[output_column].isna()
    condition_is_met = df[condition_column].map(_condition_is_met).astype(bool)
    eligible = (output_is_empty & condition_is_met).to_numpy()

    if not eligible.any():
        print("No rows with an empty output and a satisfied condition; nothing to process.")
        return

    # A completely empty column is parsed as float64; make room for text answers
    df[output_column] = df[output_column].astype(object)

    # Start at the first eligible row. Starting at the first empty output cell would
    # pin the window to rows whose condition is false, since those stay empty forever.
    start_line = int(eligible.argmax())
    end_line = min(start_line + total_lines_to_process, len(df))

    window = df.iloc[start_line:end_line]
    for offset, (index, row) in enumerate(window.iterrows()):
        if eligible[start_line + offset]:
            # Combine the prompt template with the content of the specified column
            combined_message = prompt_template.format(content=row[prompt_column])

            chat_response = callGPT4(combined_message)

            # A failed call returns None; the cell stays empty and is retried next run
            if chat_response is not None:
                df.at[index, output_column] = chat_response

        # Save periodically after processing every 10 lines
        if (offset + 1) % 10 == 0:
            df.to_csv(csv_file_name, index=False)
            print(f"Saved progress at line {index}")

    # Save the updated DataFrame back to the same CSV file after processing all lines
    df.to_csv(csv_file_name, index=False)
    print("Processing completed and saved.")


def _condition_is_met(value):
    """Interpret one cell of the condition column as a boolean.

    Missing values are False. Text counts as True only when it reads 'true' or
    'verdadeiro' (case-insensitive, surrounding blanks ignored), so the string
    'false' is not mistaken for a truthy value. Other types use `bool()`.
    """
    if pd.isna(value):
        return False
    if isinstance(value, str):
        return value.strip().lower() in ('true', 'verdadeiro')
    return bool(value)

# Example usage: second pass over FILE_NAME_JSON1.csv.
# Rows are gated on the 'AI Anafilaxia' column, the 'anamnesis' text replaces {content}
# in the (Portuguese) prompt, and the answer is stored in '2-AITextoRetornado'.
# NOTE(review): the prompt refers to "critérios 1 ou 2" but never defines them in the
# text sent to the model -- confirm whether the criteria were meant to be included.
process_csv_with_gpt(
    csv_file_name='FILE_NAME_JSON1.csv',
    prompt_column='anamnesis',
    output_column='2-AITextoRetornado',
    condition_column='AI Anafilaxia',
    prompt_template="""
    Como um médico clínico geral, sua tarefa é examinar textos médicos e determinar se há sinais de anafilaxia no paciente. Esta avaliação é feita com base nos sintomas, sinais vitais e diagnósticos documentados pelos médicos.
    Analise o seguinte texto médico:

    {content}

    Como resultado da sua análise você deve:
    1. Informar se você suspeita que o paciente tem anafilaxia.
    2. Se você tem uma suspeita, explicar como os critérios 1 ou 2 são satisfeitos (cite trechos do texto). Se não, citar quais aspectos dos critérios 1 e 2 estão faltando.
    3. Represente, com um número de 0 a 1, a probabilidade do paciente ter anafilaxia, em sua opinião.
    Lembre de fazer a sua análise passo a passo.

    Baseando-se em sua análise, produza também um objeto JSON com as chaves:

    - "suspeita": com valor "true" se existe uma suspeita de anafilaxia ou "false" se não.
    - "raciocinio": explicação de seu raciocínio para concluir a suspeita ou ausência da anafilaxia.
    - "alergenico": a substância alergênica, se ela for identificada, ou a sentença 'Não identificado'.
    - "probabilidade": um valor de 0 a 1 indicando a probabilidade de ser anafilaxia, em sua opinião.

    Prossiga cuidadosamente.
    """,
    total_lines_to_process=10 # Total lines to process this run
)


Saved progress at line 19
Processing completed and saved.


In [None]:
# Mapping and extract json 2

import csv
import re

# Input CSV with the second-pass answers, and the CSV that receives the parsed fields
csv_file_path = 'FILE_NAME_JSON1.csv'
new_csv_file_path = 'FILE_NAME_JSON2.csv'

# Load every row up front as a dict keyed by column name
with open(csv_file_path, mode='r', encoding='utf-8') as file:
    csv_reader = csv.DictReader(file)
    lines = list(csv_reader)

# JSON key found in the model answer -> second-pass CSV column that stores its value
json_to_csv_mapping = {
    'alergenico': '2-AI Alérgeno',
    'raciocinio': '2-AI Raciocínio',
    'suspeita': '2- AI Anafilaxia',
    'probabilidade': '2-AI Probabilidade'
}

# Matches "key": value where value is a bare boolean, a quoted string or a number
pair_pattern = re.compile(r'"(\w+)":\s*(true|false|".*?"|\d+(\.\d+)?)')

# Copy the recognised key/value pairs of each answer into the mapped columns
updated_lines = []
for line in lines:
    matches = pair_pattern.findall(line['2-AITextoRetornado'])

    # Debugging aid: show what was extracted for this row
    print(f"Matches for line: {matches}")

    # Each match is (key, value, optional decimal part); only the first two are used
    for json_key, raw_value, _decimal_part in matches:
        # Quoted strings are stored without their surrounding quotes
        is_quoted = raw_value.startswith('"') and raw_value.endswith('"')
        cleaned_value = raw_value[1:-1] if is_quoted else raw_value

        # Unknown JSON keys map to None and columns missing from the CSV are skipped
        target_column = json_to_csv_mapping.get(json_key)
        if target_column in line:
            line[target_column] = cleaned_value

    updated_lines.append(line)

# Write the rows, with the same columns as the input, to the new CSV file
with open(new_csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=csv_reader.fieldnames)
    writer.writeheader()
    for updated in updated_lines:
        writer.writerow(updated)

In [None]:
#Categorization 1

import csv

# Path to your original CSV file
csv_file_path = 'FILE_NAME_JSON2.csv'

# Path to the new CSV file
new_csv_file_path = 'FILE_NAME_CATEGORIZED.csv'

# Spellings accepted as booleans (English and Portuguese), compared after strip().lower()
TRUE_LABELS = ('true', 'verdadeiro')
FALSE_LABELS = ('false', 'falso')


def _parse_flag(raw):
    """Return 'true' or 'false' for a recognised boolean spelling, else None."""
    text = (raw or '').strip().lower()
    if text in TRUE_LABELS:
        return 'true'
    if text in FALSE_LABELS:
        return 'false'
    return None


def _confusion_label(expected, predicted):
    """Map an (expected, predicted) pair of 'true'/'false' to TP, TN, FP or FN."""
    if expected == 'true':
        return 'TP' if predicted == 'true' else 'FN'
    return 'FP' if predicted == 'true' else 'TN'


# Reading the original CSV file
with open(csv_file_path, mode='r', encoding='utf-8') as file:
    csv_reader = csv.DictReader(file)
    lines = list(csv_reader)
    # fieldnames is None for an empty file; copy it so the reader is not mutated
    fieldnames = list(csv_reader.fieldnames or [])

# Add the 'categorizacao' column if the input does not have it yet
if 'categorizacao' not in fieldnames:
    fieldnames.append('categorizacao')

# Compare the reference label with the first AI answer
for line in lines:
    # Reference label: anything not recognised as true counts as false (as before)
    expected = 'true' if _parse_flag(line['Anafilaxia']) == 'true' else 'false'

    # AI answer: empty or unrecognised values are reported as 'Erro'. Previously the
    # value was forced to 'true'/'false' first, so only empty cells reached 'Erro'
    # and any other unexpected text was silently counted as a negative prediction.
    predicted = _parse_flag(line['AI Anafilaxia'])
    if predicted is None:
        line['categorizacao'] = 'Erro'
    else:
        line['categorizacao'] = _confusion_label(expected, predicted)

# Writing the updated data back to a new CSV file
with open(new_csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(lines)



In [None]:
#Categorization 2

import csv

# Path to your original CSV file
csv_file_path = 'FILE_NAME_CATEGORIZED.csv'

# Path to the new CSV file
new_csv_file_path = 'FILE_NAME_CATEGORIZED2.csv'

# Spellings accepted as booleans (English and Portuguese), compared after strip().lower()
TRUE_LABELS = ('true', 'verdadeiro')
FALSE_LABELS = ('false', 'falso')


def _parse_flag(raw):
    """Return 'true' or 'false' for a recognised boolean spelling, else None."""
    text = (raw or '').strip().lower()
    if text in TRUE_LABELS:
        return 'true'
    if text in FALSE_LABELS:
        return 'false'
    return None


def _confusion_label(expected, predicted):
    """Map an (expected, predicted) pair of 'true'/'false' to TP, TN, FP or FN."""
    if expected == 'true':
        return 'TP' if predicted == 'true' else 'FN'
    return 'FP' if predicted == 'true' else 'TN'


# Reading the original CSV file
with open(csv_file_path, mode='r', encoding='utf-8') as file:
    csv_reader = csv.DictReader(file)
    lines = list(csv_reader)
    # fieldnames is None for an empty file; copy it so the reader is not mutated
    fieldnames = list(csv_reader.fieldnames or [])

# Add the '2-categorizacao' column if the input does not have it yet
if '2-categorizacao' not in fieldnames:
    fieldnames.append('2-categorizacao')

# Performing categorization
for line in lines:
    # Same rule as the first categorization: the Portuguese spelling is accepted and
    # anything not recognised as true counts as false, so both passes agree on a row.
    expected = 'true' if _parse_flag(line['Anafilaxia']) == 'true' else 'false'

    # NOTE(review): this column name has a space after "2-", unlike the other "2-AI ..."
    # columns; .get() hides a missing column, so confirm the CSV header matches.
    second_answer = (line.get('2- AI Anafilaxia') or '').strip()

    if not second_answer:
        # No second opinion for this row: keep the first-pass category
        line['2-categorizacao'] = line['categorizacao']
        continue

    # An unrecognised answer used to leave the cell silently blank; flag it instead
    predicted = _parse_flag(second_answer)
    if predicted is None:
        line['2-categorizacao'] = 'Erro'
    else:
        line['2-categorizacao'] = _confusion_label(expected, predicted)

# Writing the updated data back to a new CSV file
with open(new_csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(lines)


In [None]:
import csv

# Path to your original CSV file
csv_file_path = 'FILE_NAME_JSON2.csv'

# Path to the final CSV file
final_csv_file_path = 'FILE_NAME_CATEGORIZED.csv'

# Spellings accepted as booleans (English and Portuguese), compared after strip().lower()
TRUE_LABELS = ('true', 'verdadeiro')
FALSE_LABELS = ('false', 'falso')


def _parse_flag(raw):
    """Return 'true' or 'false' for a recognised boolean spelling, else None."""
    text = (raw or '').strip().lower()
    if text in TRUE_LABELS:
        return 'true'
    if text in FALSE_LABELS:
        return 'false'
    return None


def _confusion_label(expected, predicted):
    """Map an (expected, predicted) pair of 'true'/'false' to TP, TN, FP or FN."""
    if expected == 'true':
        return 'TP' if predicted == 'true' else 'FN'
    return 'FP' if predicted == 'true' else 'TN'


# Reading the original CSV file
with open(csv_file_path, mode='r', encoding='utf-8') as file:
    csv_reader = csv.DictReader(file)
    lines = list(csv_reader)
    # fieldnames is None for an empty file; copy it so the reader is not mutated
    fieldnames = list(csv_reader.fieldnames or [])

# Add the two category columns if the input does not have them yet
for column in ('categorizacao', '2-categorizacao'):
    if column not in fieldnames:
        fieldnames.append(column)

for line in lines:
    # Reference label: anything not recognised as true counts as false. The same
    # value is used for both categorizations so they cannot disagree on it.
    expected = 'true' if _parse_flag(line['Anafilaxia']) == 'true' else 'false'

    # First categorization. Empty or unrecognised AI answers are reported as 'Erro';
    # previously the value was forced to 'true'/'false' first, so unexpected text was
    # silently counted as a negative prediction.
    first_predicted = _parse_flag(line['AI Anafilaxia'])
    if first_predicted is None:
        line['categorizacao'] = 'Erro'
    else:
        line['categorizacao'] = _confusion_label(expected, first_predicted)

    # Second categorization.
    # NOTE(review): this column name has a space after "2-", unlike the other "2-AI ..."
    # columns; .get() hides a missing column, so confirm the CSV header matches.
    second_answer = (line.get('2- AI Anafilaxia') or '').strip()
    if not second_answer:
        # No second opinion for this row: keep the first-pass category
        line['2-categorizacao'] = line['categorizacao']
        continue

    # An unrecognised answer used to leave the cell silently blank; flag it instead
    second_predicted = _parse_flag(second_answer)
    if second_predicted is None:
        line['2-categorizacao'] = 'Erro'
    else:
        line['2-categorizacao'] = _confusion_label(expected, second_predicted)

# Writing the updated data back to a new CSV file
with open(final_csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(lines)
