In [1]:
import requests
import pandas as pd
import pprint
import ssl
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
import os
import time
from requests.exceptions import ReadTimeout
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm

In [2]:
# importando bibliotecas conexão PDGT
import json
import argparse
import subprocess
import boto3
import time
from pyathena import connect
import pandas.io.sql as sqlio
import sys
from ydata_profiling import ProfileReport
from botocore import UNSIGNED
from botocore.config import Config
import boto3.session
from botocore import exceptions
from scipy.stats import chi2_contingency

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class CustomException(Exception):
    pass

In [4]:
json_manifest_dbt = "target/manifest.json"
athena_bucket = "todos-athena-us-east-1"
athena_tmp_folder = f"s3://{athena_bucket}/"

In [5]:
def execute_athena_sql(query):
    client = boto3.client('athena', region_name='us-east-1')
    queryStart = client.start_query_execution(
    QueryString = query,
    ResultConfiguration = { 'OutputLocation': athena_tmp_folder})
    queryExecution = client.get_query_execution(QueryExecutionId=queryStart['QueryExecutionId'])
    while queryExecution['QueryExecution']['Status']['State'] in ('RUNNING', 'QUEUED'):
        time.sleep(5)
        queryExecution = client.get_query_execution(QueryExecutionId=queryStart['QueryExecutionId'])

In [6]:
def execute_athena_query(query):
    cursor = connect(s3_staging_dir=athena_tmp_folder,
                    region_name="us-east-1").cursor()
    cursor.execute(query)
    colls=','.join(str(f"{e[0]}") for e in cursor.description)
    results=pd.DataFrame(list(cursor), columns=colls.split(","))
    return results

In [7]:
query_endereco = """
--BASE PACIENTES
--SELECT count(*) from (
WITH agg_pacientes AS (
    SELECT
        ag.id_paciente,
        ag.cpf,
        MIN(ag.dt_agendamento) AS min_dt_ag,
        MAX(ag.dt_agendamento) AS max_dt_ag,
        DATE_DIFF('year', MIN(ag.dt_agendamento), current_date) AS anos_utilizacao,
        DATE_DIFF('month', MIN(ag.dt_agendamento), current_date) AS meses_utilizacao,
        DATE_DIFF('day', MAX(ag.dt_agendamento), current_date) AS dias_ult_utilizacao,
        --DATE_DIFF('month', MAX(ag.dt_agendamento), current_date) AS dias_ult_utilizacao,
        COALESCE(utlz.qtd_utilizacao, 0) AS qtd_utilizacao,
        COALESCE(cst.qtd_consultas, 0) AS qtd_consultas,
        --COALESCE(vexpc.qtd_exam_proc, 0) AS qtd_exam_proc,
        COALESCE(oe.qtd_orc_executado, 0) AS qtd_exam_proc,
        COALESCE(ptcl.qtd_particular, 0) AS qtd_particular,
        COALESCE(cst.tt_consulta, 0) AS tt_consulta,
        COALESCE(cst.tm_consulta, 0) AS tm_consulta,
        --COALESCE(expc.tt_exam_proc, 0) AS tt_exam_proc,
        COALESCE(oe.tt_orc_executado, 0) AS tt_exam_proc,
        COALESCE(one.qtd_orc_nao_executado, 0) AS qtd_orc_nao_executado,
        COALESCE(one.tt_orc_nao_executado, 0) AS tt_orc_nao_executado,
        CASE WHEN COALESCE(cst.qtd_consultas, 0) <> 0 THEN (COALESCE(cst.tt_consulta, 0) + COALESCE(oe.tt_orc_executado, 0)) / cst.qtd_consultas
             ELSE NULL END AS tm_utilizacao
    FROM pdgt_amorsaude_operacoes.fl_agendamentos ag
    LEFT JOIN (
        SELECT
            ag.id_paciente,
            COUNT(*) AS qtd_consultas,
            SUM(ag.valor) AS tt_consulta,
            AVG(ag.valor) AS tm_consulta
        FROM pdgt_amorsaude_operacoes.fl_agendamentos ag
        WHERE 1=1
            AND ag.id_grupoprocedimento IN (1, 8) --'Bonificação', 'Retorno'?
            AND ag.id_status IN (2, 3, 4, 5, 33, 200, 201, 202, 203, 204, 205, 206, 207)
        GROUP BY
            ag.id_paciente
    ) cst ON cst.id_paciente = ag.id_paciente
    LEFT JOIN (
        SELECT
            ag.id_paciente,
            COUNT(*) AS qtd_exam_proc,
            SUM(ag.valor) AS tt_exam_proc
        FROM pdgt_amorsaude_operacoes.fl_agendamentos ag
        WHERE 1=1
            AND ag.id_grupoprocedimento IN (5, 4) --'Exames Laboratoriais' não tem essa classificação
            AND ag.id_status IN (2, 3, 4, 5, 33, 200, 201, 202, 203, 204, 205, 206, 207)
        GROUP BY
            ag.id_paciente
    ) expc ON expc.id_paciente = ag.id_paciente
    LEFT JOIN (
        SELECT
        pp.pacienteid AS id_paciente,
        COUNT(*) AS qtd_orc_executado,
        SUM(pp.valor) AS tt_orc_executado
    FROM todos_data_lake_trusted_feegow.propostas pp
    WHERE 1=1
        AND pp.staid IN (5, 2)
    GROUP BY
        pp.pacienteid
	) oe ON oe.id_paciente = ag.id_paciente
    LEFT JOIN (
        SELECT
            pp.pacienteid AS id_paciente,
            COUNT(*) AS qtd_orc_nao_executado,
            SUM(pp.valor) AS tt_orc_nao_executado
        FROM todos_data_lake_trusted_feegow.propostas pp
        WHERE 1=1
            AND pp.staid NOT IN (5, 2)
        GROUP BY
            pp.pacienteid
    ) one ON one.id_paciente = ag.id_paciente
    LEFT JOIN (
        SELECT
            ag.id_paciente,
            COUNT(*) AS qtd_particular
        FROM pdgt_amorsaude_operacoes.fl_agendamentos ag
        WHERE 1=1
            AND ag.id_grupoprocedimento IN (1, 8)
            AND ag.id_status IN (2, 3, 4, 5, 33, 200, 201, 202, 203, 204, 205, 206, 207)
            AND ag.nm_tabela = 'PARTICULAR*'
        GROUP BY
            ag.id_paciente
    ) ptcl ON ptcl.id_paciente = ag.id_paciente
    LEFT JOIN (
        SELECT
            base.id_paciente,
            SUM(base.visitas) AS qtd_utilizacao
        FROM (
            SELECT
                ag.id_paciente,
                ag.dt_agendamento,
                COUNT(*) AS agregador,
                1 AS visitas
            FROM pdgt_amorsaude_operacoes.fl_agendamentos ag
            WHERE 1=1
                AND ag.id_grupoprocedimento IN (1, 8, 5, 4) --'Exames Laboratoriais',
                AND ag.id_status IN (2, 3, 4, 5, 33, 200, 201, 202, 203, 204, 205, 206, 207)
            GROUP BY
                ag.id_paciente, ag.dt_agendamento
        ) base
        GROUP BY
            base.id_paciente
    ) utlz ON utlz.id_paciente = ag.id_paciente
    LEFT JOIN (
        SELECT
            base.id_paciente,
            SUM(base.visitas) AS qtd_exam_proc
        FROM (
            SELECT
                ag.id_paciente,
                ag.dt_agendamento,
                COUNT(*) AS agregador,
                1 AS visitas
            FROM pdgt_amorsaude_operacoes.fl_agendamentos ag
            WHERE 1=1
                AND ag.id_grupoprocedimento IN (5, 4) --'Exames Laboratoriais' não tem na agendamentos?
                AND ag.id_status IN (2, 3, 4, 5, 33, 200, 201, 202, 203, 204, 205, 206, 207)
            GROUP BY
                ag.id_paciente, ag.dt_agendamento
        ) base
        GROUP BY
            base.id_paciente
    ) vexpc ON vexpc.id_paciente = ag.id_paciente
    WHERE 1=1
    AND ag.id_status IN (2, 3, 4, 5, 33, 200, 201, 202, 203, 204, 205, 206, 207)
    GROUP BY
        ag.id_paciente,
        ag.cpf,
        utlz.qtd_utilizacao,
        cst.qtd_consultas,
        cst.tm_consulta,
        vexpc.qtd_exam_proc,
        expc.tt_exam_proc,
        one.qtd_orc_nao_executado,
        one.tt_orc_nao_executado,
        ptcl.qtd_particular,
        cst.tt_consulta,
        oe.tt_orc_executado,
        oe.qtd_orc_executado
),
canal_mais_utilizado AS (
    WITH canal_paciente AS (
        SELECT
            ag.id_paciente,
            COALESCE(ag.nm_canal, 'Não informado') nm_canal,
            COUNT(*) AS qtd_por_canal
        FROM pdgt_amorsaude_operacoes.fl_agendamentos ag
        WHERE 1=1
        GROUP BY
            ag.id_paciente,
            ag.nm_canal
    ),
    max_canal_paciente AS (
        SELECT
            cp.id_paciente,
            MAX(cp.qtd_por_canal) AS max_qtd_canal
        FROM canal_paciente cp
        GROUP BY cp.id_paciente
    )
    SELECT DISTINCT
        cp.id_paciente,
        cp.nm_canal
    FROM canal_paciente cp
    LEFT JOIN max_canal_paciente mcp ON cp.id_paciente = mcp.id_paciente
    WHERE 1=1
    AND cp.qtd_por_canal = mcp.max_qtd_canal
),
regional_mais_utilizada AS (
        WITH regional_paciente AS (
        SELECT
            ag.id_paciente,
            ag.regional,
            COUNT(*) AS qtd_por_regional
        FROM pdgt_amorsaude_operacoes.fl_agendamentos ag
        WHERE 1=1
        AND ag.regional IS NOT NULL
        AND ag.id_status IN (2, 3, 4, 5, 33, 200, 201, 202, 203, 204, 205, 206, 207)
        GROUP BY
            ag.id_paciente,
            ag.regional
    ),
    max_regional_paciente AS (
        SELECT
            rp.id_paciente,
            MAX(rp.qtd_por_regional) AS max_qtd_regional
        FROM regional_paciente rp
        GROUP BY rp.id_paciente
    )
    SELECT DISTINCT
        rp.id_paciente,
        rp.regional
    FROM regional_paciente rp
    LEFT JOIN max_regional_paciente mrp ON rp.id_paciente = mrp.id_paciente
    WHERE 1=1
    AND rp.qtd_por_regional = mrp.max_qtd_regional
),
tabela_consulta AS (
        SELECT
        ag.id_paciente,
        ag.dt_agendamento,
        CASE WHEN ag.nm_tabela = 'Cartão de TODOS*' THEN 'Cartão de TODOS'
             WHEN ag.nm_tabela = 'PARTICULAR*' THEN 'Particular'
             ELSE 'Outros' END AS tabela_particular
    FROM pdgt_amorsaude_operacoes.fl_agendamentos ag
    WHERE 1=1
    AND ag.id_grupoprocedimento IN (1, 8) --Consultas, Sessão
    AND ag.id_status IN (2, 3, 4, 5, 33, 200, 201, 202, 203, 204, 205, 206, 207)
),
especialidade_mais_utilizada AS (
        WITH especialidade_paciente AS (
        SELECT
            ag.id_paciente,
            ag.nm_especialidade AS especialidade,
            COUNT(*) AS qtd_por_especialidade
        FROM pdgt_amorsaude_operacoes.fl_agendamentos ag
        WHERE 1=1
        AND ag.nm_especialidade IS NOT NULL
        AND ag.id_status IN (2, 3, 4, 5, 33, 200, 201, 202, 203, 204, 205, 206, 207)
        GROUP BY
            ag.id_paciente,
            ag.nm_especialidade
    ),
    ranked_especialidade_paciente AS (
        SELECT
            ep.id_paciente,
            ep.especialidade,
            ep.qtd_por_especialidade,
            ROW_NUMBER() OVER(PARTITION BY ep.id_paciente ORDER BY ep.qtd_por_especialidade DESC) AS row_num
        FROM especialidade_paciente ep
    ),
    filtered_especialidade_paciente AS (
        SELECT
            id_paciente,
            especialidade
        FROM ranked_especialidade_paciente
        WHERE row_num = 1
    )
    SELECT DISTINCT
        id_paciente,
        especialidade
    FROM filtered_especialidade_paciente
    ORDER BY id_paciente
),
tabela_mais_utilizada AS (
        WITH tabela_paciente AS (
        SELECT
            ag.id_paciente,
            CASE WHEN ag.nm_tabela = 'Cartão de TODOS*' THEN 'Cartão de TODOS'
             WHEN ag.nm_tabela = 'PARTICULAR*' THEN 'Particular'
             ELSE 'Outros' END AS tabela,
            COUNT(*) AS qtd_por_tabela
        FROM pdgt_amorsaude_operacoes.fl_agendamentos ag
        WHERE 1=1
        AND ag.nm_tabela IS NOT NULL
        AND ag.id_status IN (2, 3, 4, 5, 33, 200, 201, 202, 203, 204, 205, 206, 207)
        GROUP BY
            ag.id_paciente,
            ag.nm_tabela
    ),
    ranked_tabela_paciente AS (
        SELECT
            ep.id_paciente,
            ep.tabela,
            ep.qtd_por_tabela,
            ROW_NUMBER() OVER(PARTITION BY ep.id_paciente ORDER BY ep.qtd_por_tabela DESC) AS row_num
        FROM tabela_paciente ep
    ),
    filtered_tabela_paciente AS (
        SELECT
            id_paciente,
            tabela
        FROM ranked_tabela_paciente
        WHERE row_num = 1
    )
    SELECT DISTINCT
        id_paciente,
        tabela
    FROM filtered_tabela_paciente
    ORDER BY id_paciente
),
unidade_mais_utilizada AS (
    WITH unidade_paciente AS (
        SELECT
            ag.id_paciente,
            ag.id_unidade,
            ag.unidade,
            COUNT(*) AS qtd_unidade
        FROM pdgt_amorsaude_operacoes.fl_agendamentos ag
        WHERE 1=1
        GROUP BY
            ag.id_paciente,
            ag.id_unidade,
            ag.unidade
    ),
    max_unidade_paciente AS (
        SELECT
            up.id_paciente,
            MAX(up.qtd_unidade) AS max_qtd_unidade
        FROM unidade_paciente up
        GROUP BY 
        	up.id_paciente
    )
    SELECT DISTINCT
        up.id_paciente,
        up.id_unidade,
        up.unidade
    FROM unidade_paciente up
    LEFT JOIN max_unidade_paciente mup ON up.id_paciente = mup.id_paciente
    WHERE 1=1
    AND up.qtd_unidade = mup.max_qtd_unidade
)
SELECT DISTINCT
    ap.*,
    minpg.tabela_particular AS primeiro_pg,
    CASE WHEN ap.qtd_consultas = ap.qtd_particular AND ap.qtd_consultas <> 0 THEN 1
         ELSE 0 END AS pac_particular,
    CASE WHEN ap.dias_ult_utilizacao > 365 THEN 1
         WHEN ap.dias_ult_utilizacao BETWEEN 181 AND 365 THEN 2
         WHEN ap.dias_ult_utilizacao BETWEEN 91 AND 180 THEN 3
         WHEN ap.dias_ult_utilizacao BETWEEN 46 AND 90 THEN 4
         WHEN ap.dias_ult_utilizacao <= 45 THEN 5
         ELSE NULL END AS score_r,
    CASE WHEN ap.qtd_consultas = 1 THEN 1
         WHEN ap.qtd_consultas = 2 THEN 2
         WHEN ap.qtd_consultas = 3 THEN 3
         WHEN ap.qtd_consultas BETWEEN 4 AND 5 THEN 4
         WHEN ap.qtd_consultas > 5 THEN 5
         ELSE NULL END AS score_f,
    CASE WHEN ap.tm_utilizacao <= 28 THEN 1
         WHEN ap.tm_utilizacao BETWEEN 28.01 AND 56 THEN 2
         WHEN ap.tm_utilizacao BETWEEN 56.01 AND 84 THEN 3
         WHEN ap.tm_utilizacao BETWEEN 84.01 AND 140 THEN 4
         WHEN ap.tm_utilizacao > 140 THEN 5
         ELSE NULL END AS score_v,
    COALESCE(cmu.nm_canal, 'Não informado') AS canal_mais_utilizado,
    COALESCE(UPPER(pe.estado), 'Não informado') AS estado,
    UPPER(pe.cidade) AS cidade_paciente,
    COALESCE(UPPER(pe.bairro), 'Não informado') AS bairro,
    UPPER(CONCAT(regexp_replace(regexp_replace(pe.logradouro, '[0-9]', ''), ', ', ''), ', ', pe.numero, ', ', pe.bairro, ', ', pe.cidade, ', ', pe.estado)) endereco_paciente,
    DATE_DIFF('year', pc.nascimento, current_date) AS idade,
    sx.nomesexo AS sexo,
    rmu.regional,
    emu.especialidade,
    tmu.tabela,
    umu.unidade,
    UPPER(CONCAT(un.endereco, ', ', un.numero, ', ', un.bairro, ', ', un.cidade, ', ', un.estado)) AS endereco_unidade,
    UPPER(un.cidade) AS cidade_unidade
FROM agg_pacientes ap
LEFT JOIN canal_mais_utilizado cmu ON cmu.id_paciente = ap.id_paciente
LEFT JOIN regional_mais_utilizada rmu ON rmu.id_paciente = ap.id_paciente
LEFT JOIN especialidade_mais_utilizada emu ON emu.id_paciente = ap.id_paciente
LEFT JOIN tabela_consulta minpg ON minpg.id_paciente = ap.id_paciente AND minpg.dt_agendamento = ap.min_dt_ag
LEFT JOIN tabela_mais_utilizada tmu ON tmu.id_paciente = ap.id_paciente
LEFT JOIN unidade_mais_utilizada umu ON umu.id_paciente = ap.id_paciente
LEFT JOIN todos_data_lake_trusted_feegow.paciente_endereco pe ON pe.paciente_id = ap.id_paciente
LEFT JOIN todos_data_lake_trusted_feegow.pacientes pc ON pc.id = ap.id_paciente
LEFT JOIN todos_data_lake_trusted_feegow.sexo sx ON sx.id = pc.sexo
LEFT JOIN todos_data_lake_trusted_feegow.unidades un ON umu.id_unidade = un.id 
WHERE 1=1
AND ap.cpf IS NOT NULL
AND pc.nascimento IS NOT NULL
AND pe.logradouro IS NOT NULL 
AND DATE_DIFF('year', pc.nascimento, current_date) BETWEEN 0 AND 100
--AND LENGTH(pe.estado) = 2
AND pe.estado IN ('AC', 'AL', 'AM', 'AP', 'BA', 'CE', 'DF', 'ES', 'GO', 'MA', 'MG', 'MS', 'MT', 'PA', 'PB', 'PE', 'PI', 
'PR', 'RJ', 'RN', 'RO', 'RR', 'RS', 'SC', 'SE', 'SP', 'TO')
--AND RANDOM() <= 0.003
LIMIT 1000
--)
"""

In [8]:
endereco_df = execute_athena_query(query_endereco)

In [9]:
endereco_df

Unnamed: 0,id_paciente,cpf,min_dt_ag,max_dt_ag,anos_utilizacao,meses_utilizacao,dias_ult_utilizacao,qtd_utilizacao,qtd_consultas,qtd_exam_proc,...,bairro,endereco_paciente,idade,sexo,regional,especialidade,tabela,unidade,endereco_unidade,cidade_unidade
0,64249911,00430173393,2023-01-17,2023-09-25,0,8,8,1,1,3,...,CENTRO,"RUA ERNESTO ALVES, 1039, CENTRO, SANTA CRUZ DO...",38,Masculino,NE2,Psicologia,Cartão de TODOS,AmorSaúde São José de Ribamar,"ESTRADA DE RIBAMAR , 21 A, MAIOBÃO BAIRRO TIJU...",SÃO JOSÉ DE RIBAMAR
1,64249911,00430173393,2023-01-17,2023-09-25,0,8,8,1,1,3,...,CENTRO,"RUA ERNESTO ALVES, 1039, CENTRO, SANTA CRUZ DO...",38,Masculino,Sul,Psicologia,Cartão de TODOS,AmorSaúde São José de Ribamar,"ESTRADA DE RIBAMAR , 21 A, MAIOBÃO BAIRRO TIJU...",SÃO JOSÉ DE RIBAMAR
2,17195271,10852072422,2023-03-31,2023-08-22,0,6,42,1,1,1,...,ÁGUA FRIA,"VILA BEIJA-FLOR, 89, ÁGUA FRIA, RECIFE, PE",23,Feminino,NE1,Clinica Médica,Cartão de TODOS,AmorSaúde Recife Encruzilhada,"RUA JOSÉ DE SÁ CARNEIRO, 60, ENCRUZILHADA, REC...",RECIFE
3,18112378,54470579904,2021-05-26,2023-09-28,2,28,5,14,15,2,...,XAXIM,"RUA RAUL DE AZEVEDO MACEDO, 180, XAXIM, CURITI...",58,Feminino,Sul,Clinica Médica,Cartão de TODOS,AmorSaúde Curitiba Pinheirinho,"AVENIDA WINSTON CHURCHILL, 309, CAPÃO RASO, CU...",CURITIBA
4,59811170,11894987900,2021-11-12,2021-11-12,1,22,690,0,0,0,...,PILARZINHO,"RUA VINTE E CINCO DE ABRIL, 335, PILARZINHO, C...",22,,Sul,Clinica Médica,Cartão de TODOS,AmorSaúde Curitiba Centro,"RUA VINTE E QUATRO DE MAIO, 640, REBOUÇAS, CUR...",CURITIBA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,58839330,08163848430,2021-09-09,2023-03-31,2,24,186,5,5,2,...,CATOLÉ,"RUA CRISTO REDENTOR, 58, CATOLÉ, CAMPINA GRAND...",34,Masculino,NE2,Clinica Médica,Cartão de TODOS,AmorSaúde Campina Grande,"RUA TAVARES CAVALCANTE, 59, CENTRO, CAMPINA GR...",CAMPINA GRANDE
996,14901043,27588797860,2021-08-20,2023-02-03,2,25,242,12,16,3,...,CAXANGÁ,"RUA MIGUEL DO NASCIMENTO, 139, CAXANGÁ, SUZANO...",45,Feminino,SP CAV,Nutrição,Cartão de TODOS,AmorSaúde Suzano,"RUA CAMPOS SALES, 504, CENTRO, SUZANO, SP",SUZANO
997,12196760,50267143826,2019-07-04,2022-07-07,4,50,453,8,8,0,...,VILA EMA,"RUA TASSO, 74, VILA EMA, CARAPICUÍBA, SP",15,Masculino,SP CAV,Psicologia,Cartão de TODOS,AmorSaúde Carapicuíba,"AVENIDA TAMARA, 107, CENTRO, CARAPICUÍBA, SP",CARAPICUÍBA
998,59595976,84654660844,2021-10-27,2023-05-18,1,23,138,12,12,2,...,CUTIANOS,"RUA DJALMA ALMEIDA RAMALHO, 52, CUTIANOS, PIED...",78,Feminino,SP Interior,Gastroenterologia,Cartão de TODOS,AmorSaúde Votorantim,"RUA MONTE ALEGRE, 218, CENTRO, VOTORANTIM, SP",VOTORANTIM


In [10]:
geolocator = Nominatim(user_agent="my_geocoder")

In [11]:
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1, max_retries=1, error_wait_seconds=1, swallow_exceptions=True, return_value_on_exception=None)

total_rows = len(endereco_df)

# Function to calculate the distance between two addresses
def calcular_distancia(row, row_number):
    try:
        location1 = geocode(row['endereco_paciente'])
        location2 = geocode(row['endereco_unidade'])

        if location1 and location2:
            coords1 = (location1.latitude, location1.longitude)
            coords2 = (location2.latitude, location2.longitude)
            distancia = geodesic(coords1, coords2).kilometers
            percentage = (row.name + 1) / total_rows * 100  # Adding 1 to start at 1 instead of 0
            print(f"Processing: {percentage:.2f}% completed")
            return distancia
        else:
            # If either location is None, return None to skip the row
            return None
        
    except Exception as e:
        print(f"An error occurred on row {row_number}: {e}")
        return None

# Apply the function to calculate the distance and create a 'Distancia' column
endereco_df['distancia'] = endereco_df.apply(lambda row: calcular_distancia(row, row.name), axis=1)

# Examine the DataFrame with the distance column
endereco_df

Processing: 0.40% completed
Processing: 0.50% completed
Processing: 0.60% completed
Processing: 1.20% completed
Processing: 1.30% completed
Processing: 1.40% completed
Processing: 1.70% completed
Processing: 1.90% completed
Processing: 2.70% completed
Processing: 2.90% completed
Processing: 3.00% completed
Processing: 3.10% completed
Processing: 3.30% completed
Processing: 3.70% completed
Processing: 3.80% completed
Processing: 3.90% completed
Processing: 4.00% completed
Processing: 4.10% completed
Processing: 4.20% completed
Processing: 4.30% completed
Processing: 4.50% completed
Processing: 4.60% completed
Processing: 4.70% completed
Processing: 4.80% completed
Processing: 4.90% completed
Processing: 5.10% completed
Processing: 5.40% completed
Processing: 5.80% completed
Processing: 5.90% completed
Processing: 6.20% completed
Processing: 6.30% completed
Processing: 6.40% completed
Processing: 6.60% completed
Processing: 7.00% completed
Processing: 7.10% completed
Processing: 7.40% co

Unnamed: 0,id_paciente,cpf,min_dt_ag,max_dt_ag,anos_utilizacao,meses_utilizacao,dias_ult_utilizacao,qtd_utilizacao,qtd_consultas,qtd_exam_proc,...,endereco_paciente,idade,sexo,regional,especialidade,tabela,unidade,endereco_unidade,cidade_unidade,distancia
0,64249911,00430173393,2023-01-17,2023-09-25,0,8,8,1,1,3,...,"RUA ERNESTO ALVES, 1039, CENTRO, SANTA CRUZ DO...",38,Masculino,NE2,Psicologia,Cartão de TODOS,AmorSaúde São José de Ribamar,"ESTRADA DE RIBAMAR , 21 A, MAIOBÃO BAIRRO TIJU...",SÃO JOSÉ DE RIBAMAR,
1,64249911,00430173393,2023-01-17,2023-09-25,0,8,8,1,1,3,...,"RUA ERNESTO ALVES, 1039, CENTRO, SANTA CRUZ DO...",38,Masculino,Sul,Psicologia,Cartão de TODOS,AmorSaúde São José de Ribamar,"ESTRADA DE RIBAMAR , 21 A, MAIOBÃO BAIRRO TIJU...",SÃO JOSÉ DE RIBAMAR,
2,17195271,10852072422,2023-03-31,2023-08-22,0,6,42,1,1,1,...,"VILA BEIJA-FLOR, 89, ÁGUA FRIA, RECIFE, PE",23,Feminino,NE1,Clinica Médica,Cartão de TODOS,AmorSaúde Recife Encruzilhada,"RUA JOSÉ DE SÁ CARNEIRO, 60, ENCRUZILHADA, REC...",RECIFE,
3,18112378,54470579904,2021-05-26,2023-09-28,2,28,5,14,15,2,...,"RUA RAUL DE AZEVEDO MACEDO, 180, XAXIM, CURITI...",58,Feminino,Sul,Clinica Médica,Cartão de TODOS,AmorSaúde Curitiba Pinheirinho,"AVENIDA WINSTON CHURCHILL, 309, CAPÃO RASO, CU...",CURITIBA,2.886319
4,59811170,11894987900,2021-11-12,2021-11-12,1,22,690,0,0,0,...,"RUA VINTE E CINCO DE ABRIL, 335, PILARZINHO, C...",22,,Sul,Clinica Médica,Cartão de TODOS,AmorSaúde Curitiba Centro,"RUA VINTE E QUATRO DE MAIO, 640, REBOUÇAS, CUR...",CURITIBA,6.213884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,58839330,08163848430,2021-09-09,2023-03-31,2,24,186,5,5,2,...,"RUA CRISTO REDENTOR, 58, CATOLÉ, CAMPINA GRAND...",34,Masculino,NE2,Clinica Médica,Cartão de TODOS,AmorSaúde Campina Grande,"RUA TAVARES CAVALCANTE, 59, CENTRO, CAMPINA GR...",CAMPINA GRANDE,3.504572
996,14901043,27588797860,2021-08-20,2023-02-03,2,25,242,12,16,3,...,"RUA MIGUEL DO NASCIMENTO, 139, CAXANGÁ, SUZANO...",45,Feminino,SP CAV,Nutrição,Cartão de TODOS,AmorSaúde Suzano,"RUA CAMPOS SALES, 504, CENTRO, SUZANO, SP",SUZANO,3.776854
997,12196760,50267143826,2019-07-04,2022-07-07,4,50,453,8,8,0,...,"RUA TASSO, 74, VILA EMA, CARAPICUÍBA, SP",15,Masculino,SP CAV,Psicologia,Cartão de TODOS,AmorSaúde Carapicuíba,"AVENIDA TAMARA, 107, CENTRO, CARAPICUÍBA, SP",CARAPICUÍBA,
998,59595976,84654660844,2021-10-27,2023-05-18,1,23,138,12,12,2,...,"RUA DJALMA ALMEIDA RAMALHO, 52, CUTIANOS, PIED...",78,Feminino,SP Interior,Gastroenterologia,Cartão de TODOS,AmorSaúde Votorantim,"RUA MONTE ALEGRE, 218, CENTRO, VOTORANTIM, SP",VOTORANTIM,


In [12]:
endereco_df.isna().mean()*100

id_paciente               0.0
cpf                       0.0
min_dt_ag                 0.0
max_dt_ag                 0.0
anos_utilizacao           0.0
meses_utilizacao          0.0
dias_ult_utilizacao       0.0
qtd_utilizacao            0.0
qtd_consultas             0.0
qtd_exam_proc             0.0
qtd_particular            0.0
tt_consulta               0.0
tm_consulta               0.0
tt_exam_proc              0.0
qtd_orc_nao_executado     0.0
tt_orc_nao_executado      0.0
tm_utilizacao             0.2
primeiro_pg               2.2
pac_particular            0.0
score_r                   0.0
score_f                   0.2
score_v                   0.2
canal_mais_utilizado      0.0
estado                    0.0
cidade_paciente           0.0
bairro                    0.0
endereco_paciente         0.0
idade                     0.0
sexo                      2.6
regional                  0.0
especialidade             0.0
tabela                    0.0
unidade                   0.0
endereco_u

In [13]:
endereco_df.to_excel('distancia_paciente_validacao.xlsx', index=False, sheet_name='base')