In [1]:
import ast
import json

import boto3
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

## Connect to the database

In [2]:
secret_name = "database_tcepb"
region_name = "sa-east-1"

# Create a Secrets Manager client
session = boto3.session.Session()
client = session.client(
    service_name="secretsmanager",
    region_name=region_name
)

get_secret_value_response = client.get_secret_value(
    SecretId=secret_name
)

# Parse the secret payload once instead of once per field.
# The SecretString is expected to be a JSON document, so json.loads is the
# right parser (ast.literal_eval rejects JSON's true/false/null). The
# literal_eval fallback keeps accepting a payload written as a Python dict
# literal (e.g. single-quoted keys), which is what the previous code required.
secret_string = get_secret_value_response["SecretString"]
try:
    secret = json.loads(secret_string)
except json.JSONDecodeError:
    secret = ast.literal_eval(secret_string)

# f-string formatting also copes with a numeric "port" value, which the
# previous string concatenation could not.
SERVER_TRIBUNAL = f'{secret["host"]}, {secret["port"]}'
DB_TRIBUNAL = secret["dbname"]
USERNAME = secret["username"]
PASSWORD = secret["password"]

In [3]:
def _odbc_value(value) -> str:
    """Return ``value`` formatted for use as an ODBC connection-string attribute.

    Values containing ``;``, ``{`` or ``}``, or leading/trailing whitespace,
    are wrapped in braces with any ``}`` doubled, so they cannot terminate or
    corrupt the attribute. Any other value is returned unchanged, so
    connection strings that worked before are produced byte-for-byte.
    """
    value = str(value)
    needs_quoting = any(ch in value for ch in ";{}") or value != value.strip()
    if needs_quoting:
        return "{" + value.replace("}", "}}") + "}"
    return value


# Credentials come from the secret store and may contain ODBC metacharacters,
# so UID and PWD are quoted on demand instead of being spliced in raw.
string_connection = (
    "DRIVER={SQL Server};"
    f"SERVER={SERVER_TRIBUNAL};DATABASE={DB_TRIBUNAL};"
    f"UID={_odbc_value(USERNAME)};PWD={_odbc_value(PASSWORD)}"
)
url = URL.create("mssql+pyodbc", query={"odbc_connect": string_connection})
engine = create_engine(url)  # connection engine

## Read query with filters

In [4]:
def _read_sql_file(file_name):
    """Return the whole text of the SQL file ``file_name``."""
    with open(file_name, "r") as sql_file:
        return sql_file.read()


# Base SELECT statement; each filter below is appended to it before execution.
main_query = _read_sql_file("main_query.sql")

# Medicine filters: one per combination of CNAE / ANVISA condition.
filter_medicine_with_cnae_and_anvisa = _read_sql_file("filter_medicine_with_cnae&anvisa.sql")
filter_medicine_with_cnae_and_NOTanvisa = _read_sql_file("filter_medicine_with_cnae&NOTanvisa.sql")
filter_medicine_with_NOTcnae_and_anvisa = _read_sql_file("filter_medicine_with_NOTcnae&anvisa.sql")
filter_medicine_with_NOTcnae_and_NOTanvisa = _read_sql_file("filter_medicine_with_NOTcnae&NOTanvisa.sql")

# Hospital material filters.
filter_hospital_material_with_cnae = _read_sql_file("filter_hospital_material_with_cnae.sql")
filter_hospital_material_with_NOTcnae = _read_sql_file("filter_hospital_material_with_NOTcnae.sql")

# Filters for every other kind of product.
filter_others_with_cnae = _read_sql_file("filter_others_with_cnae.sql")
filter_others_with_NOTcnae = _read_sql_file("filter_others_with_NOTcnae.sql")

# Filters organised as product category -> condition key -> SQL text.
dict_filters = {
    "medicine": {
        "cnae&anvisa": filter_medicine_with_cnae_and_anvisa,
        "cnae&NOTanvisa": filter_medicine_with_cnae_and_NOTanvisa,
        "NOTcnae&anvisa": filter_medicine_with_NOTcnae_and_anvisa,
        "NOTcnae&NOTanvisa": filter_medicine_with_NOTcnae_and_NOTanvisa,
    },
    "hospital_material": {
        "cnae": filter_hospital_material_with_cnae,
        "NOTcnae": filter_hospital_material_with_NOTcnae,
    },
    "others": {
        "cnae": filter_others_with_cnae,
        "NOTcnae": filter_others_with_NOTcnae,
    },
}

In [5]:
# Columns cast to "int32"; every other listed column is cast to "string".
_INT32_COLUMNS = frozenset(
    ["id_produto", "id_medicamento", "id_combustivel", "cnae_fiscal"]
)

# Columns that receive an explicit dtype, in the order they are declared.
_TYPED_COLUMNS = (
    "id_produto",
    "codigo_cfop",
    "codigo_cest",
    "codigo_ncm",
    "codigo_ean",
    "descricao",
    "unidade",
    "id_medicamento",
    "cod_anvisa",
    "id_combustivel",
    "codigo_anp",
    "cnpj",
    "razao_social",
    "nome_da_atividade_economica",
    "cnae_fiscal",
    "cnae_secundaria",
)

# Column name -> pandas dtype, applied to every query result via .astype().
dtype = {
    column: "int32" if column in _INT32_COLUMNS else "string"
    for column in _TYPED_COLUMNS
}

## Run Queries

In [6]:
def _query_to_frame(connection, category, condition):
    """Run ``main_query`` plus the selected filter and return the typed result.

    ``category`` and ``condition`` are the two keys used to look the filter
    up in ``dict_filters``; the result is cast with the ``dtype`` mapping.
    """
    statement = text(main_query + dict_filters[category][condition])
    return pd.read_sql_query(sql=statement, con=connection).astype(dtype)


# All eight queries share a single connection / transaction block.
with engine.begin() as conn:
    df_medicine_with_cnae_and_anvisa = _query_to_frame(conn, "medicine", "cnae&anvisa")
    df_medicine_with_cnae_and_NOTanvisa = _query_to_frame(conn, "medicine", "cnae&NOTanvisa")
    df_medicine_with_NOTcnae_and_anvisa = _query_to_frame(conn, "medicine", "NOTcnae&anvisa")
    df_medicine_with_NOTcnae_and_NOTanvisa = _query_to_frame(conn, "medicine", "NOTcnae&NOTanvisa")

    df_hospital_material_with_cnae = _query_to_frame(conn, "hospital_material", "cnae")
    df_hospital_material_with_NOTcnae = _query_to_frame(conn, "hospital_material", "NOTcnae")

    df_others_with_cnae = _query_to_frame(conn, "others", "cnae")
    df_others_with_NOTcnae = _query_to_frame(conn, "others", "NOTcnae")

In [7]:
# One report line per query result; a trailing "\n" inside a line leaves a
# blank line between the medicine / hospital material / others groups.
shape_report = [
    f"Medicine with cnae and anvisa -> dataframe shape: {df_medicine_with_cnae_and_anvisa.shape}",
    f"Medicine with cnae and without anvisa -> dataframe shape: {df_medicine_with_cnae_and_NOTanvisa.shape}",
    f"Medicine without cnae and with anvisa -> dataframe shape: {df_medicine_with_NOTcnae_and_anvisa.shape}",
    f"Medicine without cnae and anvisa -> dataframe shape: {df_medicine_with_NOTcnae_and_NOTanvisa.shape}\n",
    f"Hospital material with cnae -> dataframe shape: {df_hospital_material_with_cnae.shape}",
    f"Hospital material without cnae -> dataframe shape: {df_hospital_material_with_NOTcnae.shape}\n",
    f"Others with cnae -> dataframe shape: {df_others_with_cnae.shape}",
    f"Others without cnae -> dataframe shape: {df_others_with_NOTcnae.shape}",
]
print("\n".join(shape_report))

Medicine with cnae and anvisa -> dataframe shape: (500000, 16)
Medicine with cnae and without anvisa -> dataframe shape: (500000, 16)
Medicine without cnae and with anvisa -> dataframe shape: (840, 16)
Medicine without cnae and anvisa -> dataframe shape: (11688, 16)

Hospital material with cnae -> dataframe shape: (500000, 16)
Hospital material without cnae -> dataframe shape: (141967, 16)

Others with cnae -> dataframe shape: (500000, 16)
Others without cnae -> dataframe shape: (500000, 16)


## Sampling products across different CNPJs

In [8]:
def get_products_from_all_cnpjs(df: pd.DataFrame, limit: int) -> pd.DataFrame:
    """Select up to ``limit`` rows, taking them round-robin across CNPJs.

    Rows are picked in rounds: the first row of every CNPJ (CNPJs in order of
    first appearance in ``df``), then the second row of every CNPJ, and so on,
    until ``limit`` rows have been collected.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``"cnpj"`` column. Its index may be of any kind; rows are
        selected by position, so labels need not be a default ``RangeIndex``.
    limit : int
        Maximum number of rows to return.

    Returns
    -------
    pd.DataFrame
        ``df`` itself when it already has at most ``limit`` rows; otherwise a
        new frame with exactly ``limit`` rows in round-robin order. A
        non-positive ``limit`` on a non-empty frame gives an empty frame.
    """
    if df.shape[0] <= limit:
        return df
    if limit <= 0:
        # Nothing can be selected; the previous loop never terminated here.
        return df.iloc[0:0].copy()

    # dropna=False keeps rows with a missing CNPJ as a group of their own, so
    # the requested number of rows can always be reached.
    by_cnpj = df.groupby("cnpj", sort=False, dropna=False)
    ranking = pd.DataFrame({
        # 0 for a CNPJ's first row, 1 for its second, ... (the "round").
        "round": by_cnpj.cumcount().to_numpy(),
        # Order in which each CNPJ first appears in df.
        "cnpj_order": by_cnpj.ngroup().to_numpy(),
    })
    # ranking has a default RangeIndex, so its index values are row positions
    # in df. Each (round, cnpj_order) pair is unique, so the order is fully
    # determined.
    positions = (
        ranking.sort_values(["round", "cnpj_order"]).index[:limit].to_numpy()
    )
    # Positional selection is correct for any index; .copy() lets callers add
    # columns to the result without touching (or warning about) the source.
    return df.iloc[positions].copy()

In [9]:
# Row limits per sample. NOTE(review): the reason for the 4194 limit is not
# visible in this notebook - confirm and document where it comes from.
MEDICINE_ROW_LIMIT = 2500
MEDICINE_CNAE_WITHOUT_ANVISA_ROW_LIMIT = 4194
NON_MEDICINE_ROW_LIMIT = 5000

df_medicine_with_cnae_and_anvisa = get_products_from_all_cnpjs(
    df=df_medicine_with_cnae_and_anvisa, limit=MEDICINE_ROW_LIMIT
)
print(f"Medicine with cnae and anvisa shape after filter from cnpjs: {df_medicine_with_cnae_and_anvisa.shape}")
df_medicine_with_cnae_and_NOTanvisa = get_products_from_all_cnpjs(
    df=df_medicine_with_cnae_and_NOTanvisa, limit=MEDICINE_CNAE_WITHOUT_ANVISA_ROW_LIMIT
)
print(f"Medicine with cnae and without anvisa shape after filter from cnpjs: {df_medicine_with_cnae_and_NOTanvisa.shape}")
df_medicine_with_NOTcnae_and_anvisa = get_products_from_all_cnpjs(
    df=df_medicine_with_NOTcnae_and_anvisa, limit=MEDICINE_ROW_LIMIT
)
print(f"Medicine without cnae and with anvisa shape after filter from cnpjs: {df_medicine_with_NOTcnae_and_anvisa.shape}")
df_medicine_with_NOTcnae_and_NOTanvisa = get_products_from_all_cnpjs(
    df=df_medicine_with_NOTcnae_and_NOTanvisa, limit=MEDICINE_ROW_LIMIT
)
print(f"Medicine without cnae and anvisa shape after filter from cnpjs: {df_medicine_with_NOTcnae_and_NOTanvisa.shape}\n")

df_hospital_material_with_cnae = get_products_from_all_cnpjs(
    df=df_hospital_material_with_cnae, limit=NON_MEDICINE_ROW_LIMIT
)
print(f"Hospital material with cnae shape after filter from cnpjs: {df_hospital_material_with_cnae.shape}")
df_hospital_material_with_NOTcnae = get_products_from_all_cnpjs(
    df=df_hospital_material_with_NOTcnae, limit=NON_MEDICINE_ROW_LIMIT
)
print(f"Hospital material without cnae shape after filter from cnpjs: {df_hospital_material_with_NOTcnae.shape}\n")

df_others_with_cnae = get_products_from_all_cnpjs(
    df=df_others_with_cnae, limit=NON_MEDICINE_ROW_LIMIT
)
print(f"Others with cnae shape after filter from cnpjs: {df_others_with_cnae.shape}")
df_others_with_NOTcnae = get_products_from_all_cnpjs(
    df=df_others_with_NOTcnae, limit=NON_MEDICINE_ROW_LIMIT
)
print(f"Others without cnae shape after filter from cnpjs: {df_others_with_NOTcnae.shape}")

Medicine with cnae and anvisa shape after filter from cnpjs: (2500, 16)
Medicine with cnae and without anvisa shape after filter from cnpjs: (4194, 16)
Medicine without cnae and with anvisa shape after filter from cnpjs: (840, 16)
Medicine without cnae and anvisa shape after filter from cnpjs: (2500, 16)

Hospital material with cnae shape after filter from cnpjs: (5000, 16)
Hospital material without cnae shape after filter from cnpjs: (5000, 16)

Others with cnae shape after filter from cnpjs: (5000, 16)
Others without cnae shape after filter from cnpjs: (5000, 16)


In [10]:
# Number of distinct cnae_fiscal values in each of the two medicine samples
# (dropna=False counts a missing value as a value, as unique() does).
(
    df_medicine_with_cnae_and_anvisa["cnae_fiscal"].nunique(dropna=False),
    df_medicine_with_cnae_and_NOTanvisa["cnae_fiscal"].nunique(dropna=False),
)

(20, 57)

## Saving data

* 0 -> Medicine;
* 1 -> Hospital Material;
* 2 -> Others.

In [11]:
# Label every sample with its product type code:
# 0 -> medicine, 1 -> hospital material, 2 -> others.
frames_by_product_type = (
    (0, (
        df_medicine_with_cnae_and_anvisa,
        df_medicine_with_cnae_and_NOTanvisa,
        df_medicine_with_NOTcnae_and_anvisa,
        df_medicine_with_NOTcnae_and_NOTanvisa,
    )),
    (1, (
        df_hospital_material_with_cnae,
        df_hospital_material_with_NOTcnae,
    )),
    (2, (
        df_others_with_cnae,
        df_others_with_NOTcnae,
    )),
)
for product_type, sample_frames in frames_by_product_type:
    for sample_frame in sample_frames:
        sample_frame["tipo_produto"] = product_type

In [12]:
# Samples to stack, medicine first, then hospital material, then others.
frames_to_merge = [
    df_medicine_with_cnae_and_anvisa,
    df_medicine_with_cnae_and_NOTanvisa,
    df_medicine_with_NOTcnae_and_anvisa,
    df_medicine_with_NOTcnae_and_NOTanvisa,
    df_hospital_material_with_cnae,
    df_hospital_material_with_NOTcnae,
    df_others_with_cnae,
    df_others_with_NOTcnae,
]
# axis=0 concatenates rows; the index is renumbered before fully identical
# rows are dropped.
stacked = pd.concat(frames_to_merge, axis=0, ignore_index=True)
df_merged = stacked.drop_duplicates()
print(f"Shape full DataFrame: {df_merged.shape}")

Shape full DataFrame: (30034, 17)


In [13]:
# Count the missing values in every column of the merged frame.
df_merged.isna().sum()

id_produto                         0
codigo_cfop                        0
codigo_cest                    18209
codigo_ncm                        23
codigo_ean                      3496
descricao                          0
unidade                            0
id_medicamento                     0
cod_anvisa                     26424
id_combustivel                     0
codigo_anp                     29483
cnpj                               0
razao_social                       0
nome_da_atividade_economica       56
cnae_fiscal                        0
cnae_secundaria                    0
tipo_produto                       0
dtype: int64

In [14]:
df_merged.dtypes # display the dtype of every column in the merged frame

id_produto                      int32
codigo_cfop                    string
codigo_cest                    string
codigo_ncm                     string
codigo_ean                     string
descricao                      string
unidade                        string
id_medicamento                  int32
cod_anvisa                     string
id_combustivel                  int32
codigo_anp                     string
cnpj                           string
razao_social                   string
nome_da_atividade_economica    string
cnae_fiscal                     int32
cnae_secundaria                string
tipo_produto                    int64
dtype: object

In [15]:
# Order rows by id_produto (ascending) and export them as a
# semicolon-separated CSV without the index column.
df_merged = df_merged.sort_values("id_produto")
df_merged.to_csv("data.csv", index=False, sep=";")