In [1]:
import pandas as pd
import numpy as np
import re
from unidecode import unidecode
import os
import datetime
from tqdm import tqdm
import glob

def read_file(file):
    """
    read file line by line
    """
    lines = [line.rstrip() for line in open(file, 'r', encoding='ISO-8859-15')]
    return lines

def find_idx(lines, substring):
    """
    find the line indexes containing a substring
    """
    idx = []
    count = 0
    for line in lines:
        if substring in line:
            idx.append(count)
        count += 1
    return idx

def keep_cols_by_name(df, substring):
    df = df.filter(regex=substring)
    return df


def unstack_double_header(df, col_index):
    """
    transform df with double headers into single header
    """

    # Unstack two headers
    cols = df.iloc[col_index][1::].unique()
    cols = (list(set([col.strip() for col in cols])))

    result = pd.DataFrame()
    for col in tqdm(cols):

        # get the indexes of the desired columns
        idx = np.where(df.iloc[0] == col)[0]
        # insert first column (key)
        idx = np.insert(idx, 0, 0, axis=0)

        # filter just the desired columns and removes lines from headers
        df_aux = df.loc[2::,idx]
        df_aux.columns = df.loc[1, idx]

        df_aux["new_col"] = col
        result = pd.concat([result, df_aux])

    return result


def dont_unstack_double_header(df):
    df.iloc[:,0] = df.iloc[:,0].replace(r'^\s*$', np.nan, regex=True)
    df.iloc[:,0] = df.iloc[:,0].ffill()

    new_header = df.iloc[0] #grab the first row for the header
    df = df[1:] #take the data less the header row
    df.columns = new_header #set the header row as the df header
    return df


def stack_same_tables_in_same_file(idxs, lines):
    result = pd.DataFrame()
    for i in range(1, len(idxs)):
        fst_idx=idxs[i-1]+2
        last_idx=idxs[i]-1

        df = pd.DataFrame([line.split(";") for line in lines[fst_idx:last_idx]])

        #aux = unstack_double_header(df, 0)
        aux = dont_unstack_double_header(df)        
        aux["table"] = lines[fst_idx-2]
        result = pd.concat([result, aux])

        print(f"\t Time: {datetime.datetime.now().strftime('%Y%m%d %H:%M:%S')} - Iteraction: {i} - First idx = {fst_idx} - Last idx = {last_idx} - Shape before unstack: {df.shape} - Shape after unstack: {aux.shape} - Shape after append: {result.shape} ")
    return result

def df_janitor(df):

    # Correct new columns added names
    df = df.rename({"table":"Sexo"}, axis='columns')

    # adjust content of the lines of the column Faixa Etaria
    #df["Faixa Etaria"] = [re.sub(r'^\s*\d+:', '', re.sub(r'\.\d+\s*$', '', s)) for s in df["Faixa Etaria"]]

    # adjust content of the lines of the column Sexo
    df["Sexo"] = [re.sub("Sexo Trabalhador = ", '', s) for s in df["Sexo"]]
    df["Sexo"] = [re.sub(r'^\s*\d+:', '', re.sub(r'\.\d+\s*$', '', s)) for s in df["Sexo"]]

    # remove unwanted lines
    to_remove = ['{ñ class}', 'Total', '', 'Seleções vigentes', 'Variável', 'Ano', 'Vínculo Ativo 31/12']
    df = df[~df.iloc[:,0].isin(to_remove)]
    df = df[~df.iloc[:,1].isin(to_remove)]
    #df = df[~df["Faixa Etaria"].isin(to_remove)]
    df = df[~df["Sexo"].isin(to_remove)]

    # transform string nan to effective nan
    df = df.replace("NaN", np.nan)

    # adjust columns names
    df.columns = [re.sub(r'^\s*\d+:', '', re.sub(r'\.\d+\s*$', '', coluna)) for coluna in df.columns]

    # adjust line names
    df.iloc[:,0] = df.iloc[:,0].str.split(':').str[1]
    df.iloc[:,1] = df.iloc[:,1].str.split(':').str[1]

    #apagar todas as colunas com total no nome
    df = df.drop(df.filter(like='Total').columns, axis=1)
    df = df.drop(df.filter(like='{ñ class}').columns, axis=1)

    # remove special chars
    df.columns = [re.sub(r'^\s*\d+:', '', re.sub(r'\.\d+\s*$', '', coluna)) for coluna in df.columns]
    df.columns = [re.sub(r'[^a-zA-Z0-9]+', '_', unidecode(coluna.lower())) for coluna in df.columns]


    return df

def sexoXcboXcbo_grupo(file):
    lines = read_file(file)
    substring = "Sexo Trabalhador = "
    idxs = find_idx(lines, substring)
    # adding the last line of the file (+1 to be ok with the logic below)
    idxs.append(len(lines)+1)

    # read the file and stack the tables
    result = stack_same_tables_in_same_file(idxs, lines)

    # clean the output
    df = df_janitor(result)

    df["arquivo"] = os.path.split(file)[1]

    return df

In [2]:

path = os.getcwd()
extension = 'csv'
os.chdir(path)
files_csv = glob.glob('*.{}'.format(extension))

#path = '/Users/bolsolui/Documents/personal/MECAI/dados/RAIS-video/Rais_vinculo/escolaridade_sexo_salario'
#extension = 'csv'
#os.chdir(path)
#files_csv = glob.glob('*.{}'.format(extension))

df_final = pd.DataFrame()
for i in files_csv:
    aux = sexoXcboXcbo_grupo(i)
    df_final = pd.concat([df_final, aux])
    print(f"Time: {datetime.datetime.now().strftime('%Y%m%d %H:%M:%S')} - File {i}")

df_final.to_csv("sexoXcboXcbo_grupo.csv")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aux["table"] = lines[fst_idx-2]


	 Time: 20240217 11:45:55 - Iteraction: 1 - First idx = 2 - Last idx = 321 - Shape before unstack: (319, 609) - Shape after unstack: (318, 610) - Shape after append: (318, 610) 
	 Time: 20240217 11:45:55 - Iteraction: 2 - First idx = 324 - Last idx = 643 - Shape before unstack: (319, 609) - Shape after unstack: (318, 610) - Shape after append: (636, 610) 
	 Time: 20240217 11:45:55 - Iteraction: 3 - First idx = 646 - Last idx = 971 - Shape before unstack: (325, 609) - Shape after unstack: (324, 610) - Shape after append: (960, 610) 
Time: 20240217 11:45:55 - File sexoXcboXcbo_grupoX2015.csv
	 Time: 20240217 11:45:55 - Iteraction: 1 - First idx = 2 - Last idx = 321 - Shape before unstack: (319, 609) - Shape after unstack: (318, 610) - Shape after append: (318, 610) 
	 Time: 20240217 11:45:55 - Iteraction: 2 - First idx = 324 - Last idx = 643 - Shape before unstack: (319, 609) - Shape after unstack: (318, 610) - Shape after append: (636, 610) 
	 Time: 20240217 11:45:55 - Iteraction: 3 - F