In [9]:
import re
import os
import glob
import numpy as np
import pandas as pd
from unidecode import unidecode
import sys
import os


# Make the directory two levels above the working directory importable, so that
# project-level modules can be imported by the cells below.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir)))



In [10]:
from utils import remove_invalid_rows, clean_column_names,initial_cleaning, create_gender_df,final_adjustments

In [11]:
# Lift pandas' display limits (None = unlimited) so frames are shown with every
# row and every column when inspected.
pd.options.display.max_rows = None
pd.options.display.max_columns = None


In [17]:
"""
In this section, the script walks the "files" directory, applies all required data transformations to every yearly CSV file it finds, and consolidates the processed information into a single file per folder (combining all the years found in that folder). This final dataset serves as the foundation for the subsequent phases of the project.
"""



if __name__ == '__main__':
    # Walk the "files" directory that sits next to the current working
    # directory's parent, clean every "<name>_<year>.csv" found in each folder,
    # split it by gender, and save one consolidated "<name>.csv" per folder
    # (written into that same folder).

    # find all the files to loop
    current_dir = os.getcwd()
    base_dir = os.path.dirname(current_dir)  # Moves up one level
    files_dir = os.path.join(base_dir, "files")

    # A non-empty path string is always truthy, so the directory itself must be
    # checked for existence.
    if not os.path.isdir(files_dir):
        print("Error: 'files' directory not found.")
    else:
        # loop through every folder below files_dir
        for root, dirs, files in os.walk(files_dir):
            final_file_name = ""
            df_final = pd.DataFrame()
            print(f"Processing directory: {root}")

            # Glob with an explicit path instead of os.chdir(root): changing the
            # working directory would leak out of this cell and make a re-run
            # resolve base_dir from the wrong place. glob.escape protects
            # folder names containing glob metacharacters.
            csv_pattern = os.path.join(glob.escape(root), "*.csv")
            files_csv = [os.path.basename(path) for path in glob.glob(csv_pattern)]

            # Frames are collected and concatenated once, which avoids
            # re-copying the accumulated data on every iteration.
            frames = []
            for file in files_csv:
                # retrieve filename prefix and infer the year from the name
                prefix_match = re.search(r'^[^0-9]+', file)
                year_match = re.search(r'\d{4}', file)
                if prefix_match is None or year_match is None:
                    # Typically the consolidated output of a previous run
                    # ("<name>.csv"), which lives in the same folder and has
                    # no year in its name.
                    print(f"Skipping file without a name/year pattern: {file}")
                    continue

                print(f"Processing file: {file}")
                # The output name is taken from the last processed file of the folder.
                final_file_name = prefix_match.group().rstrip('_')
                year = year_match.group()

                # process the file: read, clean, adjust by gender
                df = pd.read_csv(os.path.join(root, file), sep=";", encoding='latin-1',
                                 skiprows=1, header=[0])
                df = initial_cleaning(df)
                df_female = create_gender_df(df, year, "Feminino")
                df_male = create_gender_df(df, year, "Masculino")
                frames.extend([df_female, df_male])

            if frames:
                # final adjustments
                df_final = final_adjustments(pd.concat(frames))

                print(f"Saving final file: {final_file_name}.csv")
                df_final.to_csv(os.path.join(root, f'{final_file_name}.csv'), index=False)
                print(f"Processing completed for: {final_file_name}")
