In [1]:
import pandas as pd

# Load the full cleaned dataset into a DataFrame. The location is a hard-coded
# absolute path, so this only works where that exact file exists.
df = pd.read_csv('/Users/marcelosilva/Desktop/projectOne/3/D-Variable Analysis/CleanDataset.csv')

In [None]:
import pandas as pd
import re
import os

def parse_variable_descriptions(file_path):
    """
    Read a description file and map each variable name to its description.

    Every non-blank line is split once, using the first separator that
    applies: a tab, then the literal ' - ', then any run of whitespace.
    Lines that hold a single token are skipped, and a repeated name keeps
    the description from its last occurrence.

    Returns the resulting dict, or None (after printing a message) when the
    file does not exist or any other error is raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()

        print(f"‚úÖ Arquivo carregado: {file_path}")

        # variable name -> description
        described = {}

        for raw_line in raw_text.strip().split('\n'):
            entry = raw_line.strip()
            if not entry:
                continue

            if '\t' in entry:
                # Pattern 1: name<TAB>description
                name, _, text = entry.partition('\t')
            elif ' - ' in entry:
                # Pattern 2: name - description
                name, _, text = entry.partition(' - ')
            else:
                # Pattern 3: name, whitespace, description
                tokens = entry.split()
                if len(tokens) < 2:
                    continue
                name = tokens[0]
                text = ' '.join(tokens[1:])

            described[name.strip()] = text.strip()

        print(f"üìä Vari√°veis extra√≠das: {len(described)}")

        # Show the first few entries as a sanity check
        print(f"\nüìã Amostra das vari√°veis encontradas:")
        for name, text in list(described.items())[:5]:
            print(f"   {name} ‚Üí {text}")

        return described

    except FileNotFoundError:
        print(f"‚ùå Arquivo n√£o encontrado: {file_path}")
        return None
    except Exception as e:
        print(f"‚ùå Erro ao carregar arquivo: {e}")
        return None

def load_clean_dataset_columns(file_path):
    """
    Return the column names of the CSV at file_path as a list.

    Only the header row is parsed (nrows=0), so no data rows are loaded.
    On any error a message is printed and None is returned.
    """
    try:
        header_only = pd.read_csv(file_path, nrows=0)  # header only
        names = header_only.columns.tolist()
        print(f"‚úÖ CleanDataset colunas carregadas: {len(names)} vari√°veis")
    except Exception as e:
        print(f"‚ùå Erro ao carregar CleanDataset: {e}")
        return None
    else:
        return names

def create_simple_crossref(variables_dict, dataset_columns, output_path):
    """
    Cross-reference dataset columns with their descriptions and write reports.

    Args:
        variables_dict: mapping of variable name -> description (str).
        dataset_columns: list of column names; output order follows this list.
        output_path: directory that receives the reports; created if missing.

    Three files are written into output_path:
        - variables_for_feature_engineering.txt (tab-separated lines)
        - variables_for_feature_engineering_simple.csv
        - variables_for_feature_engineering_simple.md (table plus the list of
          columns that have no description)

    Returns:
        A list of {'variable': ..., 'description': ...} dicts, one per column
        of dataset_columns that has an entry in variables_dict.
    """
    print(f"\nüîç CRUZAMENTO SIMPLES")
    print("=" * 30)

    # Columns that have a description, in dataset order
    available_variables = [
        {'variable': var_name, 'description': variables_dict[var_name]}
        for var_name in dataset_columns
        if var_name in variables_dict
    ]
    # Columns present in the dataset but missing from the description file
    variables_without_desc = [var for var in dataset_columns if var not in variables_dict]

    print(f"‚úÖ Vari√°veis com descri√ß√£o no CleanDataset: {len(available_variables)}")

    os.makedirs(output_path, exist_ok=True)

    # 1. Plain TXT file
    txt_path = os.path.join(output_path, "variables_for_feature_engineering.txt")

    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write("VARI√ÅVEIS DISPON√çVEIS PARA FEATURE ENGINEERING\n")
        f.write("=" * 50 + "\n\n")

        for item in available_variables:
            f.write(f"{item['variable']}\t{item['description']}\n")

    # 2. Plain CSV file
    csv_path = os.path.join(output_path, "variables_for_feature_engineering_simple.csv")

    # Name the columns explicitly: with an empty list pandas cannot infer them
    # and the CSV would be written without its header row.
    df_simple = pd.DataFrame(available_variables, columns=['variable', 'description'])
    df_simple.to_csv(csv_path, index=False)

    # 3. Markdown report
    md_path = os.path.join(output_path, "variables_for_feature_engineering_simple.md")

    with open(md_path, 'w', encoding='utf-8') as f:
        f.write("# Variables for Feature Engineering\n\n")
        f.write(f"## Available Variables ({len(available_variables)})\n\n")
        f.write("| Variable | Description |\n")
        f.write("|----------|-------------|\n")

        for item in available_variables:
            # A raw '|' would end the table cell, so escape it in both columns
            var_clean = item['variable'].replace('|', '\\|')
            desc_clean = item['description'].replace('|', '\\|')
            f.write(f"| `{var_clean}` | {desc_clean} |\n")

        f.write("\n## Variables Without Description\n\n")

        if variables_without_desc:
            f.write("The following variables are in CleanDataset but lack descriptions:\n\n")
            for var in variables_without_desc:
                f.write(f"- `{var}`\n")
        else:
            f.write("All variables in CleanDataset have descriptions! ‚úÖ\n")

    print(f"\nüìÑ Arquivos criados:")
    print(f"   üìù TXT: {txt_path}")
    print(f"   üìä CSV: {csv_path}")
    print(f"   üìã MD: {md_path}")

    # Echo the result on screen
    print(f"\nüìã RESULTADO DO CRUZAMENTO:")
    print("=" * 50)

    for item in available_variables:
        print(f"{item['variable']}\t{item['description']}")

    return available_variables

def main():
    """
    Run the cross-reference end to end.

    Loads the variable descriptions and the dataset column names from
    hard-coded absolute paths, then writes the reports. Stops early, after
    printing an error, when either input could not be loaded.
    """
    print("CRUZAMENTO SIMPLES: VARI√ÅVEL + DESCRI√á√ÉO")
    print("=" * 50)

    # Input and output locations
    descriptions_file = "/Users/marcelosilva/Desktop/projectOne/1/D-Data Characteristics/variaveis_descritas_90pct.txt"
    dataset_file = "/Users/marcelosilva/Desktop/projectOne/3/D-Variable Analysis/CleanDataset.csv"
    reports_dir = "/Users/marcelosilva/Desktop/projectOne/4"

    # Both loaders run before the check, so each one reports its own outcome
    descriptions = parse_variable_descriptions(descriptions_file)
    columns = load_clean_dataset_columns(dataset_file)

    if any(loaded is None for loaded in (descriptions, columns)):
        print("‚ùå Erro ao carregar arquivos!")
        return

    matched = create_simple_crossref(descriptions, columns, reports_dir)

    print(f"\nüéâ CONCLU√çDO!")
    print(f"üìä {len(matched)} vari√°veis dispon√≠veis para feature engineering")
    print(f"üìÅ Arquivos salvos em: {reports_dir}")

# Entry point: call main() when this module is the __main__ module.
if __name__ == "__main__":
    main()