In [1]:
import pandas as pd

# Absolute location of the input CSV that the rest of the notebook works on
COMPLETE_CASES_CSV = '/Users/marcelosilva/Desktop/projectOne/3/C-Variable Analysis/complete_cases_dataset.csv'

# Load the file into the notebook-wide DataFrame `df`
df = pd.read_csv(COMPLETE_CASES_CSV)

In [2]:
# Report the dataset dimensions as a (rows, columns) tuple
print(f"Shape of the dataset: {df.shape}")

# Count the NaN entries of every column and print them under a header line
nan_counts = df.isna().sum()
print("\nNumber of NaN values in each column:", nan_counts, sep="\n")

Shape of the dataset: (4339, 24)

Number of NaN values in each column:
id_anon                   0
b02_sexo                  0
b04_idade                 0
bb04_idade_da_mae         0
d01_cor                   0
h01_semanas_gravidez      0
h02_peso                  0
h03_altura                0
h04_parto                 0
j03_cor                   0
k04_prenatal_semanas      0
k05_prenatal_consultas    0
k06_peso_engravidar       0
k07_peso_final            0
k08_quilos                0
k12_tempo                 0
k13_tempo_medida          0
k15_recebeu               0
k16_liquido               0
k18_somente               0
k19_somente_medida        0
t05_altura_medida1        0
t06_altura_medida2        0
vd_zimc                   0
dtype: int64



# Data Cleaning: Handling Missing Responses in Categorical Variables

In this step, we remove every row in which a categorical variable holds a "Don't know/No response" answer (`Não sabe/Não quis responder`), so that only valid responses remain. This cleaning process is essential to:

- Improve data quality
- Focus on valid responses
- Reduce noise in our analysis
- Prepare the data for meaningful statistical analysis

The categorical variables that will be cleaned are:
- b02_sexo (Sex)
- d01_cor (Mother's skin color)
- h04_parto (Type of delivery)
- j03_cor (Child's skin color)
- k13_tempo_medida (Time measure unit)
- k15_recebeu (Received)
- k16_liquido (Liquid)
- k19_somente_medida (Only measure)

In [3]:
# Columns stored with the generic `object` dtype are treated as categorical
categorical_columns = df.select_dtypes(include=['object']).columns

# Print one frequency table (absolute count + share in percent) per column
for column in categorical_columns:
    absolute = df[column].value_counts()
    relative = df[column].value_counts(normalize=True) * 100

    # Both series share the same category index, so they line up row by row
    summary = pd.DataFrame({'Count': absolute, 'Percentage': relative})

    print(f"\nDistribution for {column}:")
    print(summary)


Distribution for b02_sexo:
           Count  Percentage
b02_sexo                    
Masculino   2207   50.864254
Feminino    2132   49.135746

Distribution for d01_cor:
                                                    Count  Percentage
d01_cor                                                              
Parda (mulata, cabocla, cafuza, mameluca ou mes...   2267   52.247062
Branca                                               1753   40.401014
Preta                                                 292    6.729661
Amarela (origem japonesa, chinesa, coreana etc.)       19    0.437889
Ind√≠gena                                                8    0.184374

Distribution for h04_parto:
                                      Count  Percentage
h04_parto                                              
Normal                                 2179   50.218944
Cesariana de urg√™ncia (N√£o agendada)   1142   26.319428
Cesariana agendada (eletiva)           1018   23.461627

Distribution for j03_cor:


In [4]:

import os
import re

import numpy as np

def create_clean_dataset(df, output_path):
    """
    Create CleanDataset.csv by dropping every row that contains a
    "don't know / refused to answer" marker in any categorical column.

    Categorical columns are the ones with ``object`` dtype. A row is dropped
    when at least one of those columns contains one of the marker strings
    (case-insensitive, literal substring match). The input frame is never
    modified; progress and a quality report are printed along the way.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to clean.
    output_path : str
        Directory that receives ``CleanDataset.csv``; created if missing.

    Returns
    -------
    pandas.DataFrame or None
        The cleaned frame (original index labels kept), or ``None`` when
        writing the CSV failed.
    """
    # Make sure the destination directory exists before anything is written
    os.makedirs(output_path, exist_ok=True)

    print("CRIANDO CLEANDATASET.CSV")
    print("=" * 40)

    initial_count = len(df)
    print(f"üìä Dataset inicial: {initial_count:,} pacientes")

    # Work on a copy so the caller's frame stays untouched
    df_clean = df.copy()

    # Marker strings that encode a missing categorical answer
    patterns_to_remove = [
        'N√£o sabe/N√£o quis responder',
        'N√£o sabe',
        'N√£o quis responder',
        'NS/NR'
    ]

    print(f"\nüîç Procurando padr√µes de missing categ√≥rico:")

    # Positional boolean mask of the rows to drop. Working by position rather
    # than by index label means a non-unique index cannot cause valid rows
    # that merely share a label with a flagged row to be dropped too.
    rows_to_remove = np.zeros(len(df_clean), dtype=bool)

    categorical_cols = df_clean.select_dtypes(include=['object']).columns

    for col in categorical_cols:
        found_patterns = []

        for pattern in patterns_to_remove:
            # regex=False: the markers are literal text, not regular expressions
            mask = df_clean[col].str.contains(
                pattern, case=False, na=False, regex=False
            ).to_numpy(dtype=bool)
            match_count = int(mask.sum())

            if match_count > 0:
                # OR-ing the masks counts a row once even when several
                # overlapping markers (or several columns) flag it
                rows_to_remove |= mask
                found_patterns.append(f"'{pattern}': {match_count}")

        if found_patterns:
            print(f"   {col}: {', '.join(found_patterns)}")

    total_unique_removals = int(rows_to_remove.sum())

    if total_unique_removals == 0:
        print("‚úÖ Nenhum caso com missing categ√≥rico encontrado!")
        df_final = df_clean
    else:
        print(f"\n‚úÇÔ∏è Removendo casos:")
        print(f"   Total de casos √∫nicos a remover: {total_unique_removals}")

        # Keep only the rows that no marker flagged
        df_final = df_clean.loc[~rows_to_remove]

        print(f"   Casos removidos: {initial_count - len(df_final)}")
        print(f"   Casos restantes: {len(df_final):,}")

    # Final check: re-scan the result to confirm no marker survived
    print(f"\nüî¨ Verifica√ß√£o final:")
    remaining_missing = 0

    for col in df_final.select_dtypes(include=['object']).columns:
        for pattern in patterns_to_remove:
            remaining_missing += int(
                df_final[col].str.contains(
                    pattern, case=False, na=False, regex=False
                ).sum()
            )

    if remaining_missing == 0:
        print("‚úÖ Confirmado: Nenhum missing categ√≥rico restante!")
    else:
        print(f"‚ö†Ô∏è Ainda existem {remaining_missing} missing categ√≥ricos!")

    filename = "CleanDataset.csv"
    full_path = os.path.join(output_path, filename)

    print(f"\nüíæ Salvando CleanDataset.csv:")
    print(f"üìÅ {full_path}")

    # Only the write itself is guarded, so a problem in the report below can
    # no longer be misreported as a save failure.
    try:
        df_final.to_csv(full_path, index=False)
        print("‚úÖ CleanDataset.csv salvo com sucesso!")
        file_size = os.path.getsize(full_path) / (1024 * 1024)  # MB
    except Exception as e:
        print(f"‚ùå Erro ao salvar CleanDataset.csv: {e}")
        return None

    # NaN count over all columns of the cleaned frame
    total_nulls = int(df_final.isnull().sum().sum())

    print(f"\nüìä CLEANDATASET.CSV - INFORMA√á√ïES:")
    print("=" * 40)
    print(f"üìà Linhas: {len(df_final):,}")
    print(f"üìà Colunas: {len(df_final.columns)}")
    print(f"üìà Tamanho do arquivo: {file_size:.2f} MB")
    print(f"üìà Missing values num√©ricos: {total_nulls}")
    print(f"üìà Missing values categ√≥ricos: {remaining_missing}")

    # Retention rate; an empty input lost nothing, so report 100% instead of
    # dividing by zero
    if initial_count:
        retention_rate = (len(df_final) / initial_count) * 100
    else:
        retention_rate = 100.0
    print(f"üìà Taxa de reten√ß√£o: {retention_rate:.2f}%")

    print(f"\nüèÜ QUALIDADE DO DATASET:")
    if total_nulls == 0 and remaining_missing == 0:
        print("‚úÖ DATASET COMPLETAMENTE LIMPO!")
        print("   - 0 missing values num√©ricos")
        print("   - 0 missing values categ√≥ricos")
        print("   - 100% dados genu√≠nos")
    else:
        print("‚ö†Ô∏è Dataset ainda tem alguns problemas")

    # Final distribution of the two variables listed here
    problem_vars = ['k15_recebeu', 'k16_liquido']

    print(f"\nüìã Distribui√ß√£o final das vari√°veis corrigidas:")
    for var in problem_vars:
        if var in df_final.columns:
            print(f"\n{var}:")
            for value, count in df_final[var].value_counts().items():
                pct = (count / len(df_final)) * 100
                print(f"   {value}: {count:,} ({pct:.1f}%)")

    return df_final

def validate_clean_dataset(df_clean):
    """
    Print a validation report for the cleaned dataset and return a verdict.

    The report covers the shape, the number of numeric and categorical
    (``object`` dtype) columns, the total NaN count, a per-column summary of
    the categorical variables, a scan for leftover missing-value codes and
    the observation counts of the numeric columns.

    Missing-value codes are matched as whole tokens (not preceded or followed
    by a word character), case-insensitively, so a short code such as ``NS``
    is not triggered by ordinary words that merely contain those letters.

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Dataset to validate.

    Returns
    -------
    bool
        ``True`` when the frame has no NaN values and no suspicious code was
        found in any categorical column.
    """
    print(f"\nüî¨ VALIDA√á√ÉO COMPLETA DO CLEANDATASET")
    print("=" * 50)

    # 1. Shape
    print(f"üìä Shape: {df_clean.shape}")

    # 2. Column types
    numeric_df = df_clean.select_dtypes(include=[np.number])
    categorical_cols = df_clean.select_dtypes(include=['object']).columns
    print(f"üìä Tipos de vari√°veis:")
    print(f"   Num√©ricas: {len(numeric_df.columns)}")
    print(f"   Categ√≥ricas: {len(categorical_cols)}")

    # 3. Missing values across the whole frame
    total_missing = int(df_clean.isnull().sum().sum())
    print(f"üìä Missing values: {total_missing}")

    # 4. Cardinality and most frequent value of each categorical column
    print(f"\nüìã Resumo das vari√°veis categ√≥ricas:")
    for col in categorical_cols:
        unique_count = df_clean[col].nunique()
        modes = df_clean[col].mode()
        # mode() is empty when the column has no non-null value
        most_frequent = modes.iloc[0] if len(modes) > 0 else "N/A"
        print(f"   {col}: {unique_count} categorias √∫nicas, mais frequente: '{most_frequent}'")

    # 5. Leftover missing-value codes
    suspicious_patterns = ['999', '99', 'N√£o sabe', 'NS', 'NR', 'Missing']

    found_suspicious = False
    for col in categorical_cols:
        values = df_clean[col].astype(str)
        for pattern in suspicious_patterns:
            # Whole-token match: the code must not be glued to a word
            # character on either side ("NS/NR" matches, "Transporte" does not)
            token_regex = rf"(?<!\w){re.escape(pattern)}(?!\w)"
            if values.str.contains(token_regex, case=False, na=False, regex=True).any():
                print(f"‚ö†Ô∏è Padr√£o suspeito '{pattern}' encontrado em {col}")
                found_suspicious = True

    if not found_suspicious:
        print(f"‚úÖ Nenhum padr√£o suspeito encontrado!")

    # 6. Basic statistics, restricted to the numeric columns so the figures
    #    match their label and a frame without numeric columns is handled
    print(f"\nüìä Estat√≠sticas b√°sicas das vari√°veis num√©ricas:")
    print(f"   Vari√°veis num√©ricas: {len(numeric_df.columns)}")
    if len(numeric_df.columns) > 0:
        observation_counts = numeric_df.describe().loc['count']
        print(f"   Observa√ß√µes por vari√°vel: {observation_counts.min():.0f} - {observation_counts.max():.0f}")

    return bool(total_missing == 0 and not found_suspicious)

# RUN THE CLEANDATASET CREATION
# =============================

# Directory that receives CleanDataset.csv
output_directory = "/Users/marcelosilva/Desktop/projectOne/3/D-Variable Analysis"

print("PROCESSO DE CRIA√á√ÉO DO CLEANDATASET.CSV")
print("=" * 60)

# Build and save the cleaned dataset; None signals that saving failed
clean_dataset = create_clean_dataset(df, output_directory)

if clean_dataset is None:
    print("‚ùå Falha na cria√ß√£o do CleanDataset.csv")
else:
    # Validate the dataset that was just created
    is_valid = validate_clean_dataset(clean_dataset)

    if not is_valid:
        print(f"\n‚ö†Ô∏è CleanDataset criado mas precisa de revis√£o!")
    else:
        success_lines = (
            f"\nüéâ CLEANDATASET.CSV CRIADO COM SUCESSO!",
            f"üìÇ Arquivo: CleanDataset.csv",
            f"üìç Localiza√ß√£o: {output_directory}",
            f"üìä Shape final: {clean_dataset.shape}",
            f"üèÜ Qualidade: 100% dados limpos e completos!",
        )
        for success_line in success_lines:
            print(success_line)

    # Show the first rows of the final dataset
    print(f"\nüìã AMOSTRA DO CLEANDATASET (5 primeiras linhas):")
    print("=" * 60)

    # Restrict the preview to the representative columns that actually exist
    sample_cols = ['id_anon', 'b02_sexo', 'd01_cor', 'h04_parto', 'k15_recebeu', 'k16_liquido']
    available_cols = [col for col in sample_cols if col in clean_dataset.columns]

    if len(available_cols) > 0:
        preview = clean_dataset[available_cols].head()
        print(preview)

PROCESSO DE CRIA√á√ÉO DO CLEANDATASET.CSV
CRIANDO CLEANDATASET.CSV
üìä Dataset inicial: 4,339 pacientes

üîç Procurando padr√µes de missing categ√≥rico:
   k15_recebeu: 'N√£o sabe/N√£o quis responder': 35, 'N√£o sabe': 35, 'N√£o quis responder': 35
   k16_liquido: 'N√£o sabe/N√£o quis responder': 31, 'N√£o sabe': 31, 'N√£o quis responder': 31

‚úÇÔ∏è Removendo casos:
   Total de casos √∫nicos a remover: 52
   Casos removidos: 52
   Casos restantes: 4,287

üî¨ Verifica√ß√£o final:
‚úÖ Confirmado: Nenhum missing categ√≥rico restante!

üíæ Salvando CleanDataset.csv:
üìÅ /Users/marcelosilva/Desktop/projectOne/3/D-Variable Analysis/CleanDataset.csv
‚úÖ CleanDataset.csv salvo com sucesso!

üìä CLEANDATASET.CSV - INFORMA√á√ïES:
üìà Linhas: 4,287
üìà Colunas: 24
üìà Tamanho do arquivo: 0.81 MB
üìà Missing values num√©ricos: 0
üìà Missing values categ√≥ricos: 0
üìà Taxa de reten√ß√£o: 98.80%

üèÜ QUALIDADE DO DATASET:
‚úÖ DATASET COMPLETAMENTE LIMPO!
   - 0 missing values num√©rico