In [13]:
import pandas as pd
import xlrd
import psycopg2
from sqlalchemy import create_engine
import boto3
import botocore
import requests
import os

## CLEANING AND UPLOADING FILES TO NEW BUCKET

In [22]:
# Clean the yearly ENEM microdata CSVs and upload a trimmed copy of each.
#
# For every year in `years`: download the raw CSV from the source bucket, keep
# a fixed subset of columns, rename them, write the result to a local CSV and
# PUT that file to the destination bucket. A year whose download fails is
# reported and skipped, so a missing or stale local file is never processed.
# The local files are left on disk after the loop.

# Define the S3 bucket URLs
source_bucket_url = 'https://inep.s3.amazonaws.com/'
destination_bucket_url = 'https://inep-cleaned.s3.amazonaws.com/'

# List of years for your files
years = ['2020', '2021', '2022']

# Seconds to wait for a connection / for the next bytes before giving up,
# so a stalled request raises instead of hanging the notebook forever.
request_timeout = 60

# Size of each streamed download chunk written to disk (1 MiB).
download_chunk_size = 1024 * 1024

# Columns read from the raw file (loop-invariant, so defined once).
columns_to_keep = [
    'NU_INSCRICAO', 'NU_ANO', 'TP_FAIXA_ETARIA', 'TP_SEXO', 'TP_ESTADO_CIVIL', 'TP_COR_RACA',
    'TP_NACIONALIDADE', 'TP_ST_CONCLUSAO', 'TP_ANO_CONCLUIU', 'TP_ESCOLA', 'TP_ENSINO',
    'IN_TREINEIRO', 'CO_MUNICIPIO_PROVA', 'NO_MUNICIPIO_PROVA', 'CO_UF_PROVA', 'SG_UF_PROVA',
    'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_MT', 'NU_NOTA_REDACAO'
]

# Mapping from the raw column names to the names used in the cleaned file.
# NOTE(review): 'etinicity_code' and 'concluion_year' look like typos of
# 'ethnicity_code' / 'conclusion_year'; they are left unchanged because later
# cells or existing tables may already depend on these exact names.
column_renames = {
    'NU_INSCRICAO': 'id', 'NU_ANO': 'year', 'TP_FAIXA_ETARIA': 'age_code', 'TP_SEXO': 'sex_code',
    'TP_ESTADO_CIVIL': 'civil_code', 'TP_COR_RACA': 'etinicity_code', 'TP_NACIONALIDADE': 'nationality_code',
    'TP_ST_CONCLUSAO': 'conclusion_code', 'TP_ANO_CONCLUIU': 'concluion_year', 'TP_ESCOLA': 'school_code',
    'TP_ENSINO': 'teaching_code', 'IN_TREINEIRO': 'is_training', 'CO_MUNICIPIO_PROVA': 'municipality_code',
    'NO_MUNICIPIO_PROVA': 'municipality_name', 'CO_UF_PROVA': 'uf_code', 'SG_UF_PROVA': 'uf_name',
    'NU_NOTA_CN': 'cn_score', 'NU_NOTA_CH': 'ch_score', 'NU_NOTA_LC': 'lc_score',
    'NU_NOTA_MT': 'mt_score', 'NU_NOTA_REDACAO': 'essay_score'
}

for year in years:
    # Step 1: Download the file
    print(f"Step 1: Downloading {year} file...")
    source_key = f'MICRODADOS_ENEM_{year}.csv'
    destination_key = f'summary_{source_key}'

    source_object_url = source_bucket_url + source_key

    # Stream the body to disk in chunks instead of buffering the whole raw
    # file in memory via `response.content`.
    with requests.get(source_object_url, stream=True, timeout=request_timeout) as response:
        if response.status_code != 200:
            print(f"Failed to download {year} file. Status code: {response.status_code}")
            # Nothing was written for this year; skip the remaining steps
            # rather than reading a missing (or stale) local file.
            continue

        with open(source_key, 'wb') as local_file:
            for chunk in response.iter_content(chunk_size=download_chunk_size):
                local_file.write(chunk)

    # Step 2: Create a DataFrame and select specific columns
    print(f"Step 2: Creating DataFrame for {year} file...")
    df = pd.read_csv(source_key, sep=';', encoding='latin1', usecols=columns_to_keep)

    # Step 3: Rename columns
    print(f"Step 3: Renaming columns for {year} file...")
    df = df.rename(columns=column_renames)

    # Step 4: Export the cleaned data to a new CSV
    print(f"Step 4: Exporting cleaned data for {year}...")
    df.to_csv(destination_key, index=False)

    # Step 5: Upload the CSV to the destination bucket
    print(f"Step 5: Uploading cleaned data to {destination_bucket_url}...")
    destination_object_url = destination_bucket_url + destination_key
    # Passing the open file object lets requests stream the upload from disk.
    with open(destination_key, 'rb') as local_file:
        response = requests.put(destination_object_url, data=local_file, timeout=request_timeout)

    if response.status_code == 200:
        print(f"Cleaning and uploading for {year} is complete!\n")
    else:
        print(f"Failed to upload {year} file. Status code: {response.status_code}")


Step 1: Downloading 2020 file...
Step 2: Creating DataFrame for 2020 file...
Step 3: Renaming columns for 2020 file...
Step 4: Exporting cleaned data for 2020...
Step 5: Uploading cleaned data to https://inep-cleaned.s3.amazonaws.com/...
Cleaning and uploading for 2020 is complete!

Step 1: Downloading 2021 file...
Step 2: Creating DataFrame for 2021 file...
Step 3: Renaming columns for 2021 file...
Step 4: Exporting cleaned data for 2021...
Step 5: Uploading cleaned data to https://inep-cleaned.s3.amazonaws.com/...
Cleaning and uploading for 2021 is complete!

Step 1: Downloading 2022 file...
Step 2: Creating DataFrame for 2022 file...
Step 3: Renaming columns for 2022 file...
Step 4: Exporting cleaned data for 2022...
Step 5: Uploading cleaned data to https://inep-cleaned.s3.amazonaws.com/...
Cleaning and uploading for 2022 is complete!



In [24]:
# Cleanup: delete the local raw and cleaned CSVs for every year in `years`.
# Removing only `source_key` / `destination_key` would delete just the files
# of the last loop iteration and leave the earlier years on disk.
for processed_year in years:
    raw_file_name = f'MICRODADOS_ENEM_{processed_year}.csv'
    cleaned_file_name = f'summary_{raw_file_name}'
    for local_path in (raw_file_name, cleaned_file_name):
        # Skip files that were never created (e.g. a failed download) or were
        # already removed, so this cell can be re-run without raising.
        if os.path.exists(local_path):
            os.remove(local_path)

## UPLOADING DATA TO POSTGRES