# ETL SILVER -> GOLD (PostgreSQL)
**Projeto:** Social Media User Analysis
**Origem:** Schema `silver` (Tabela tratada e limpa)
**Destino:** Schema `dw` (Data Warehouse - Modelagem Galaxy Schema)

**Arquitetura:**
- **Dimensões:** Usuario, EstiloVida, Conta, Interesse
- **Fatos:** Fato_Ads_Performance, Fato_Engajamento_App

In [15]:
import os

import numpy as np
import pandas as pd
import psycopg2
from psycopg2.extras import execute_batch

# --- CONNECTION SETTINGS ---
# Each value can be overridden through an environment variable of the same
# name, so real credentials do not have to be edited into (and committed
# with) the notebook. The fallbacks are the original local-development values.
DB_HOST = os.environ.get("DB_HOST", "localhost")
DB_NAME = os.environ.get("DB_NAME", "instagram_usage")
DB_USER = os.environ.get("DB_USER", "sbd2")
DB_PASS = os.environ.get("DB_PASS", "sbd2123")  # development default; set DB_PASS to use your own password

# Schemas and tables
SILVER_SCHEMA = "silver"
SILVER_TABLE = "user"  # name of the table created in the previous step
GOLD_SCHEMA = "dw"

## 1. Extração (Leitura da Silver)

In [16]:
# Load the whole silver table into memory through a short-lived connection.
# conn starts as None so the finally clause is safe even when connect() fails.
conn = None
try:
    conn = psycopg2.connect(host=DB_HOST, database=DB_NAME, user=DB_USER, password=DB_PASS)

    query = f"SELECT * FROM {SILVER_SCHEMA}.{SILVER_TABLE}"
    df_silver = pd.read_sql(query, conn)

    print(f"Dados carregados da Silver: {df_silver.shape}")
except Exception as e:
    print(f"Erro ao ler do banco: {e}")
    # Re-raise so the following cells do not run with df_silver missing
    # (or left over from an earlier execution).
    raise
finally:
    # Close the connection on both the success and the failure path.
    if conn is not None:
        conn.close()

  df_silver = pd.read_sql(query, conn)


Dados carregados da Silver: (1506286, 58)


## 2. Transformação (Modelagem Dimensional)

In [17]:
# --- 2.1 Build the dimension DataFrames ---


def _build_dimension(source, attribute_cols, sk_name):
    """Return the distinct attribute rows of *source* with a 1-based surrogate key.

    The surrogate key column ``sk_name`` comes first, followed by
    ``attribute_cols`` in the order given.
    """
    dimension = source[attribute_cols].drop_duplicates().reset_index(drop=True)
    # After reset_index the index is 0..n-1, so index + 1 yields keys 1..n
    dimension[sk_name] = dimension.index + 1
    return dimension[[sk_name, *attribute_cols]]


# Dim_Usuario
cols_usuario = [
    'user_id', 'age', 'gender', 'country', 'urban_rural',
    'income_level', 'employment_status', 'education_level',
    'relationship_status', 'has_children',
]
dim_usuario = _build_dimension(df_silver, cols_usuario, 'sk_usuario')

# Dim_EstiloVida
cols_estilovida = [
    'exercise_hours_per_week', 'sleep_hours_per_night', 'diet_quality',
    'smoking', 'alcohol_frequency', 'perceived_stress_score',
    'self_reported_happiness', 'body_mass_index', 'blood_pressure_systolic',
    'blood_pressure_diastolic', 'daily_steps_count', 'weekly_work_hours',
    'hobbies_count', 'social_events_per_month', 'books_read_per_year',
    'volunteer_hours_per_month', 'travel_frequency_per_year',
]
dim_estilovida = _build_dimension(df_silver, cols_estilovida, 'sk_estilovida')

# Dim_Conta
cols_conta = [
    'app_name', 'account_creation_year', 'last_login_date',
    'uses_premium_features', 'privacy_setting_level',
    'two_factor_auth_enabled', 'biometric_login_used',
    'subscription_status',
]
dim_conta = _build_dimension(df_silver, cols_conta, 'sk_conta')

# Dim_Interesse
cols_interesse = ['content_type_preference', 'preferred_content_theme']
dim_interesse = _build_dimension(df_silver, cols_interesse, 'sk_interesse')

print("Dimensões preparadas.")

Dimensões preparadas.


In [18]:
# --- 2.2 Build the fact base (merge to attach the surrogate keys) ---

# Each left merge looks up the surrogate key of one dimension for every
# silver row. merge() returns a new DataFrame, so df_silver is not modified
# and no explicit copy is needed.
#
# Usuario is joined on user_id only, while dim_usuario was de-duplicated on
# all user attributes. validate='many_to_one' makes pandas raise a MergeError
# if a user_id ever appears more than once in the dimension, instead of
# silently multiplying fact rows. The other dimensions are joined on exactly
# the columns they were de-duplicated on.
fato_base = (
    df_silver
    .merge(dim_usuario[['sk_usuario', 'user_id']], on='user_id', how='left',
           validate='many_to_one')
    .merge(dim_estilovida, on=cols_estilovida, how='left')
    .merge(dim_conta, on=cols_conta, how='left')
    .merge(dim_interesse, on=cols_interesse, how='left')
)

# Surrogate-key columns shared by every fact table
sks = ['sk_usuario', 'sk_estilovida', 'sk_conta', 'sk_interesse']

In [19]:
# --- 2.3 Build the fact DataFrames ---

# Fato_Ads_Performance: the four surrogate keys plus the ad metrics
metrics_ads = ['ads_viewed_per_day', 'ads_clicked_per_day']
fato_ads = fato_base.loc[:, [*sks, *metrics_ads]].copy()

# Fato_Engajamento_App: the four surrogate keys plus the engagement metrics
metrics_engajamento = [
    'daily_active_minutes_instagram',
    'sessions_per_day',
    'posts_created_per_week',
    'reels_watched_per_day',
    'stories_viewed_per_day',
    'likes_given_per_day',
    'comments_written_per_day',
    'dms_sent_per_week',
    'dms_received_per_week',
    'time_on_feed_per_day',
    'time_on_explore_per_day',
    'time_on_messages_per_day',
    'time_on_reels_per_day',
    'followers_count',
    'following_count',
    'notification_response_rate',
    'average_session_length_minutes',
    'linked_accounts_count',
    'user_engagement_score',
]
fato_engajamento = fato_base.loc[:, [*sks, *metrics_engajamento]].copy()

print("Fatos preparadas.")

Fatos preparadas.


## 3. Carga (Load into DW)

In [21]:
def create_tables_ddl(cur):
    """
    Create the GOLD_SCHEMA schema and its dimension and fact tables.

    Every statement uses IF NOT EXISTS, so objects that are already present
    are left as they are. The statements are sent through ``cur.execute`` in
    the order listed below; this function does not commit.
    """
    ddl_statements = (
        # Target schema
        f"CREATE SCHEMA IF NOT EXISTS {GOLD_SCHEMA};",
        # Dim_Usuario
        f"""
        CREATE TABLE IF NOT EXISTS {GOLD_SCHEMA}.dim_usuario (
            sk_usuario INT PRIMARY KEY,
            user_id INT,
            age INT,
            gender VARCHAR(50),
            country VARCHAR(100),
            urban_rural VARCHAR(50),
            income_level VARCHAR(50),
            employment_status VARCHAR(100),
            education_level VARCHAR(100),
            relationship_status VARCHAR(100),
            has_children BOOLEAN
        );
    """,
        # Dim_Interesse
        f"""
        CREATE TABLE IF NOT EXISTS {GOLD_SCHEMA}.dim_interesse (
            sk_interesse INT PRIMARY KEY,
            content_type_preference VARCHAR(100),
            preferred_content_theme VARCHAR(100)
        );
    """,
        # Dim_Conta
        f"""
        CREATE TABLE IF NOT EXISTS {GOLD_SCHEMA}.dim_conta (
            sk_conta INT PRIMARY KEY,
            app_name VARCHAR(100),
            account_creation_year INT,
            last_login_date DATE,
            uses_premium_features BOOLEAN,
            privacy_setting_level VARCHAR(50),
            two_factor_auth_enabled BOOLEAN,
            biometric_login_used BOOLEAN,
            subscription_status VARCHAR(50)
        );
    """,
        # Dim_EstiloVida
        f"""
        CREATE TABLE IF NOT EXISTS {GOLD_SCHEMA}.dim_estilovida (
            sk_estilovida INT PRIMARY KEY,
            exercise_hours_per_week FLOAT,
            sleep_hours_per_night FLOAT,
            diet_quality VARCHAR(50),
            smoking BOOLEAN,
            alcohol_frequency VARCHAR(50),
            perceived_stress_score INT,
            self_reported_happiness INT,
            body_mass_index FLOAT,
            blood_pressure_systolic INT,
            blood_pressure_diastolic INT,
            daily_steps_count INT,
            weekly_work_hours FLOAT,
            hobbies_count INT,
            social_events_per_month INT,
            books_read_per_year INT,
            volunteer_hours_per_month FLOAT,
            travel_frequency_per_year INT
        );
    """,
        # Fato_Ads_Performance (surrogate keys + ad metrics)
        f"""
        CREATE TABLE IF NOT EXISTS {GOLD_SCHEMA}.fato_ads_performance (
            sk_usuario INT,
            sk_estilovida INT,
            sk_conta INT,
            sk_interesse INT,
            ads_viewed_per_day FLOAT,
            ads_clicked_per_day FLOAT
        );
    """,
        # Fato_Engajamento_App (surrogate keys + engagement metrics)
        f"""
        CREATE TABLE IF NOT EXISTS {GOLD_SCHEMA}.fato_engajamento_app (
            sk_usuario INT,
            sk_estilovida INT,
            sk_conta INT,
            sk_interesse INT,
            daily_active_minutes_instagram FLOAT,
            sessions_per_day INT,
            posts_created_per_week INT,
            reels_watched_per_day INT,
            stories_viewed_per_day INT,
            likes_given_per_day INT,
            comments_written_per_day INT,
            dms_sent_per_week INT,
            dms_received_per_week INT,
            time_on_feed_per_day FLOAT,
            time_on_explore_per_day FLOAT,
            time_on_messages_per_day FLOAT,
            time_on_reels_per_day FLOAT,
            followers_count INT,
            following_count INT,
            notification_response_rate FLOAT,
            average_session_length_minutes FLOAT,
            linked_accounts_count INT,
            user_engagement_score FLOAT
        );
    """,
    )

    # The schema statement comes first so the tables can be created inside it
    for statement in ddl_statements:
        cur.execute(statement)


In [22]:
def insert_data(cur, df, table_name, page_size=1000):
    """
    Bulk-insert a DataFrame into ``{GOLD_SCHEMA}.{table_name}`` with execute_batch.

    Args:
        cur: Open database cursor the INSERT statements are executed on.
        df: Rows to insert; its column names are used as the target columns.
        table_name: Name of the target table inside GOLD_SCHEMA.
        page_size: Number of statements execute_batch sends per round trip.

    Returns:
        None. For an empty DataFrame nothing is executed or printed.

    Note:
        Table and column names are interpolated into the SQL text unquoted,
        so they must come from trusted code, never from external input.
    """
    if df.empty:
        return

    cols = df.columns.tolist()
    cols_str = ", ".join(cols)
    placeholders = ", ".join(["%s"] * len(cols))

    sql = f"INSERT INTO {GOLD_SCHEMA}.{table_name} ({cols_str}) VALUES ({placeholders})"

    # Cast to object column by column BEFORE extracting the rows. Calling
    # to_numpy() on a frame that mixes int and float columns upcasts
    # everything to float64, which would send integer keys as floats (1.0).
    # With object columns each value keeps its own native Python type.
    as_objects = df.astype(object)
    # Missing values (NaN/NaT/None) become None, which is sent as SQL NULL.
    cleaned = as_objects.where(df.notna(), None)
    data = list(cleaned.itertuples(index=False, name=None))

    execute_batch(cur, sql, data, page_size=page_size)
    print(f"Inseridos {len(data)} registros em {table_name}")

# --- Load execution ---
# conn starts as None so the except/finally clauses stay safe when
# psycopg2.connect itself fails; otherwise `conn` would be unbound or would
# still refer to an earlier, already-closed connection.
conn = None
try:
    conn = psycopg2.connect(host=DB_HOST, database=DB_NAME, user=DB_USER, password=DB_PASS)

    # The cursor context manager closes the cursor when the block exits.
    with conn.cursor() as cur:
        # 1. Create schema and tables
        create_tables_ddl(cur)

        # 2. Empty the existing tables (full-refresh strategy).
        # Facts are truncated before dimensions; CASCADE additionally clears
        # any table that references them through a foreign key.
        tables_fact = ['fato_ads_performance', 'fato_engajamento_app']
        tables_dim = ['dim_usuario', 'dim_estilovida', 'dim_conta', 'dim_interesse']

        for t in tables_fact + tables_dim:
            cur.execute(f"TRUNCATE TABLE {GOLD_SCHEMA}.{t} CASCADE;")

        # 3. Insert dimensions
        insert_data(cur, dim_usuario, 'dim_usuario')
        insert_data(cur, dim_estilovida, 'dim_estilovida')
        insert_data(cur, dim_conta, 'dim_conta')
        insert_data(cur, dim_interesse, 'dim_interesse')

        # 4. Insert facts
        insert_data(cur, fato_ads, 'fato_ads_performance')
        insert_data(cur, fato_engajamento, 'fato_engajamento_app')

    # Commit only after every step above has succeeded
    conn.commit()
    print("\nCarga DW concluída com sucesso!")

except Exception as e:
    # Undo the partial load; skipped when the connection was never opened
    if conn is not None:
        conn.rollback()
    print(f"Erro na carga: {e}")
    # Re-raise so a failed load is not mistaken for a successful run
    raise
finally:
    if conn is not None:
        conn.close()

Inseridos 1506286 registros em dim_usuario
Inseridos 1506286 registros em dim_estilovida
Inseridos 315973 registros em dim_conta
Inseridos 48 registros em dim_interesse
Inseridos 1506286 registros em fato_ads_performance
Inseridos 1506286 registros em fato_engajamento_app

Carga DW concluída com sucesso!
