# 🚀 ED-COPILOT: PREPARAÇÃO DE DADOS PARA TREINAMENTO

In [1]:
import sys
sys.path.append('../src')

import pandas as pd
from models.linearization import FeatureLinearizer
from pathlib import Path

In [2]:
# Banner announcing the data-preparation stage.
rule = "=" * 80
print(rule)
print("üöÄ ED-COPILOT: PREPARA√á√ÉO DE DADOS PARA TREINAMENTO")
print(rule)

# 1. Load the pre-processed multimodal train/val/test splits from parquet.
print("\nüìÇ Carregando dados processados...")
train_df = pd.read_parquet('../data/processed/multimodal_train.parquet')
val_df = pd.read_parquet('../data/processed/multimodal_val.parquet')
test_df = pd.read_parquet('../data/processed/multimodal_test.parquet')

# Report the row count of every split, thousands-separated.
for split_label, split_frame in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
    print(f"   ‚úÖ {split_label}: {len(split_frame):,}")

# 2. Sanity-check the training frame: count the columns by name prefix and
#    report how many rows are flagged in the `has_text` column.
print("\nüîç Verificando estrutura dos dados...")
triage_cols = [col for col in train_df.columns if col.startswith('triage_')]
lab_cols = [col for col in train_df.columns if col.startswith('lab_')]
has_text = train_df['has_text']
print(f"   - Colunas triage: {len(triage_cols)}")
print(f"   - Colunas lab: {len(lab_cols)}")
print(f"   - Com texto: {has_text.sum():,} ({has_text.mean()*100:.1f}%)")

🚀 ED-COPILOT: PREPARAÇÃO DE DADOS PARA TREINAMENTO

📂 Carregando dados processados...
   ✅ Train: 151,326
   ✅ Val: 18,916
   ✅ Test: 18,916

🔍 Verificando estrutura dos dados...
   - Colunas triage: 13
   - Colunas lab: 38
   - Com texto: 49,599 (32.8%)


In [4]:
# 3. Linearize: turn each split into training examples with the project's
#    FeatureLinearizer (one call per split, same linearizer instance).
print("\nüîÑ Linearizando features...")
linearizer = FeatureLinearizer()

train_linear = linearizer.create_training_examples(train_df)
val_linear = linearizer.create_training_examples(val_df)
test_linear = linearizer.create_training_examples(test_df)

# 4. Summary statistics of the linearized training set.
print("\n" + "="*80)
print("üìä ESTAT√çSTICAS DO DATASET LINEARIZADO")
print("="*80)

print(f"\nüî¢ Tamanhos:")
print(f"   - Train: {len(train_linear):,}")
print(f"   - Val: {len(val_linear):,}")
print(f"   - Test: {len(test_linear):,}")

# Whitespace-token count per sequence. Computed once and reused below; the
# split over every row is the expensive part of this cell.
token_counts = train_linear['text_sequence'].str.split().str.len()
# NOTE(review): 656 is presumably the model's sequence-length budget -- confirm
# against the training configuration.
MAX_SEQ_TOKENS = 656

print(f"\nüìè Sequ√™ncias:")
avg_tokens = token_counts.mean()
max_tokens = token_counts.max()
print(f"   - Tokens m√©dios: {avg_tokens:.1f}")
print(f"   - Tokens m√°ximos: {max_tokens}")
print(f"   - Sequ√™ncias > {MAX_SEQ_TOKENS} tokens: {(token_counts > MAX_SEQ_TOKENS).sum()}")

# Boolean mask of rows whose `num_labs` is zero, reused for count and share.
no_labs = train_linear['num_labs'] == 0

print(f"\nüß™ Laborat√≥rios:")
print(f"   - N√∫mero m√©dio de grupos: {train_linear['num_labs'].mean():.2f}")
print(f"   - Time-cost m√©dio: {train_linear['total_time_cost'].mean():.1f} min")
print(f"   - Stays sem labs: {no_labs.sum():,} ({no_labs.mean()*100:.1f}%)")

print(f"\nüéØ Labels:")
print(f"   - Outcome positivo: {train_linear['outcome'].sum():,} ({train_linear['outcome'].mean()*100:.2f}%)")

print(f"\nüìù Texto cl√≠nico:")
print(f"   - Com texto: {train_linear['has_text'].sum():,} ({train_linear['has_text'].mean()*100:.1f}%)")



🔄 Linearizando features...
🔄 Linearizando sequências...
   Processados 10,000 / 151,326
   Processados 20,000 / 151,326
   Processados 30,000 / 151,326
   Processados 40,000 / 151,326
   Processados 50,000 / 151,326
   Processados 60,000 / 151,326
   Processados 70,000 / 151,326
   Processados 80,000 / 151,326
   Processados 90,000 / 151,326
   Processados 100,000 / 151,326
   Processados 110,000 / 151,326
   Processados 120,000 / 151,326
   Processados 130,000 / 151,326
   Processados 140,000 / 151,326
   Processados 150,000 / 151,326

✅ Linearização completa!
   - Total: 151,326 exemplos
   - Com labs: 61,280 (40.5%)
   - Com texto: 49,599 (32.8%)
🔄 Linearizando sequências...
   Processados 10,000 / 18,916

✅ Linearização completa!
   - Total: 18,916 exemplos
   - Com labs: 7,605 (40.2%)
   - Com texto: 6,146 (32.5%)
🔄 Linearizando sequências...
   Processados 10,000 / 18,916

✅ Linearização completa!
   - Total: 18,916 exemplos
   - Com labs: 7,569 (40.0%)

In [5]:
# 5. Lab-group distribution over the training set.
from collections import Counter

print(f"\nüî¨ Distribui√ß√£o de Grupos (Train):")

# Flatten the per-row `lab_groups` collections into one list of group names.
all_groups = []
for row_groups in train_linear['lab_groups']:
    all_groups.extend(row_groups)

group_counts = Counter(all_groups)
n_train_rows = len(train_linear)

# Most frequent group first; the percentage is relative to the number of
# training rows, not to the total number of group occurrences.
for group, count in group_counts.most_common():
    pct = count / n_train_rows * 100
    print(f"   - {group:12s}: {count:6,} ({pct:5.1f}%)")

# 6. Persist the three linearized splits as parquet files.
print("\nüíæ Salvando datasets linearizados...")
output_dir = Path('../data/processed/linearized')
output_dir.mkdir(parents=True, exist_ok=True)

for file_stem, linear_frame in (
    ('train', train_linear),
    ('val', val_linear),
    ('test', test_linear),
):
    linear_frame.to_parquet(output_dir / f'{file_stem}.parquet')

print(f"   ‚úÖ Salvos em: {output_dir}")


🔬 Distribuição de Grupos (Train):
   - BLOOD_GAS   : 49,371 ( 32.6%)
   - CHEM        : 48,365 ( 32.0%)
   - CBC         : 47,442 ( 31.4%)
   - LYTES       : 31,537 ( 20.8%)
   - COAG        : 28,827 ( 19.0%)
   - LFTS        : 22,021 ( 14.6%)
   - UA          : 20,398 ( 13.5%)
   - LACTATE     : 19,567 ( 12.9%)
   - LIPASE      : 10,217 (  6.8%)
   - INFLAM      : 10,140 (  6.7%)
   - CARDIO      :  3,382 (  2.2%)

💾 Salvando datasets linearizados...
   ✅ Salvos em: ../data/processed/linearized


In [6]:
# 7. Example: print one linearized training row for visual inspection.
print("\n" + "="*80)
print("üìù EXEMPLO DE SEQU√äNCIA LINEARIZADA")
print("="*80)

# Candidate rows: at least one lab group AND flagged as having text.
# NOTE(review): assumes `has_text` is a boolean column -- confirm.
candidates = train_linear[
    (train_linear['num_labs'] > 0) &
    (train_linear['has_text'])
]

if candidates.empty:
    # Without this guard `.iloc[0]` raises a bare IndexError that does not
    # say which filter came back empty.
    print("\nNenhum exemplo com labs e texto encontrado no conjunto de treino.")
else:
    example = candidates.iloc[0]
    separator = '‚îÄ' * 80

    print(f"\nStay ID: {example['stay_id']}")
    print(f"Outcome: {'POSITIVO' if example['outcome'] == 1 else 'NEGATIVO'}")
    print(f"Grupos realizados: {example['lab_groups']}")
    print(f"Time-cost total: {example['total_time_cost']} min")
    print(f"N√∫mero de tokens: {len(example['text_sequence'].split())}")

    # Tabular sequence preview: first 800 characters, with an ellipsis line
    # only when something was actually cut off.
    print(f"\n{separator}")
    print("Sequ√™ncia Tabular:")
    print(separator)
    print(example['text_sequence'][:800])
    if len(example['text_sequence']) > 800:
        print("...")

    # Clinical text preview: first 500 characters. The ellipsis is appended
    # only when the text was truncated, matching the preview above (it used
    # to be appended unconditionally, even to short texts).
    # NOTE(review): assumes `clinical_text` is a str whenever `has_text` is
    # truthy -- confirm against FeatureLinearizer.
    print(f"\n{separator}")
    print("Texto Cl√≠nico (primeiros 500 chars):")
    print(separator)
    clinical_text = example['clinical_text']
    print(clinical_text[:500] + ("..." if len(clinical_text) > 500 else ""))

print("\n" + "="*80)
print("‚úÖ PREPARA√á√ÉO CONCLU√çDA!")
print("="*80)
print("\nüéØ Pr√≥ximos passos:")
print("   1. python scripts/train_sft.py")
print("   2. Avaliar modelo base")
print("   3. Implementar RL")



📝 EXEMPLO DE SEQUÊNCIA LINEARIZADA

Stay ID: 38468768
Outcome: NEGATIVO
Grupos realizados: ['CBC', 'CHEM', 'LYTES', 'BLOOD_GAS']
Time-cost total: 191 min
Número de tokens: 50

────────────────────────────────────────────────────────────────────────────────
Sequência Tabular:
────────────────────────────────────────────────────────────────────────────────
gender: Female | age: 84 | heart_rate: 88.0 | respiratory_rate: 14.0 | sbp: 138.0 | dbp: 85.0 | temperature: 99.6 | spo2: 100.0 | acuity: 3.0 | pain: 0.0 | chief_complaint: Weakness | [EOS] CBC: 0.10000000149011612 | [EOS] CHEM: 12.0 | [EOS] LYTES: 1.600000023841858 | [EOS] BLOOD_GAS: 84.0 | [EOS]

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚