# ⚡ SOLUÇÃO RÁPIDA - REPLICAR O QUE DEU 78.26% + 1 AJUSTE

**Objetivo:** Replicar exatamente o que funcionou (78.26%) e fazer 1 ajuste para 80%

**Estratégia:** RFECV + 3 configurações rápidas de RandomForest


In [8]:
# ============================================================================
# QUICK SOLUTION - REPLICATE WHAT SCORED 78.26% + 1 ADJUSTMENT
# ============================================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
import warnings
warnings.filterwarnings('ignore')

# One seed for the whole notebook: it is fed to NumPy's global RNG here and
# passed as `random_state` to the splitters and forests further down.
RANDOM_STATE = 42
np.random.seed(seed=RANDOM_STATE)

# Two-line start-up banner (same text and line breaks as two separate prints).
print(
    "‚ö° SOLU√á√ÉO R√ÅPIDA - Replicando o que deu 78.26%!",
    f"Random State: {RANDOM_STATE}",
    sep="\n",
)


‚ö° SOLU√á√ÉO R√ÅPIDA - Replicando o que deu 78.26%!
Random State: 42


In [9]:
# ============================================================================
# 1. DATA LOADING
# ============================================================================
# The three CSVs are read in a fixed order; the relative paths resolve against
# the current working directory.
train, test, sample = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Quick sanity report: frame shapes and the relative frequency of each label.
print(f'Train: {train.shape}, Test: {test.shape}')
print('Distribui√ß√£o de classes:')
print(train['labels'].value_counts(normalize=True))


Train: (646, 33), Test: (277, 32)
Distribui√ß√£o de classes:
labels
1    0.647059
0    0.352941
Name: proportion, dtype: float64


In [10]:
# ============================================================================
# 2. PRE-PROCESSING (SAME AS WHAT WORKED)
# ============================================================================

def feature_engineering(df):
    """Return a cleaned copy of ``df`` with derived funding/milestone features.

    The input frame is not modified. The following columns must be present:
    ``age_first_funding_year``, ``age_last_funding_year``,
    ``age_first_milestone_year``, ``age_last_milestone_year``,
    ``avg_participants``, ``funding_rounds``, ``funding_total_usd``,
    ``relationships``, ``milestones`` and the six ``has_*`` flags listed in
    ``investor_cols``.

    Steps, in order:
      1. Impute the milestone ages and ``avg_participants``.
      2. Add the derived ratio / log / flag features.
      3. Drop identifier and category columns when they exist.
      4. Turn +/-inf into NaN and fill every remaining numeric NaN with the
         median of its own column (computed on ``df`` itself).

    Args:
        df: pandas DataFrame holding the raw columns listed above.

    Returns:
        A new DataFrame with the imputed, derived and pruned columns.
    """
    df = df.copy()

    # --- Missing values ----------------------------------------------------
    # Results are assigned back to the column on purpose.  The previous
    # ``df[col].fillna(..., inplace=True)`` form acts on a temporary selection:
    # it warns on recent pandas and leaves ``df`` untouched under Copy-on-Write.
    df['age_first_milestone_year'] = df['age_first_milestone_year'].fillna(
        df['age_first_funding_year'] + 1
    )
    df['age_last_milestone_year'] = df['age_last_milestone_year'].fillna(
        df['age_last_funding_year']
    )
    # Median within the same number of funding rounds first ...
    df['avg_participants'] = df.groupby('funding_rounds')['avg_participants'].transform(
        lambda x: x.fillna(x.median())
    )
    # ... then the overall median for groups that had no observed value at all.
    df['avg_participants'] = df['avg_participants'].fillna(
        df['avg_participants'].median()
    )

    # --- Derived features --------------------------------------------------
    # The "+ 1" terms keep the denominators away from zero for the usual
    # non-negative inputs; any division that still blows up is cleaned below.
    df['funding_efficiency'] = df['funding_total_usd'] / (df['funding_rounds'] + 1)
    age_last_plus = df['age_last_funding_year'] + 1
    df['relationship_density'] = df['relationships'] / age_last_plus
    df['milestone_per_year'] = df['milestones'] / age_last_plus
    # A funding span of exactly 0 is swapped for 0.1 to avoid dividing by zero.
    age_diff = (df['age_last_funding_year'] - df['age_first_funding_year'] + 1).replace(0, 0.1)
    df['funding_velocity'] = df['funding_total_usd'] / age_diff

    investor_cols = ['has_VC', 'has_angel', 'has_roundA', 'has_roundB', 'has_roundC', 'has_roundD']
    df['investor_diversity'] = df[investor_cols].sum(axis=1)
    df['funding_log'] = np.log1p(df['funding_total_usd'])
    df['has_early_milestone'] = (df['age_first_milestone_year'] < 2).astype(int)
    df['milestone_to_funding_ratio'] = df['milestones'] / (df['funding_rounds'] + 1)

    # --- Column pruning ----------------------------------------------------
    # Only drop what is actually present so the function works on frames that
    # lack some of these columns.
    drop_cols = ['id', 'category_code', 'is_othercategory', 'is_consulting', 'is_otherstate']
    df = df.drop(columns=[c for c in drop_cols if c in df.columns])

    # --- Final clean-up ----------------------------------------------------
    df = df.replace([np.inf, -np.inf], np.nan)
    for col in df.select_dtypes(include=[np.number]).columns:
        if df[col].isna().any():
            df[col] = df[col].fillna(df[col].median())

    return df

print("üîß Aplicando feature engineering (igual ao que funcionou)...")

# The same transformation is applied to both frames, each on its own.
train_fe, test_fe = feature_engineering(train), feature_engineering(test)

# Split the target off the training frame; the test frame has no target column
# to remove and is simply copied.
y_train = train_fe['labels']
X_train = train_fe.drop(columns='labels')
X_test = test_fe.copy()

# Keep only the columns present in both frames, in the training frame's order,
# so both matrices share the same layout.
common_cols = X_train.columns.intersection(X_test.columns)
X_train, X_test = X_train[common_cols], X_test[common_cols]

# Standardisation statistics come from the training rows only and are reused
# unchanged for the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'‚úÖ Features: {X_train_scaled.shape[1]}')


üîß Aplicando feature engineering (igual ao que funcionou)...
‚úÖ Features: 35


In [11]:
# ============================================================================
# 3. RFECV (THE STEP THAT SCORED 78.26%) + ADJUSTMENT
# ============================================================================

print('\n‚è≥ Executando RFECV (pode levar 5-10 minutos)...')

# Forest handed to RFECV as the estimator used during feature elimination.
rf_base = RandomForestClassifier(
    random_state=RANDOM_STATE,
    n_jobs=-1,
    class_weight='balanced',
    max_features='sqrt',
    min_samples_leaf=5,
    min_samples_split=10,
    max_depth=15,
    n_estimators=300,
)

# Recursive elimination, one feature per step, scored by accuracy on
# 5 shuffled stratified folds with a fixed seed.
cv_rfecv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
rfecv = RFECV(
    rf_base,
    step=1,
    cv=cv_rfecv,
    scoring='accuracy',
    n_jobs=-1,
).fit(X_train_scaled, y_train)

print(f'‚úÖ Features √≥timas: {rfecv.n_features_}')
print(f'‚úÖ CV Score: {rfecv.cv_results_["mean_test_score"].max():.4f}')

# Reduce both matrices to the columns the selector kept.
X_train_rfecv, X_test_rfecv = (
    rfecv.transform(matrix) for matrix in (X_train_scaled, X_test_scaled)
)



‚è≥ Executando RFECV (pode levar 5-10 minutos)...
‚úÖ Features √≥timas: 26
‚úÖ CV Score: 0.7724


In [12]:
# ============================================================================
# 4. FINAL TRAINING WITH HYPER-PARAMETER ADJUSTMENT
# ============================================================================

# Three quick candidates; only tree count and depth vary between them.
configs = [
    {'n_estimators': 400, 'max_depth': 16},  # more trees
    {'n_estimators': 350, 'max_depth': 18},  # more depth
    {'n_estimators': 300, 'max_depth': 15},  # original (baseline)
]

best_score = 0
best_model = None

# Every candidate is scored on the same 10 shuffled stratified folds: the
# integer seed makes the splitter yield identical folds on each use.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

for config_no, config in enumerate(configs, start=1):
    rf = RandomForestClassifier(
        n_estimators=config['n_estimators'],
        max_depth=config['max_depth'],
        min_samples_split=10,
        min_samples_leaf=5,
        max_features='sqrt',
        class_weight='balanced',
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )

    scores = cross_val_score(rf, X_train_rfecv, y_train, cv=cv, scoring='accuracy')
    mean_accuracy = scores.mean()

    print(f'Config {config_no}: CV = {mean_accuracy:.4f}')

    # Strict comparison: on a tie the earlier candidate is kept.
    # The estimator stored here is still unfitted.
    if mean_accuracy > best_score:
        best_score, best_model = mean_accuracy, rf

print(f'\n‚úÖ Melhor CV: {best_score:.4f}')


Config 1: CV = 0.7679
Config 2: CV = 0.7648
Config 3: CV = 0.7633

‚úÖ Melhor CV: 0.7679


In [13]:
# ============================================================================
# 5. PREDICTION AND SUBMISSION
# ============================================================================

# Fit the selected configuration on the full (feature-selected) training set
# and label the test rows.
best_model.fit(X_train_rfecv, y_train)
y_pred = best_model.predict(X_test_rfecv)

# The submission reuses the sample file's layout; predictions are written
# positionally into its 'labels' column.
# NOTE(review): this assumes sample_submission.csv lists rows in the same order
# as test.csv - confirm.
submission = sample.copy()
submission['labels'] = y_pred
submission.to_csv('submission_FINAL_RAPIDO.csv', index=False)

print('\n‚úÖ Arquivo: submission_FINAL_RAPIDO.csv')
# Share of rows predicted as class 1.  A direct mean is used instead of
# value_counts(normalize=True)[1], which raises KeyError when class 1 is never
# predicted; here that case simply reports 0.0%.
print(f'Distribui√ß√£o: {np.mean(y_pred == 1):.1%} sucesso')
print('\nüéØ EXPECTATIVA: 78.5-80%')

# Final analysis
print(f'\n{"="*60}')
print('RESULTADO FINAL')
print(f'{"="*60}')
print(f'CV Score: {best_score:.4f}')
print(f'Features selecionadas: {rfecv.n_features_}')
# Constructor parameters are read straight from the estimator attributes.
print(f'Configura√ß√£o: {best_model.n_estimators} √°rvores, max_depth={best_model.max_depth}')

# Verdict bands on the cross-validated accuracy.
if best_score > 0.79:
    print(f'\nüéâ EXCELENTE! CV {best_score:.4f} indica potencial para 80%+ no Kaggle!')
elif best_score > 0.78:
    print(f'\n‚úÖ BOM! CV {best_score:.4f} - pr√≥ximo de 80%!')
else:
    print(f'\n‚ö†Ô∏è CV {best_score:.4f} - pode precisar de mais ajustes')



‚úÖ Arquivo: submission_FINAL_RAPIDO.csv
Distribui√ß√£o: 64.3% sucesso

üéØ EXPECTATIVA: 78.5-80%

RESULTADO FINAL
CV Score: 0.7679
Features selecionadas: 26
Configura√ß√£o: 400 √°rvores, max_depth=16

‚ö†Ô∏è CV 0.7679 - pode precisar de mais ajustes


In [14]:
# ============================================================================
# 6. DECISION-THRESHOLD TEST (if needed)
# ============================================================================

if best_score < 0.80:
    print("\nüîß Testando otimiza√ß√£o de threshold...")

    # Candidate cut-offs on the predicted probability of class 1.
    thresholds = [0.4, 0.42, 0.45, 0.47, 0.5]
    best_thresh = 0.5
    best_cv = best_score

    # The fold models do not depend on the threshold, so each fold is fitted
    # once and its validation labels / class-1 probabilities are kept.  With
    # the fixed seeds this yields the same scores as refitting per threshold,
    # at a fifth of the training cost.
    cv_thresh = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)
    fold_results = []
    for train_idx, val_idx in cv_thresh.split(X_train_rfecv, y_train):
        rf_temp = RandomForestClassifier(**best_model.get_params())
        rf_temp.fit(X_train_rfecv[train_idx], y_train.iloc[train_idx])
        fold_results.append((
            y_train.iloc[val_idx].to_numpy(),
            rf_temp.predict_proba(X_train_rfecv[val_idx])[:, 1],
        ))

    for thresh in thresholds:
        # Per-fold accuracy at this cut-off, then the mean over folds.
        scores = [
            np.mean((fold_proba >= thresh).astype(int) == fold_labels)
            for fold_labels, fold_proba in fold_results
        ]

        cv_score_thresh = np.mean(scores)
        print(f'Threshold {thresh}: CV = {cv_score_thresh:.4f}')

        # Strict comparison: a tie keeps the earlier winner (0.5 by default).
        # NOTE(review): the cut-off is picked on the same folds that report
        # its score, so best_cv is an optimistic estimate.
        if cv_score_thresh > best_cv:
            best_cv = cv_score_thresh
            best_thresh = thresh

    if best_thresh != 0.5:
        print(f'\n‚úÖ Melhor threshold: {best_thresh} (CV: {best_cv:.4f})')

        # Apply the selected cut-off to the test probabilities; best_model must
        # already be fitted (done in the prediction cell) for this call.
        y_proba_test = best_model.predict_proba(X_test_rfecv)[:, 1]
        y_pred_optimized = (y_proba_test >= best_thresh).astype(int)

        submission_optimized = sample.copy()
        submission_optimized['labels'] = y_pred_optimized
        submission_optimized.to_csv('submission_FINAL_THRESHOLD.csv', index=False)

        # Direct mean instead of value_counts(normalize=True)[1], which raises
        # KeyError when no row is predicted as class 1.
        success_rate_opt = np.mean(y_pred_optimized == 1)
        print('‚úÖ Arquivo otimizado: submission_FINAL_THRESHOLD.csv')
        print(f'Distribui√ß√£o: Sucesso={success_rate_opt:.1%}')
    else:
        print('\n‚ö†Ô∏è Threshold padr√£o (0.5) j√° √© o melhor')
else:
    print("‚úÖ CV j√° est√° acima de 80% - n√£o precisa de threshold!")

# Closing summary; the second file is only listed as conditional because it is
# written solely when a non-default threshold won.
print('\nüéØ RESUMO FINAL:')
print(f'CV Score: {best_score:.4f}')
print(f'Features: {rfecv.n_features_}')
print('Arquivos gerados: submission_FINAL_RAPIDO.csv')
if best_score < 0.80:
    print('                 submission_FINAL_THRESHOLD.csv (se threshold melhorou)')



üîß Testando otimiza√ß√£o de threshold...
Threshold 0.4: CV = 0.7833
Threshold 0.42: CV = 0.7833
Threshold 0.45: CV = 0.7787
Threshold 0.47: CV = 0.7741
Threshold 0.5: CV = 0.7679

‚úÖ Melhor threshold: 0.42 (CV: 0.7833)
‚úÖ Arquivo otimizado: submission_FINAL_THRESHOLD.csv
Distribui√ß√£o: Sucesso=73.3%

üéØ RESUMO FINAL:
CV Score: 0.7679
Features: 26
Arquivos gerados: submission_FINAL_RAPIDO.csv
                 submission_FINAL_THRESHOLD.csv (se threshold melhorou)
