In [1]:
# =========================================================
# 1. IMPORTAR BIBLIOTECAS
# =========================================================
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

pd.set_option("display.max_columns", 100)

# =========================================================
# 2. CARREGAR DADOS
# =========================================================
train_df = pd.read_csv("train.csv")
test_df  = pd.read_csv("test.csv")

print("Shape train:", train_df.shape)
print("Shape test:", test_df.shape)
print("\nPrimeiras linhas do train:")
display(train_df.head())

# =========================================================
# 3. PREPARAR DADOS (Limpeza simples + Outliers + Encoding)
# =========================================================

# (a) Tratar valores ausentes simples (se houver)
na_counts = train_df.isnull().sum()
print("\nValores nulos por coluna (train):")
print(na_counts[na_counts > 0])

# Exemplo simples: preencher numéricos com mediana, categóricos com 'missing'
numeric_cols = train_df.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols = train_df.select_dtypes(include=['object']).columns.tolist()

# Evitar alterar target
if 'labels' in numeric_cols:
    numeric_cols.remove('labels')

for col in numeric_cols:
    train_df[col].fillna(train_df[col].median(), inplace=True)
    if col in test_df.columns:
        test_df[col].fillna(train_df[col].median(), inplace=True)

for col in cat_cols:
    train_df[col].fillna("missing", inplace=True)
    if col in test_df.columns:
        test_df[col].fillna("missing", inplace=True)

# (b) Capping de outliers (IQR) somente em colunas numéricas (exceto id / target)
def cap_outliers(df, cols):
    for c in cols:
        if c in ['id','labels']: 
            continue
        Q1 = df[c].quantile(0.25)
        Q3 = df[c].quantile(0.75)
        IQR = Q3 - Q1
        low  = Q1 - 1.5*IQR
        high = Q3 + 1.5*IQR
        df[c] = df[c].clip(low, high)
    return df

train_df = cap_outliers(train_df, numeric_cols)
test_df  = cap_outliers(test_df, [c for c in numeric_cols if c in test_df.columns])

# (c) One-Hot Encoding consistente (juntar antes, depois separar)
train_df['_is_train'] = 1
test_df['_is_train']  = 0
test_df['labels'] = np.nan   # placeholder para permitir concat

full = pd.concat([train_df, test_df], ignore_index=True)

# Escolher colunas categóricas reais (ex: category_code + flags se existirem)
if 'category_code' in full.columns:
    full['category_code'] = full['category_code'].astype(str)

categorical_for_dummies = []
if 'category_code' in full.columns:
    categorical_for_dummies.append('category_code')

# Criar dummies
full = pd.get_dummies(full, columns=categorical_for_dummies, drop_first=True)

# Separar novamente
train_processed = full[full['_is_train'] == 1].drop(columns=['_is_train'])
test_processed  = full[full['_is_train'] == 0].drop(columns=['_is_train','labels'])

# Separar features / target
y = train_processed['labels'].astype(int)
X = train_processed.drop(columns=['labels','id'])
test_ids = test_processed['id']
X_test_final = test_processed.drop(columns=['id'])

print("\nShapes finais:")
print("X:", X.shape, "y:", y.shape, "X_test:", X_test_final.shape)

# =========================================================
# 4. TREINAR MODELOS (Baseline + Cross-Validation + Tuning)
# =========================================================

def eval_cv(model, X, y, folds=5):
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    accs, precs, recs, f1s = [], [], [], []
    for tr, va in skf.split(X, y):
        X_tr, X_va = X.iloc[tr], X.iloc[va]
        y_tr, y_va = y.iloc[tr], y.iloc[va]
        model.fit(X_tr, y_tr)
        preds = model.predict(X_va)
        accs.append(accuracy_score(y_va, preds))
        precs.append(precision_score(y_va, preds))
        recs.append(recall_score(y_va, preds))
        f1s.append(f1_score(y_va, preds))
    return {
        'acc_mean': np.mean(accs),
        'prec_mean': np.mean(precs),
        'rec_mean': np.mean(recs),
        'f1_mean': np.mean(f1s)
    }

print("\n--- Baseline Random Forest ---")
rf_baseline = RandomForestClassifier(random_state=42, n_estimators=200)
rf_metrics = eval_cv(rf_baseline, X, y)
print(rf_metrics)

# Pequeno tuning (grade enxuta para rapidez)
param_grid_rf = {
    'n_estimators':[200,400],
    'max_depth':[None,15],
    'min_samples_split':[2,5],
    'min_samples_leaf':[1,2]
}

grid_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
grid_rf.fit(X, y)
print("\nMelhor RF:", grid_rf.best_params_, "Acurácia CV:", grid_rf.best_score_)

# Gradient Boosting (baseline + leve tuning)
print("\n--- Gradient Boosting (Baseline) ---")
gb_baseline = GradientBoostingClassifier(random_state=42)
gb_metrics = eval_cv(gb_baseline, X, y)
print(gb_metrics)

param_grid_gb = {
    'n_estimators':[150,200],
    'learning_rate':[0.05,0.1],
    'max_depth':[3],
    'min_samples_split':[2,4],
    'min_samples_leaf':[1,2]
}
grid_gb = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid=param_grid_gb,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
grid_gb.fit(X, y)
print("\nMelhor GB:", grid_gb.best_params_, "Acurácia CV:", grid_gb.best_score_)

# Escolher melhor modelo entre RF e GB
if grid_rf.best_score_ >= grid_gb.best_score_:
    best_model = grid_rf.best_estimator_
    best_name  = "RandomForest"
    best_score = grid_rf.best_score_
else:
    best_model = grid_gb.best_estimator_
    best_name  = "GradientBoosting"
    best_score = grid_gb.best_score_

print(f"\nModelo escolhido: {best_name} | Acurácia CV: {best_score:.4f}")

# =========================================================
# 5. AVALIAÇÃO HOLD-OUT (opcional para relatório)
# =========================================================
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
best_model.fit(X_tr, y_tr)
val_preds = best_model.predict(X_val)

print("\n--- Métricas (Hold-out 20%) ---")
print("Acurácia:", accuracy_score(y_val, val_preds))
print("Precisão:", precision_score(y_val, val_preds))
print("Recall:", recall_score(y_val, val_preds))
print("F1:", f1_score(y_val, val_preds))
print("\nClassification Report:\n", classification_report(y_val, val_preds))

# =========================================================
# 6. TREINAR MODELO FINAL E GERAR SUBMISSÃO
# =========================================================
best_model.fit(X, y)
test_pred = best_model.predict(X_test_final)

submission = pd.DataFrame({
    'id': test_ids,
    'labels': test_pred
})

submission.to_csv("submission.csv", index=False)
print("\nArquivo submission.csv gerado com sucesso.")

Shape train: (646, 33)
Shape test: (277, 32)

Primeiras linhas do train:


Unnamed: 0,id,age_first_funding_year,age_last_funding_year,age_first_milestone_year,age_last_milestone_year,relationships,funding_rounds,funding_total_usd,milestones,is_CA,is_NY,is_MA,is_TX,is_otherstate,category_code,is_software,is_web,is_mobile,is_enterprise,is_advertising,is_gamesvideo,is_ecommerce,is_biotech,is_consulting,is_othercategory,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,labels
0,719,10.42,13.09,8.98,12.72,4,3,4087500,3,1,0,0,0,0,enterprise,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1.0,0
1,429,3.79,3.79,,,21,1,45000000,0,0,1,0,0,0,advertising,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1.0,1
2,178,0.71,2.28,1.95,2.28,5,2,5200000,2,1,0,0,0,0,photo_video,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,1.0,0
3,197,3.0,5.0,9.62,10.39,16,2,14500000,2,0,0,1,0,0,advertising,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,2.0,1
4,444,0.66,5.88,6.21,8.61,29,5,70000000,4,1,0,0,0,0,web,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,2.8,1



Valores nulos por coluna (train):
age_first_funding_year       35
age_last_funding_year         9
age_first_milestone_year    138
age_last_milestone_year     111
dtype: int64

Shapes finais:
X: (646, 64) y: (646,) X_test: (277, 64)

--- Baseline Random Forest ---


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(train_df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(train_df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whic

{'acc_mean': np.float64(0.7863446630888491), 'prec_mean': np.float64(0.7909321422327965), 'rec_mean': np.float64(0.9113597246127367), 'f1_mean': np.float64(0.8465264844983361)}
Fitting 5 folds for each of 16 candidates, totalling 80 fits

Melhor RF: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200} Acurácia CV: 0.7971615980918307

--- Gradient Boosting (Baseline) ---
{'acc_mean': np.float64(0.7909958258795469), 'prec_mean': np.float64(0.8084934904462596), 'rec_mean': np.float64(0.8873780837636259), 'f1_mean': np.float64(0.8456548653007916)}
Fitting 5 folds for each of 16 candidates, totalling 80 fits

Melhor GB: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200} Acurácia CV: 0.7940727489564698

Modelo escolhido: RandomForest | Acurácia CV: 0.7972

--- Métricas (Hold-out 20%) ---
Acurácia: 0.7692307692307693
Precisão: 0.78125
Recall: 0.8928571428571429
F1: 0.8333333333333334

Classification Rep