In [None]:
# 1. Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import joblib
import json

In [6]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                           f1_score, confusion_matrix, classification_report,
                           roc_auc_score, roc_curve)
from sklearn.feature_selection import SelectKBest, f_classif

In [None]:
# Visualization config
plt.style.use('ggplot')
sns.set_palette("husl")
%matplotlib inline

In [None]:
# 2. Load data
# Both CSV files are read relative to the notebook's working directory.
try:
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
except FileNotFoundError:
    print("Arquivos não encontrados. Verifique os nomes dos arquivos.")
    # Re-raise so execution stops here; otherwise train_df / test_df stay
    # undefined and every later cell fails with a misleading NameError.
    raise
else:
    # Only reported when both reads succeeded.
    print("Dados carregados com sucesso!")

In [None]:
# 3. Data analysis
# Count of rows per distinct value of the 'target' column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=train_df, x='target', ax=ax)
ax.set_title('Distribuição da Variável Target')
ax.set_xlabel('Target')
ax.set_ylabel('Contagem')
plt.show()

In [None]:
# Pairwise correlation between all numeric columns of the training frame.
numeric_frame = train_df.select_dtypes(include=[np.number])
numeric_cols = numeric_frame.columns
correlation_matrix = numeric_frame.corr()

# Annotated heatmap; the diverging colormap is centred on zero correlation.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Matriz de Correlação')
plt.show()

In [None]:
# Up to four numeric columns (the 'target' column excluded), one panel each
# on a 2x2 grid, with values grouped by target.
numeric_features = [name for name in numeric_cols if name != 'target'][:4]
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

# axes.flat walks the grid row by row, so panels follow the feature order.
for panel, feature in zip(axes.flat, numeric_features):
    sns.boxplot(data=train_df, x='target', y=feature, ax=panel)
    panel.set_title(f'{feature} vs Target')

fig.tight_layout()
plt.show()

In [None]:
# Per-feature boxplots (not split by target) laid out on a 2x2 grid.
# The grid only has four slots, so cap the selection here rather than relying
# on an earlier cell having trimmed `numeric_features`: the name is rebound to
# the full numeric column index in the preprocessing step, and re-running this
# cell afterwards would otherwise request a fifth subplot and raise ValueError.
plt.figure(figsize=(15, 8))
for i, col in enumerate(list(numeric_features)[:4], 1):
    plt.subplot(2, 2, i)
    sns.boxplot(y=train_df[col])
    plt.title(f'Boxplot de {col}')

plt.tight_layout()
plt.show()

In [None]:
# 4. Preprocessing
# Separate the predictors from the label column.
X = train_df.drop(columns='target')
y = train_df['target']

# Split predictor columns by dtype. np.number matches every numeric width
# (the same selector used for the correlation matrix), and 'category' is
# accepted alongside 'object' so categorical-typed columns are not left out
# of both lists.
numeric_features = X.select_dtypes(include=[np.number]).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

print(f"Variáveis numéricas: {list(numeric_features)}")
print(f"Variáveis categóricas: {list(categorical_features)}")