In [2]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

In [6]:
# Load the raw dataset from a Parquet file (path is relative to the working directory).
df = pd.read_parquet(path="dados_clones.parquet")


### `Amostra para simular o DF do Brunão. populacao_df = seguro_202408`

In [7]:
# Draw a sample of the original DataFrame: test_size=0.9 sends about 90% of the
# rows to remaining_df, leaving roughly 10% in sample_df. The fixed seed keeps
# the split reproducible.
sample_df, remaining_df = train_test_split(
    df,
    test_size=0.9,
    random_state=42,
)

# Definição das features e target

In [9]:
# Input columns used by the model, in the order they are selected from the data.
features = ["Estatura(cm)", "Massa(em kilos)"] + [
    "Distância Ombro a ombro",
    "Tamanho do crânio",
    "Tamanho dos pés",
]

In [10]:
# Columns routed to the numeric branch of the preprocessor.
num_features = list(("Estatura(cm)", "Massa(em kilos)"))
# Columns routed to the categorical branch of the preprocessor.
cat_features = list((
    "Distância Ombro a ombro",
    "Tamanho do crânio",
    "Tamanho dos pés",
))

In [26]:
# Label column for the sample. Note the column name "Status " ends with a
# trailing space.
y_sample = sample_df["Status "]
# Model inputs for the sample: only the columns listed in `features`.
X_sample = sample_df[features]


# Pipeline

In [27]:
# Preprocessing stage of the pipeline.
# - "num": numeric columns are mean-imputed, then standardised.
# - "cat": categorical columns are imputed with the most frequent value,
#   then one-hot encoded.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler()),
        ]), num_features),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            # The encoder is fitted on a small sample and later applied to the
            # held-out rows. With the default handle_unknown="error", a category
            # that never appeared in the sample would raise at transform time.
            # "ignore" encodes such categories as an all-zero row instead.
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]), cat_features),
    ]
)

In [28]:
# Base estimator handed to GridSearchCV. The min_samples_leaf given here is only
# a starting value, because the search grid also varies that parameter.
model = RandomForestClassifier(min_samples_leaf=25, random_state=42)

In [29]:
# Hyper-parameter grid for the random forest: 5 * 4 * 2 * 5 = 200 combinations.
params = dict(
    min_samples_leaf=[10, 25, 50, 75, 100],
    n_estimators=[100, 200, 500, 1000],
    criterion=["gini", "entropy"],
    max_depth=[5, 8, 10, 12, 15],
)

In [30]:
# Exhaustive search over `params` with 3-fold cross-validation, scored by ROC AUC.
# n_jobs=-1 runs the fits in parallel on all processors; verbose=3 logs progress.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring="roc_auc",
    cv=3,
    verbose=3,
    n_jobs=-1,
)


In [31]:
# Full pipeline: preprocessing followed by the grid-searched model.
# NOTE(review): the preprocessor sits outside GridSearchCV, so it is fitted on
# the whole training sample before the CV folds are split. Imputation and scaling
# statistics therefore include each validation fold. Consider searching over a
# Pipeline(preprocessor, model) instead if unbiased CV scores matter.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("grid_search", grid_search),
])

In [32]:
# Bare expression: the notebook renders the (not yet fitted) pipeline as the cell output.
pipeline

# Treinamento

In [33]:
# Fit the preprocessor and run the grid search on the sample.
pipeline.fit(X=X_sample, y=y_sample)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


In [34]:
# Persist the fitted pipeline (preprocessor + grid search) to disk with pickle.
with open("pipeline_rf_gridsearch.pkl", mode="wb") as pkl_out:
    pickle.dump(pipeline, pkl_out)

# Fazendo a predição na base do Bruno

In [3]:
# Reload the pickled pipeline from disk.
# Caution: unpickling can execute arbitrary code, so only load pickle files from
# a trusted source. This path is the one the training cell writes to.
with open("pipeline_rf_gridsearch.pkl", mode="rb") as pkl_in:
    loaded_pipeline = pickle.load(pkl_in)

In [11]:
# Select the feature columns of the held-out rows, then run them through the
# already-fitted preprocessing step only (no prediction yet).
X_remaining = remaining_df[features]
X_transformed = loaded_pipeline.named_steps.preprocessor.transform(X_remaining)

In [12]:
# Predict labels for the held-out rows with the loaded grid-search step, which
# expects the already-preprocessed matrix.
y_pred = loaded_pipeline.named_steps.grid_search.predict(X_transformed)

In [13]:
# Copy of the held-out rows with the predictions appended as a new column;
# remaining_df itself is left untouched.
df_predictions = remaining_df.assign(Predicted_Status=y_pred)

In [14]:
# Print the actual and predicted labels side by side for the first rows.
print(df_predictions.head()[['Status ', 'Predicted_Status']])

# Persist the predictions as a Parquet file...
df_predictions.to_parquet("../data/predicoes.parquet")
# ...or, alternatively, as CSV:
#df_predictions.to_csv("../data/predicoes.csv", index=False)

       Status  Predicted_Status
683386    Apto             Apto
117228    Apto             Apto
107884    Apto             Apto
292389    Apto             Apto
102954    Apto             Apto


In [15]:
# Bare expression: the notebook renders the predictions DataFrame as the cell output.
df_predictions

Unnamed: 0,p2o_master_id,Massa(em kilos),General Jedi encarregado,Estatura(cm),Distância Ombro a ombro,Tamanho do crânio,Tamanho dos pés,Tempo de existência(em meses),Status,Predicted_Status
683386,6279538,83.67,Shaak Ti,181.06,Tipo 3,Tipo 5,Tipo 5,56.6,Apto,Apto
117228,7730196,83.54,Shaak Ti,180.55,Tipo 3,Tipo 5,Tipo 2,16.9,Apto,Apto
107884,7781053,84.05,Aayla Secura,180.45,Tipo 4,Tipo 2,Tipo 2,11.2,Apto,Apto
292389,5656129,84.13,Obi-Wan Kenobi,180.35,Tipo 3,Tipo 1,Tipo 4,13.8,Apto,Apto
102954,7723658,83.80,Mace Windu,180.20,Tipo 2,Tipo 3,Tipo 2,8.1,Apto,Apto
...,...,...,...,...,...,...,...,...,...,...
475296,9049723,83.23,Mace Windu,180.20,Tipo 1,Tipo 2,Tipo 2,25.9,Apto,Apto
425863,5716357,83.52,Aayla Secura,180.06,Tipo 4,Tipo 1,Tipo 2,3.7,Apto,Apto
581456,7290923,83.05,Yoda,180.08,Tipo 1,Tipo 5,Tipo 5,51.5,Apto,Apto
725967,9620657,83.36,Yoda,180.60,Tipo 2,Tipo 3,Tipo 3,48.5,Apto,Apto
