<a href="https://colab.research.google.com/github/mariabandeira/AdministracaoPublica/blob/main/CadUnico/SVM/TrainningPB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Treinando para dados da PB

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the PB CadUnico extract (cadunico_pb_cleaned.csv) straight from the
# project's GitHub repository.
CADUNICO_PB_URL = (
    'https://raw.githubusercontent.com/mariabandeira/AdministracaoPublica/'
    'refs/heads/main/CadUnico/BasesPB/cadunico_pb_cleaned.csv'
)
cadUnicoPB = pd.read_csv(CADUNICO_PB_URL, sep=',', encoding='utf-8')

In [None]:
# Keep only rows without missing values, then drop exact duplicate rows.
cadUnicoPB = cadUnicoPB.dropna().drop_duplicates()

### Treinando modelo SVM

In [None]:
# Share of each class of the target column (marc_pbf), in percent.
target_distribution = cadUnicoPB['marc_pbf'].value_counts(normalize=True).mul(100)

print(target_distribution.round())

marc_pbf
1    65.0
0    35.0
Name: proportion, dtype: float64


In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Split the features (X) from the target column (y).
X = cadUnicoPB.drop(columns='marc_pbf')
y = cadUnicoPB['marc_pbf']

# Hold out 30% of the rows for testing. The target is imbalanced (about
# 65% / 35%), so the split is stratified on y to give the train and test
# sets the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Oversample the minority class with SMOTE on the training split only; the
# test split keeps the original class distribution.
# NOTE(review): SMOTE runs here on features that have not been standardised
# yet (StandardScaler is applied in a later cell). SMOTE's neighbour search is
# distance-based, so large-valued columns may dominate it -- consider scaling
# before resampling. Columns named cod_* look like categorical codes; confirm
# whether interpolating them is acceptable.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print(y_train_balanced.value_counts())

marc_pbf
1    130058
0    130058
Name: count, dtype: int64


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the standardisation statistics from the balanced training features,
# then apply the same transformation to both the training and test features.
scaler = StandardScaler().fit(X_train_balanced)
X_train_scaled = scaler.transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)

# Preview the first five standardised training rows.
print("Primeiros dados escalados (padronizados):")
print(X_train_scaled[:5])


Primeiros dados escalados (padronizados):
[[-0.61648453 -0.57338093  0.61535985  0.884187   -0.52550414 -0.71789997
  -0.47555839  1.84922055  1.83091414 -0.66749388 -0.19983319 -0.44296607
  -0.48624828  1.10209049 -0.45432654 -0.21784688  1.23127202  0.05356081
   3.32044088 -1.17731135  1.69447035]
 [-1.0974884  -1.53203455 -1.62506541  0.884187   -1.10826096 -0.70134691
  -0.47555839  0.21565604  0.29883675 -0.66749388 -0.19983319  2.38621136
   1.88250198 -0.88918577 -0.45432654 -0.21784688 -0.90166122  0.05356081
  -0.29100429  1.01143881 -0.59637768]
 [-0.2830315  -1.66999661  0.61535985  0.884187   -0.28499568 -0.69141507
  -0.47555839 -1.41790847 -1.23324064 -0.66749388 -0.19983319  2.38621136
   3.06687711 -0.88918577 -0.45432654  3.19727694  1.23127202  0.05356081
  -0.29100429  0.28185542 -0.59637768]
 [-1.51184416 -1.39681345 -1.62506541  0.884187   -1.5277283  -0.74107426
   2.22492746  1.03243829  0.29883675  1.51604203 -0.19983319 -0.44296607
  -0.48624828  0.10645236  

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Linear-kernel SVM trained on all standardised features.
# `gamma` is intentionally not passed: it is the kernel coefficient for the
# 'rbf', 'poly' and 'sigmoid' kernels and has no effect with kernel='linear'.
# NOTE(review): scikit-learn documents that SVC fit time grows at least
# quadratically with the number of samples and suggests LinearSVC for large
# datasets; the balanced training set here has ~260k rows, so expect a long fit.
svm_model = SVC(kernel='linear', C=1.0, random_state=42)

# Fit on the balanced, standardised training set (the fitted estimator is the
# cell's displayed output).
svm_model.fit(X_train_scaled, y_train_balanced)

In [None]:
# Predict the target for the standardised test set with the fitted SVM.
y_pred = svm_model.predict(X_test_scaled)

In [None]:
# Hold-out metrics for the model trained on all features: overall accuracy,
# confusion matrix (rows = true class, columns = predicted class) and the
# per-class precision / recall / F1 report.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, target_names=['Classe 0', 'Classe 1'])

print(f"Acurácia: {accuracy:.2f}")
print("Matriz de Confusão:", conf_matrix, sep="\n")
print("Relatório de Classificação:", class_report, sep="\n")

Acurácia: 0.90
Matriz de Confusão:
[[24260  5986]
 [ 2486 53396]]
Relatório de Classificação:
              precision    recall  f1-score   support

    Classe 0       0.91      0.80      0.85     30246
    Classe 1       0.90      0.96      0.93     55882

    accuracy                           0.90     86128
   macro avg       0.90      0.88      0.89     86128
weighted avg       0.90      0.90      0.90     86128



### Extraindo as melhores features

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Number of top-scoring features to keep (adjust as needed).
k = 5
# Univariate selector that scores each feature with f_classif (ANOVA F-value)
# and keeps the k best.
selector = SelectKBest(f_classif, k=k)

In [None]:
# Fit the selector on the training data only, then reduce both splits to the
# columns it keeps.
selector.fit(X_train_scaled, y_train_balanced)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

# Column positions of the selected features.
selected_features = selector.get_support(indices=True)
print(f"As {k} melhores features (índices):", selected_features)

# Map those positions back to the column names of X.
feature_names_array = np.array(X.columns)
best_feature_names = feature_names_array[selected_features]
print(f"As {k} melhores features (nomes):", best_feature_names)

As 5 melhores features (índices): [ 5  9 13 16 19]
As 5 melhores features (nomes): ['vlr_renda_media_fam' 'cod_material_piso_fam'
 'cod_escoa_sanitario_domic_fam' 'cod_calcamento_domic_fam' 'qtde_pessoas']


In [None]:
# Retrain an SVM using only the k selected features.
# NOTE(review): this model uses kernel='rbf', whereas the full-feature model
# trained earlier used kernel='linear', so the two evaluations differ in both
# feature set and kernel. This cell also rebinds `svm_model`, replacing the
# previously fitted model.
svm_model = SVC(C=1.0, kernel='rbf', gamma='scale', random_state=42)
svm_model.fit(X_train_selected, y_train_balanced)

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Predict on the reduced (k-feature) test set.
y_pred_selected = svm_model.predict(X_test_selected)

# Hold-out metrics for the reduced-feature model: overall accuracy, confusion
# matrix and the per-class precision / recall / F1 report.
accuracy = accuracy_score(y_test, y_pred_selected)
conf_matrix = confusion_matrix(y_test, y_pred_selected)
class_report = classification_report(y_test, y_pred_selected, target_names=['Classe 0', 'Classe 1'])

print(f"Acurácia com as {k} melhores features: {accuracy:.2f}")
print("Matriz de Confusão:", conf_matrix, sep="\n")
print("Relatório de Classificação:", class_report, sep="\n")

Acurácia com as 5 melhores features: 0.90
Matriz de Confusão:
[[23680  6566]
 [ 1695 54187]]
Relatório de Classificação:
              precision    recall  f1-score   support

    Classe 0       0.93      0.78      0.85     30246
    Classe 1       0.89      0.97      0.93     55882

    accuracy                           0.90     86128
   macro avg       0.91      0.88      0.89     86128
weighted avg       0.91      0.90      0.90     86128

