<a href="https://colab.research.google.com/github/mariabandeira/AdministracaoPublica/blob/main/CadUnico/KNN/TrainningPB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, accuracy_score
from imblearn.under_sampling import RandomUnderSampler

In [11]:
# Load the pre-treated CadUnico (PB) table; the first CSV column becomes the index.
csv_path = '/content/treated_cadUnicoPB.csv'
treated_data_PB = pd.read_csv(csv_path, index_col=0)

In [12]:
# Summarize the loaded frame: row count, column names, non-null counts and dtypes.
treated_data_PB.info()

<class 'pandas.core.frame.DataFrame'>
Index: 297045 entries, 53308 to 3954431
Data columns (total 21 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   cd_ibge                        297045 non-null  int64  
 1   estrato                        297045 non-null  int64  
 2   classf                         297045 non-null  int64  
 3   id_familia                     297045 non-null  float64
 4   vlr_renda_media_fam            297045 non-null  float64
 5   cod_local_domic_fam            297045 non-null  float64
 6   qtd_comodos_domic_fam          297045 non-null  float64
 7   qtd_comodos_dormitorio_fam     297045 non-null  float64
 8   cod_material_piso_fam          297045 non-null  float64
 9   cod_material_domic_fam         297045 non-null  float64
 10  cod_agua_canalizada_fam        297045 non-null  float64
 11  cod_abaste_agua_domic_fam      297045 non-null  float64
 12  cod_escoa_sanitario_domic_fam 

### Treinando o modelo KNN

In [13]:
# Class balance of the target column (marc_pbf), expressed as percentages.
target_share = treated_data_PB['marc_pbf'].value_counts(normalize=True)
target_distribution = target_share.mul(100)

# Rounded to whole percentages for display.
print(target_distribution.round())

marc_pbf
1    65.0
0    35.0
Name: proportion, dtype: float64


In [14]:
# Target vector and feature matrix: every column other than marc_pbf is kept
# as a feature.
# NOTE(review): this includes columns that look like identifiers/codes
# (e.g. id_familia, cd_ibge) - confirm they are meant to feed a
# distance-based model.
y = treated_data_PB['marc_pbf']
X = treated_data_PB.drop(columns='marc_pbf')

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified on y - consider stratify=y so
# both partitions keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Random under-sampling of the majority class, applied to the training
# partition only; the test partition is left untouched.
rus = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_resampled, y_X_resampled = rus.fit_resample(X_train, y_train)

# Per-class row counts after resampling.
print(y_X_resampled.value_counts())

marc_pbf
0    73668
1    73668
Name: count, dtype: int64


In [15]:
# Standardize the features. The scaler learns its statistics from the
# balanced training data only and is then applied to both partitions, so no
# test-set information reaches the fit.
scaler = StandardScaler().fit(X_resampled)
X_train_scaled = scaler.transform(X_resampled)
X_test_scaled = scaler.transform(X_test)

# Peek at the first five standardized training rows.
print("Primeiros dados escalados (padronizados):")
print(X_train_scaled[:5])

Primeiros dados escalados (padronizados):
[[-0.89036632  0.60625059  0.82916683  0.20666513  0.86519823 -0.47555747
   0.97840329  1.75644702  1.45712256 -0.19528418 -0.43965204 -0.48510741
  -0.86916817 -0.45368299 -0.20480803 -0.87153126  0.05269602 -0.29181643
   0.23360459  1.28839774]
 [ 1.55552458 -1.64948294  0.82916683 -1.10570805  2.10519503  2.10279527
   0.97840329 -1.19226575 -0.64144158 -0.19528418 -0.43965204  0.64990157
   0.09462198 -0.45368299 -0.20480803 -0.87153126  0.05269602 -0.29181643
  -1.20174994  1.28807481]
 [-0.57593311  0.60625059  0.82916683 -0.49117501  0.66569379 -0.47555747
   0.97840329  0.28209064 -0.64144158 -0.19528418 -0.43965204  1.78491056
   1.05841214 -0.45368299 -0.20480803 -0.87153126  0.05269602  3.16393683
  -0.48407267  1.28807481]
 [-0.03684062  0.60625059 -1.82748477  0.74125223 -0.09549236 -0.47555747
   0.19354987  0.28209064 -0.64144158 -0.19528418 -0.43965204 -0.48510741
   1.05841214 -0.45368299 -0.20480803  1.18886361  0.05269602 -

In [None]:
# Candidate neighbourhood sizes: k = 1 .. 30.
k_values = list(range(1, 31))

param_grid = {'n_neighbors': k_values}

knn = KNeighborsClassifier()

# Put standardization inside the cross-validated estimator. Searching over
# data that was scaled once on the whole training set lets every validation
# fold influence the scaler's mean/std (preprocessing leakage). With a
# Pipeline the scaler is re-fitted on the training part of each fold only.
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', knn),
])

# Pipeline steps are addressed as "<step>__<param>" in the search grid.
pipeline_grid = {f'knn__{name}': values for name, values in param_grid.items()}

# 5-fold CV over 30 values of k means 150 KNN fits; n_jobs=-1 runs them in
# parallel on all available cores without changing the result.
grid_search = GridSearchCV(
    knn_pipeline, pipeline_grid, cv=5, scoring='accuracy', n_jobs=-1
)

# Fit on the balanced but UNSCALED training data: scaling now happens inside
# the pipeline, per fold.
grid_search.fit(X_resampled, y_X_resampled)

best_k = grid_search.best_params_['knn__n_neighbors']
print("Melhor valor de k encontrado:", best_k)

In [None]:
# Train the final KNN classifier on the balanced, standardized training set
# using the k selected by the grid search.
final_model = KNeighborsClassifier(n_neighbors=best_k).fit(X_train_scaled, y_X_resampled)
final_model  # fit() returns the estimator itself; keep it as the cell's displayed value

In [None]:
# Predict labels for the held-out (standardized) test set.
y_pred = final_model.predict(X_test_scaled)

# Print the per-class precision / recall / F1 report against the true labels.
report = classification_report(y_test, y_pred)
print(report)

### Selecionando features com SelectKBest

In [None]:
# Univariate feature selection: keep the 5 features with the highest ANOVA
# F-score (f_classif), scored on the balanced training data.
selector = SelectKBest(score_func=f_classif, k=5)
selector.fit(X_resampled, y_X_resampled)

# Reduce both partitions to the selected columns.
X_train_selected = selector.transform(X_resampled)
X_test_selected = selector.transform(X_test)

# Map the boolean support mask back to column names for reporting.
support_mask = selector.get_support()
selected_columns = X_train.columns[support_mask]
print("Características selecionadas:", selected_columns)

Características selecionadas: Index(['vlr_renda_media_fam', 'cod_material_piso_fam',
       'cod_agua_canalizada_fam', 'cod_calcamento_domic_fam', 'qtde_pessoas'],
      dtype='object')
