In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

In [3]:
# Load the water-potability table and discard every row containing a missing value.
data = (
    pd.read_csv('../data/water_potability_oversampled.csv')
    .dropna()
)

# 'Potability' is the target column; every other column is used as a feature.
y = data['Potability']
X = data.drop(columns='Potability')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Baseline: a linear-kernel SVM fitted directly on the training split
# (fit() returns the estimator itself, so construction and fitting can be chained).
svm_classifier = SVC(kernel='linear').fit(X_train, y_train)

# Predict the held-out rows once and derive every metric from the same predictions.
predictions = svm_classifier.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
precision, recall, f1 = (
    metric(y_test, predictions)
    for metric in (precision_score, recall_score, f1_score)
)

# Report accuracy as a percentage and the remaining scores as fractions.
print(f'Acurácia do modelo: {accuracy * 100:.2f}%')

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')

# Per-class breakdown of precision / recall / F1 / support.
print('\nClassification Report:\n', classification_report(y_test, predictions))

Acurácia do modelo: 46.25%
Precision: 0.49
Recall: 0.66
F1-score: 0.56

Classification Report:
               precision    recall  f1-score   support

           0       0.41      0.25      0.31       232
           1       0.49      0.66      0.56       248

    accuracy                           0.46       480
   macro avg       0.45      0.46      0.43       480
weighted avg       0.45      0.46      0.44       480



In [5]:
# Balance the classes of the training split with SMOTE oversampling.
# Only the training data is resampled; the test split is left untouched.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Tune C and the kernel with a 5-fold cross-validated grid search scored on accuracy.
# NOTE(review): SMOTE is applied before the CV folds are built, so validation folds
# can contain synthetic samples derived from rows in the training folds; the CV
# score may therefore be optimistic. Consider moving SMOTE into an imblearn Pipeline.
param_grid = {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly']}
grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_resampled, y_resampled)

# Show the best parameter combination found by the grid search.
print("Melhores parâmetros encontrados:", grid_search.best_params_)

# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refitted on the whole resampled training set; fitting it again would only
# repeat an expensive SVC fit and yield the same model.
best_svm_classifier = grid_search.best_estimator_

# Predict on the held-out test split.
predictions = best_svm_classifier.predict(X_test)

# Compute every metric from the tuned model's predictions. Accuracy must be
# recomputed here: otherwise the value printed below would be the stale one
# left over from the baseline linear-SVM cell.
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

print(f'Acurácia do modelo: {accuracy * 100:.2f}%')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')

# Show the per-class classification report.
print('\nClassification Report:\n', classification_report(y_test, predictions))