## Tecnologias Utilizadas

In [1]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import pandas as pd
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix

%matplotlib inline
warnings.filterwarnings('ignore')

Depois de analisar e tratar todos os dados, vamos passar para as etapas de modelação e avaliação. Assim, é necessário voltar a ler os dados do **dataset** tratado.

In [2]:
training_inputs = pd.read_csv('CreditCardTransactions/training_dataset.csv')
training_classes = pd.read_csv('CreditCardTransactions/training_classes.csv')
testing_inputs = pd.read_csv('CreditCardTransactions/testing_dataset.csv')
testing_classes = pd.read_csv('CreditCardTransactions/testing_classes.csv')

## Modelação

Em primeiro lugar, desenvolvemos um modelo de classificação baseado numa **Árvore de Decisão**. Tendo em conta que a variável-alvo é categórica, pelo que este é um problema de classificação e o objetivo do modelo é classificar registos, tomando decisões a partir dos atributos de cada cliente, este algoritmo considera-se adequado para o efeito.

Em segundo lugar, implementamos uma **Rede Neuronal**, que é um modelo de *deep learning* com capacidade para aprender a partir dos dados, sendo capaz de identificar padrões complexos e de fazer previsões precisas.

Para além disto, construímos um modelo de classificação assente numa ***Support Vector Machine*** (SVM). Esta abordagem tenta encontrar o hiperplano que melhor separa os registos das diferentes classes, maximizando a margem entre esse hiperplano e os registos das duas classes.

Por último, apresentamos um ***Random Forest Classifier***, que, sendo um modelo de *ensembling*, combina várias árvores de decisão para classificar melhor os dados.

Os modelos devem ser treinados com um conjunto de dados de treino e testados com um conjunto de dados de teste. O conjunto de teste vai corresponder a 25% do *dataset*.

Para a construção de alguns modelos, como a rede neuronal, o *k-nearest neighbors* e o *support vector machine*, é necessário normalizar os dados de maneira a que tenham média 0 e variância 1, aplicando este processo ao conjunto de treino e ao conjunto de teste.

In [3]:
scaler = StandardScaler()

training_inputs_scaled = scaler.fit_transform(training_inputs)

testing_inputs_scaled = scaler.transform(testing_inputs)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- unique_merchants_count


In [None]:
smote = SMOTE(random_state=42)
training_inputs, training_classes = smote.fit_resample(training_inputs, training_classes)

scaler = StandardScaler()

training_inputs_scaled = scaler.fit_transform(training_inputs)

testing_inputs_scaled = scaler.transform(testing_inputs)

In [None]:
smote = SMOTE(random_state=42)
cross_validation = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Definir o modelo Decision Tree e o Pipeline
decision_tree = DecisionTreeClassifier(random_state=42)

pipeline = ImbPipeline([
    ('smote', smote),                # Aplicar SMOTE
    ('model', decision_tree)         # Modelo a ser treinado
])

# Definir os hiperparâmetros para o modelo dentro do pipeline
decision_tree_parameter_grid = {
    'model__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],  # Prefixo "model__" por causa do pipeline
    'model__max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}

# Configurar o GridSearchCV
decision_tree_grid_search = GridSearchCV(
    pipeline,
    param_grid=decision_tree_parameter_grid,
    cv=cross_validation,
    n_jobs=-1,
    scoring='auc'  # Alterar para outra métrica, se necessário
)

# Ajustar o GridSearchCV
decision_tree_grid_search.fit(training_inputs, training_classes)

# Exibir os resultados
print('Decision Tree Best Score: {:.2f}%'.format(decision_tree_grid_search.best_score_ * 100))
print('Decision Tree Best Parameters:', decision_tree_grid_search.best_params_)


In [None]:
best_decision_tree_params = decision_tree_grid_search.best_params_
best_decision_tree = DecisionTreeClassifier(max_depth = 10, max_features = 10, random_state = 42)
best_decision_tree.fit(training_inputs, training_classes)
best_decision_tree.score(testing_inputs, testing_classes)

In [None]:
neural_network = MLPClassifier(max_iter = 1000, random_state = 13)


neural_network_parameter_grid = {
    'hidden_layer_sizes': [(5, ), (6, ), (7, ), (5, 4), (5, 5), (6, 4), (6, 5), (6, 6)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
}

neural_network_grid_search = GridSearchCV(neural_network, neural_network_parameter_grid, n_jobs = -1)

neural_network_grid_search.fit(training_inputs_scaled, training_classes)

print('Neural Network Best Score: {:.2f}%'.format(neural_network_grid_search.best_score_ * 100))
print('Neural Network Best Parameters:', neural_network_grid_search.best_params_)

In [None]:
best_neural_network_params = neural_network_grid_search.best_params_
best_neural_network = MLPClassifier(hidden_layer_sizes = best_neural_network_params['hidden_layer_sizes'], activation = best_neural_network_params['activation'], solver = best_neural_network_params['solver'], random_state = 13)
best_neural_network.fit(training_inputs_scaled, training_classes)
best_neural_network.score(testing_inputs_scaled, testing_classes)

In [None]:
svm = SVC(random_state = 13)

svm_parameter_grid = {
    'C': [0.9, 1.0, 1.1],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

svm_grid_search = GridSearchCV(svm, svm_parameter_grid, cv = cross_validation, n_jobs = -1)

svm_grid_search.fit(training_inputs_scaled, training_classes)

print("Support Vector Machine Best Score: {:.2f}%".format(svm_grid_search.best_score_ * 100))
print("Support Vector Machine Best Parameters:", svm_grid_search.best_params_)

In [None]:
best_svm_params = svm_grid_search.best_params_
best_svm = SVC(C = best_svm_params['C'], kernel = best_svm_params['kernel'], gamma = best_svm_params['gamma'], random_state = 13)
best_svm.fit(training_inputs_scaled, training_classes)
best_svm.score(testing_inputs_scaled, testing_classes)

In [None]:
random_forest = RandomForestClassifier(random_state = 13)

random_forest_parameter_grid = {
    'n_estimators': [200, 300, 400],
    'max_depth': [30, 40],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4],
    'max_features': ['log2', 'sqrt']
}

random_forest_grid_search = GridSearchCV(random_forest, random_forest_parameter_grid, cv = cross_validation, n_jobs = -1)

random_forest_grid_search.fit(training_inputs, training_classes)

print('Random Forest Best Score: {:.2f}%'.format(random_forest_grid_search.best_score_ * 100))
print('Random Forest Best Parameters:', random_forest_grid_search.best_params_)

In [None]:
best_random_forest_params = random_forest_grid_search.best_params_
best_random_forest = RandomForestClassifier(n_estimators = best_random_forest_params['n_estimators'], max_depth = best_random_forest_params['max_depth'], min_samples_split = best_random_forest_params['min_samples_split'], min_samples_leaf = best_random_forest_params['min_samples_leaf'], max_features = best_random_forest_params['max_features'])
best_random_forest.fit(training_inputs, training_classes)
best_random_forest.score(testing_inputs, testing_classes)

In [None]:
import xgboost as xgb
# Inicializar o XGBoost Classifier
xgb_classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    random_state=13,
    use_label_encoder=False,
    eval_metric='logloss'
)

# Definir o espaço de hiperparâmetros para XGBoost
xgb_parameter_grid = {
    'n_estimators': [200, 300, 400],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'gamma': [0, 1, 5],
}

# Configurar o GridSearchCV para XGBoost
xgb_grid_search = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=xgb_parameter_grid,
    cv=cross_validation,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Executar o Grid Search
xgb_grid_search.fit(training_inputs_scaled, training_classes)

# Exibir os melhores parâmetros e a melhor pontuação
print('XGBoost Best Score: {:.2f}%'.format(xgb_grid_search.best_score_ * 100))
print('XGBoost Best Parameters:', xgb_grid_search.best_params_)

# Utilizar o melhor modelo para prever no conjunto de teste
best_xgb_model = xgb_grid_search.best_estimator_
y_pred = best_xgb_model.predict(testing_inputs_scaled)

# Avaliar a acurácia no conjunto de teste
test_accuracy = accuracy_score(testing_classes, y_pred)
print('XGBoost Test Accuracy: {:.2f}%'.format(test_accuracy * 100))


In [None]:
best_decision_tree_pred = best_decision_tree.predict(testing_inputs)

print(classification_report(testing_classes, best_decision_tree_pred))

best_decision_tree_confusion_matrix = confusion_matrix(testing_classes, best_decision_tree_pred)
ConfusionMatrixDisplay(best_decision_tree_confusion_matrix).plot()

In [None]:
best_neural_network_pred = best_neural_network.predict(testing_inputs_scaled)

print(classification_report(testing_classes, best_neural_network_pred))

best_neural_network_confusion_matrix = confusion_matrix(testing_classes, best_neural_network_pred)
ConfusionMatrixDisplay(best_neural_network_confusion_matrix).plot()

In [None]:
best_svm_pred = best_svm.predict(testing_inputs_scaled)

print(classification_report(testing_classes, best_svm_pred))

best_svm_confusion_matrix = confusion_matrix(testing_classes, best_svm_pred)
ConfusionMatrixDisplay(best_svm_confusion_matrix).plot()

In [None]:
best_random_forest_pred = best_random_forest.predict(testing_inputs)

print(classification_report(testing_classes, best_random_forest_pred))

best_random_forest_confusion_matrix = confusion_matrix(testing_classes, best_random_forest_pred)
ConfusionMatrixDisplay(best_random_forest_confusion_matrix).plot()

In [None]:
xgb_confusion_matrix = confusion_matrix(testing_classes, y_pred)

# Exibir a Matriz de Confusão
disp = ConfusionMatrixDisplay(confusion_matrix=xgb_confusion_matrix, display_labels=best_xgb_model.classes_).plot()
