In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

In [3]:
# Load your dataset
data = pd.read_csv('datasets/fraudTrain.csv')

In [4]:
# Select relevant features and labels
X = data[['amt', 'city_pop', 'lat', 'long', 'merch_lat', 'merch_long']]
y = data['is_fraud']

In [5]:
print(X.head())

      amt  city_pop      lat      long  merch_lat  merch_long
0    4.97      3495  36.0788  -81.1781  36.011293  -82.048315
1  107.23       149  48.8878 -118.2105  49.159047 -118.186462
2  220.11      4154  42.1808 -112.2620  43.150704 -112.154481
3   45.00      1939  46.2306 -112.1138  47.034331 -112.561071
4   41.96        99  38.4207  -79.4629  38.674999  -78.632459


In [8]:
# Split the data into training and testing sets
X_train, X_raw_test, y_train, y_raw_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features to have zero mean and unit variance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_raw_test = scaler.transform(X_raw_test)

In [9]:
# Create a simple feed-forward neural network model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=16, activation='relu', input_dim=X_train.shape[1]),
    tf.keras.layers.Dense(units=8, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])

In [10]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [14]:
# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_raw_test, y_raw_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1e492a84d10>

In [15]:
# Evaluate the model
y_pred = model.predict(X_raw_test)
y_pred = (y_pred > 0.5)  # Convert probabilities to binary predictions - if prob > 0.5 is fraud and vice versa



In [16]:
# Calculate confusion matrix and classification report
confusion = confusion_matrix(y_raw_test, y_pred)
print("Confusion Matrix:")
print(confusion)

Confusion Matrix:
[[257594    221]
 [  1227    293]]


In [17]:
report = classification_report(y_raw_test, y_pred)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       0.57      0.19      0.29      1520

    accuracy                           0.99    259335
   macro avg       0.78      0.60      0.64    259335
weighted avg       0.99      0.99      0.99    259335



## COMPROBACION DEL MODELO 

In [41]:
# Load the test data
test_data = pd.read_csv('datasets/fraudTest.csv')

In [42]:
# Preprocess the test data (select features and standardize)
X_test = test_data[['amt', 'city_pop', 'lat', 'long', 'merch_lat', 'merch_long']]
y_test = test_data['is_fraud']
scaler = StandardScaler()
X_test = scaler.fit_transform(X_test)

In [43]:
# Use the trained model to make predictions on the test data
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5)  # Apply the threshold



In [44]:
# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(confusion)

Accuracy: 0.9959997768656461
Precision: 0.47393048128342247
Recall: 0.3305361305361305
F1 Score: 0.38945344685525957
Confusion Matrix:
[[552787    787]
 [  1436    709]]


In [45]:
# se crea una nueva columna donde se especifica si es fraude o no
test_data['predicted_fraud'] = (model.predict(X_test) > 0.5)



In [46]:
fraudulent_rows = test_data[test_data['predicted_fraud'] == 1]

In [6]:
len(fraudulent_rows)

NameError: name 'fraudulent_rows' is not defined

In [47]:
print(fraudulent_rows.head())


      Unnamed: 0 trans_date_trans_time            cc_num  \
1044        1044   2020-06-21 18:08:47  4302480582202074   
1695        1695   2020-06-21 22:09:41     4427805710168   
1784        1784   2020-06-21 22:38:55  4005676619255478   
1956        1956   2020-06-21 23:35:27  4005676619255478   
1968        1968   2020-06-21 23:40:26  4005676619255478   

                                 merchant      category      amt     first  \
1044     fraud_Towne, Greenholt and Koepp  shopping_net   981.92     David   
1695               fraud_Lebsack and Sons      misc_net   890.22  Michelle   
1784  fraud_Heathcote, Yost and Kertzmann  shopping_net  1077.69   William   
1956        fraud_Reichert, Rowe and Mraz  shopping_net   931.82   William   
1968    fraud_Kihn, Abernathy and Douglas  shopping_net   983.00   William   

           last gender                         street  ...      long city_pop  \
1044  Rodriguez      M               821 Solis Points  ...  -86.2492   128715   
1695  Ro