In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

## Creating and training model

In [40]:
# Load your dataset
data = pd.read_csv('../../../datasets/fraudTrain.csv')
print(data.shape)
print(data.head())

(1296675, 23)
   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43039 Riley Greens Suit

In [4]:
# Select relevant features and labels

columns_out = ['trans_date_trans_time', 'cc_num', 'merchant', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'city', 'job', 'dob', 'trans_num', 'unix_time']
data = data.drop(columns=columns_out)

X = data[['category', 'amt', 'city_pop', 'lat', 'long', 'merch_lat', 'merch_long']]
X = pd.get_dummies(data, columns=['category'], prefix=['cat']) # one-hot encoding para 'category'
y = data['is_fraud']

In [5]:
print(len(data))
print(len(X))

1296675
1296675


In [42]:
#print(X.head())
print(X.shape)
print(X.head())

(1296675, 22)
   Unnamed: 0     amt      lat      long  city_pop  merch_lat  merch_long  \
0           0    4.97  36.0788  -81.1781      3495  36.011293  -82.048315   
1           1  107.23  48.8878 -118.2105       149  49.159047 -118.186462   
2           2  220.11  42.1808 -112.2620      4154  43.150704 -112.154481   
3           3   45.00  46.2306 -112.1138      1939  47.034331 -112.561071   
4           4   41.96  38.4207  -79.4629        99  38.674999  -78.632459   

   is_fraud  cat_entertainment  cat_food_dining  ...  cat_grocery_pos  \
0         0              False            False  ...            False   
1         0              False            False  ...             True   
2         0               True            False  ...            False   
3         0              False            False  ...            False   
4         0              False            False  ...            False   

   cat_health_fitness  cat_home  cat_kids_pets  cat_misc_net  cat_misc_pos  \
0     

In [7]:
# Split the data into training and testing sets
X_train, X_raw_test, y_train, y_raw_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features to have zero mean and unit variance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_raw_test = scaler.transform(X_raw_test)

In [8]:
# Create a simple feed-forward neural network model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=16, activation='relu', input_dim=X_train.shape[1]),
    tf.keras.layers.Dense(units=8, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])

In [9]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 16)                368       
                                                                 
 dense_1 (Dense)             (None, 8)                 136       
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                                 
Total params: 513 (2.00 KB)
Trainable params: 513 (2.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_raw_test, y_raw_test))
model.save('detector.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  saving_api.save_model(


In [11]:
# Evaluate the model
y_pred = model.predict(X_raw_test)
y_pred = (y_pred > 0.5)  # Convert probabilities to binary predictions - if prob > 0.5 is fraud and vice versa



In [12]:
# Calculate confusion matrix and classification report
confusion = confusion_matrix(y_raw_test, y_pred)
print("Confusion Matrix:")
print(confusion)

Confusion Matrix:
[[257815      0]
 [     0   1520]]


In [13]:
report = classification_report(y_raw_test, y_pred)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       1.00      1.00      1.00      1520

    accuracy                           1.00    259335
   macro avg       1.00      1.00      1.00    259335
weighted avg       1.00      1.00      1.00    259335



## COMPROBACION DEL MODELO 

In [231]:
test_data = pd.read_csv('../../../datasets/fraudTest.csv')
print(test_data.shape)

(555719, 23)


In [232]:
test_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [233]:
print(test_data.dtypes)

Unnamed: 0                 int64
trans_date_trans_time     object
cc_num                     int64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object


In [234]:

col = ['trans_date_trans_time', 'cc_num', 'merchant', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'city', 'job', 'dob', 'trans_num', 'unix_time']
test_data = test_data.drop(columns=col)

# Preprocess the test data (select features and standardize)
X_test = test_data[['category', 'amt', 'city_pop', 'lat', 'long', 'merch_lat', 'merch_long']]
X_test = pd.get_dummies(test_data, columns=['category'], prefix=['cat']) # one-hot encoding para 'category'
print(X_test.shape)
print(X_test.dtypes)
scaler = StandardScaler()
X_test = scaler.fit_transform(X_test)
print(X_test.shape)

(555719, 22)
Unnamed: 0              int64
amt                   float64
lat                   float64
long                  float64
city_pop                int64
merch_lat             float64
merch_long            float64
is_fraud                int64
cat_entertainment        bool
cat_food_dining          bool
cat_gas_transport        bool
cat_grocery_net          bool
cat_grocery_pos          bool
cat_health_fitness       bool
cat_home                 bool
cat_kids_pets            bool
cat_misc_net             bool
cat_misc_pos             bool
cat_personal_care        bool
cat_shopping_net         bool
cat_shopping_pos         bool
cat_travel               bool
dtype: object
(555719, 22)


In [107]:
test_data.head()

Unnamed: 0.1,Unnamed: 0,category,amt,lat,long,city_pop,merch_lat,merch_long,is_fraud
0,0,personal_care,2.86,33.9659,-80.9355,333497,33.986391,-81.200714,0
1,1,personal_care,29.84,40.3207,-110.436,302,39.450498,-109.960431,0
2,2,health_fitness,41.28,40.6729,-73.5365,34496,40.49581,-74.196111,0
3,3,misc_pos,60.05,28.5697,-80.8191,54767,28.812398,-80.883061,0
4,4,travel,3.19,44.2529,-85.017,1126,44.959148,-85.884734,0


In [181]:
loaded_model = tf.keras.models.load_model("detector.h5")
loaded_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 16)                368       
                                                                 
 dense_1 (Dense)             (None, 8)                 136       
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                                 
Total params: 513 (2.00 KB)
Trainable params: 513 (2.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [111]:
# Use your trained model to make predictions on the test data
y_pred = loaded_model.predict(X_test)



In [229]:
y_pred[1685].astype(float)

array([1.])

In [201]:
# New column with y_pred value
test_data['predic'] = y_pred

In [202]:
# Apply the threshold (e.g., 0.5) to classify transactions as fraud or not fraud
threshold = 0.5
fraudulent_rows = test_data[y_pred > threshold]

In [204]:
# Print the rows where the model classifies transactions as fraud
print(fraudulent_rows)

        Unnamed: 0        category      amt      lat      long  city_pop  \
1685          1685  health_fitness    24.84  31.8599 -102.7413        23   
1767          1767        misc_net   780.52  42.5545  -90.3508      1306   
1781          1781   entertainment   620.33  42.5545  -90.3508      1306   
1784          1784    shopping_net  1077.69  30.4590  -90.9027     71335   
1857          1857    shopping_pos   842.65  31.8599 -102.7413        23   
...            ...             ...      ...      ...       ...       ...   
517197      517197    shopping_pos  1041.51  34.6323  -89.8855     14462   
517274      517274    shopping_pos   868.09  34.6323  -89.8855     14462   
517341      517341    shopping_net  1039.42  34.6323  -89.8855     14462   
517529      517529     grocery_pos   289.27  34.6323  -89.8855     14462   
517571      517571        misc_net   766.38  34.6323  -89.8855     14462   

        merch_lat  merch_long  is_fraud  predic  
1685    32.575873 -102.604290        

In [165]:
test_data.head(1687)

Unnamed: 0.1,Unnamed: 0,category,amt,lat,long,city_pop,merch_lat,merch_long,is_fraud
0,0,personal_care,2.86,33.9659,-80.9355,333497,33.986391,-81.200714,0
1,1,personal_care,29.84,40.3207,-110.4360,302,39.450498,-109.960431,0
2,2,health_fitness,41.28,40.6729,-73.5365,34496,40.495810,-74.196111,0
3,3,misc_pos,60.05,28.5697,-80.8191,54767,28.812398,-80.883061,0
4,4,travel,3.19,44.2529,-85.0170,1126,44.959148,-85.884734,0
...,...,...,...,...,...,...,...,...,...
1682,1682,home,89.82,39.0305,-76.5515,92106,38.400359,-76.923792,0
1683,1683,personal_care,85.45,41.4802,-86.6919,1423,41.214552,-86.874168,0
1684,1684,food_dining,57.13,36.3011,-91.5281,4726,35.614887,-92.183553,0
1685,1685,health_fitness,24.84,31.8599,-102.7413,23,32.575873,-102.604290,1


## Simulación de input

In [174]:
input_data = {
    'Unnamed: 0': 1685,
    'trans_date_trans_time': '2020-06-21 22:06:39',
    'cc_num': 3560725013359375,
    'merchant': "fraud_Hamill-D'Amore",
    'category': 'health_fitness',
    'amt': 24.84,
    'first': 'Brooke',
    'last': 'Smith',
    'gender': 'F',
    'street': '63542 Luna Brook Apt. 012',
    'city': 'Notrees',
    'state': 'TX',
    'zip': 79759,
    'lat': 31.8599,
    'long': -102.7413,
    'city_pop': 23,
    'job': 'Cytogeneticist',
    'dob': '1969-09-15',
    'trans_num': '16bf2e46c54369a8eab2214649506425',
    'unix_time': 1371852399,
    'merch_lat': 32.575873,
    'merch_long': -102.60429,
    'is_fraud': 1
}


In [175]:
# Create a DataFrame from the input data
input_df = pd.DataFrame([input_data])
print(input_df.shape)
input_df.head()

(1, 23)


Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,1685,2020-06-21 22:06:39,3560725013359375,fraud_Hamill-D'Amore,health_fitness,24.84,Brooke,Smith,F,63542 Luna Brook Apt. 012,...,31.8599,-102.7413,23,Cytogeneticist,1969-09-15,16bf2e46c54369a8eab2214649506425,1371852399,32.575873,-102.60429,1


In [176]:
test_data = pd.read_csv('../../../datasets/fraudTest.csv')

In [177]:
col = ['trans_date_trans_time', 'cc_num', 'merchant', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'city', 'job', 'dob', 'trans_num', 'unix_time']
input_df = input_df.drop(columns=col)
print(input_df.shape)
test_data = test_data.drop(columns=col)
print(test_data.shape)
print(test_data.dtypes)

(1, 9)
(555719, 9)
Unnamed: 0      int64
category       object
amt           float64
lat           float64
long          float64
city_pop        int64
merch_lat     float64
merch_long    float64
is_fraud        int64
dtype: object


In [178]:
categories = test_data['category'].unique()
#OH_categories = pd.get_dummies(categoriesDF, columns=categories, prefix="cat")
#print(categoriesDF.head())
#print(OH_categories)
OH_categories = pd.DataFrame(False, index=[0], columns=[f'cat_{category}' for category in categories])
"""for category in categories:
    OH_categories.loc[0, f'cat_{category}'] = False"""
OH_categories.head()

Unnamed: 0,cat_personal_care,cat_health_fitness,cat_misc_pos,cat_travel,cat_kids_pets,cat_shopping_pos,cat_food_dining,cat_home,cat_entertainment,cat_shopping_net,cat_misc_net,cat_grocery_pos,cat_gas_transport,cat_grocery_net
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [179]:
input_test = input_df[['amt', 'city_pop', 'lat', 'long', 'merch_lat', 'merch_long', 'category']]
input_test = pd.get_dummies(input_df, columns=['category'], prefix=['cat']) # one-hot encoding para 'category'
common_columns = OH_categories.columns.intersection(input_test.columns).tolist()  # Get common column names
OH_categories = OH_categories[OH_categories.columns.difference(common_columns)]
input_test = pd.concat([input_test, OH_categories], axis=1)
print(input_test.dtypes)
scaler = StandardScaler()
input_test = scaler.fit_transform(input_test)
print(input_test.shape)


Unnamed: 0              int64
amt                   float64
lat                   float64
long                  float64
city_pop                int64
merch_lat             float64
merch_long            float64
is_fraud                int64
cat_health_fitness       bool
cat_entertainment        bool
cat_food_dining          bool
cat_gas_transport        bool
cat_grocery_net          bool
cat_grocery_pos          bool
cat_home                 bool
cat_kids_pets            bool
cat_misc_net             bool
cat_misc_pos             bool
cat_personal_care        bool
cat_shopping_net         bool
cat_shopping_pos         bool
cat_travel               bool
dtype: object
(1, 22)


In [180]:
input_df.head()

Unnamed: 0.1,Unnamed: 0,category,amt,lat,long,city_pop,merch_lat,merch_long,is_fraud
0,1685,health_fitness,24.84,31.8599,-102.7413,23,32.575873,-102.60429,1


In [182]:
# Load the saved model
loaded_model = tf.keras.models.load_model("detector.h5")
loaded_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 16)                368       
                                                                 
 dense_1 (Dense)             (None, 8)                 136       
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                                 
Total params: 513 (2.00 KB)
Trainable params: 513 (2.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [226]:
# Use the loaded model for predictions
z_pred = loaded_model.predict(input_test)




In [228]:
z_pred.astype(float)

array([[1.21901871e-06]])

In [185]:
# Print the result
if z_pred > 0.5:
    print("The transaction is classified as fraud.")
else:
    print("The transaction is not classified as fraud.")

The transaction is not classified as fraud.
