In [16]:
# Library Imports
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import ADASYN
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

In [2]:
# Load the cc table
cc_df = pd.read_csv('/content/cc_info.csv')

# Load the transactions table
transaction_df = pd.read_csv('/content/transactions.csv')

# Perform inner join on the "credit_card" column
merged_data = pd.merge(transaction_df, cc_df, on="credit_card", how="inner")

# Print the merged data
print(merged_data)

             credit_card                 date  transaction_dollar_amount  \
0       1003715054175576  2015-09-11 00:32:40                      43.78   
1       1003715054175576  2015-10-24 22:23:08                     103.15   
2       1003715054175576  2015-10-26 18:19:36                      48.55   
3       1003715054175576  2015-10-22 19:41:10                     136.18   
4       1003715054175576  2015-10-26 20:08:22                      71.82   
...                  ...                  ...                        ...   
294583  9999757432802760  2015-09-10 19:43:33                     127.23   
294584  9999757432802760  2015-08-06 21:00:13                      84.90   
294585  9999757432802760  2015-09-22 16:15:47                      77.54   
294586  9999757432802760  2015-08-27 18:08:24                     144.05   
294587  9999757432802760  2015-08-22 00:14:52                     154.36   

             Long        Lat        city state  zipcode  credit_card_limit  
0      -80

In [3]:
# Group by credit_card and find the most occurred latitude and longitude
most_occurred_lat_lng = merged_data.groupby('credit_card')['Lat', 'Long'].agg(lambda x: x.value_counts().index[0]).reset_index()

# Rename the columns
most_occurred_lat_lng.columns = ['credit_card', 'Lat_most_occurred', 'Long_most_occurred']

# Merge the most_occurred_lat_lng DataFrame with the merged_data DataFrame
merged_data_df = pd.merge(merged_data, most_occurred_lat_lng, on='credit_card', how='left')

print(merged_data_df)


  most_occurred_lat_lng = merged_data.groupby('credit_card')['Lat', 'Long'].agg(lambda x: x.value_counts().index[0]).reset_index()


             credit_card                 date  transaction_dollar_amount  \
0       1003715054175576  2015-09-11 00:32:40                      43.78   
1       1003715054175576  2015-10-24 22:23:08                     103.15   
2       1003715054175576  2015-10-26 18:19:36                      48.55   
3       1003715054175576  2015-10-22 19:41:10                     136.18   
4       1003715054175576  2015-10-26 20:08:22                      71.82   
...                  ...                  ...                        ...   
294583  9999757432802760  2015-09-10 19:43:33                     127.23   
294584  9999757432802760  2015-08-06 21:00:13                      84.90   
294585  9999757432802760  2015-09-22 16:15:47                      77.54   
294586  9999757432802760  2015-08-27 18:08:24                     144.05   
294587  9999757432802760  2015-08-22 00:14:52                     154.36   

             Long        Lat        city state  zipcode  credit_card_limit  \
0      -8

In [4]:
# Convert 'date' column to datetime type
merged_data_df['date'] = pd.to_datetime(merged_data_df['date'])

# Sort the data by 'credit_card' and 'date' columns
merged_data_df.sort_values(['credit_card', 'date'], inplace=True)

# Calculate the time difference between consecutive rows for each credit card
merged_data_df['time_diff'] = merged_data_df.groupby('credit_card')['date'].diff()

# Convert the time difference to seconds
merged_data_df['time_diff_seconds'] = merged_data_df['time_diff'].dt.total_seconds()

print(merged_data_df)

             credit_card                date  transaction_dollar_amount  \
38      1003715054175576 2015-07-31 20:03:05                      45.52   
194     1003715054175576 2015-07-31 20:25:28                      96.10   
107     1003715054175576 2015-07-31 23:09:32                      20.94   
124     1003715054175576 2015-08-01 10:48:03                      51.27   
137     1003715054175576 2015-08-01 17:43:43                     127.99   
...                  ...                 ...                        ...   
294582  9999757432802760 2015-10-23 20:47:23                     216.30   
294503  9999757432802760 2015-10-24 01:12:54                     233.97   
294508  9999757432802760 2015-10-25 21:53:33                     177.51   
294519  9999757432802760 2015-10-27 21:38:09                     146.37   
294514  9999757432802760 2015-10-29 21:56:30                     195.25   

             Long        Lat        city state  zipcode  credit_card_limit  \
38     -80.186336  40

In [5]:
# Convert the date column to datetime if it's not already in datetime format
merged_data_df['date'] = pd.to_datetime(merged_data_df['date'])

# Filter the rows based on the date range
merged_data_df = merged_data_df[(merged_data_df['date'] >= '2015-08-01') & (merged_data_df['date'] <= '2015-10-30')]

# Print the filtered data
print(merged_data_df)


             credit_card                date  transaction_dollar_amount  \
124     1003715054175576 2015-08-01 10:48:03                      51.27   
137     1003715054175576 2015-08-01 17:43:43                     127.99   
80      1003715054175576 2015-08-01 17:44:19                      96.97   
103     1003715054175576 2015-08-01 19:10:09                      97.35   
81      1003715054175576 2015-08-02 19:39:29                     131.43   
...                  ...                 ...                        ...   
294582  9999757432802760 2015-10-23 20:47:23                     216.30   
294503  9999757432802760 2015-10-24 01:12:54                     233.97   
294508  9999757432802760 2015-10-25 21:53:33                     177.51   
294519  9999757432802760 2015-10-27 21:38:09                     146.37   
294514  9999757432802760 2015-10-29 21:56:30                     195.25   

             Long        Lat        city state  zipcode  credit_card_limit  \
124    -80.176899  40

In [6]:
# Replace 'NaT' values in 'time_diff' column with 0
merged_data_df['time_diff'].fillna(pd.Timedelta(seconds=0), inplace=True)

# Replace 'NaN' values in 'time_diff_seconds' column with 0
merged_data_df['time_diff_seconds'].fillna(0, inplace=True)

print(merged_data_df)

             credit_card                date  transaction_dollar_amount  \
124     1003715054175576 2015-08-01 10:48:03                      51.27   
137     1003715054175576 2015-08-01 17:43:43                     127.99   
80      1003715054175576 2015-08-01 17:44:19                      96.97   
103     1003715054175576 2015-08-01 19:10:09                      97.35   
81      1003715054175576 2015-08-02 19:39:29                     131.43   
...                  ...                 ...                        ...   
294582  9999757432802760 2015-10-23 20:47:23                     216.30   
294503  9999757432802760 2015-10-24 01:12:54                     233.97   
294508  9999757432802760 2015-10-25 21:53:33                     177.51   
294519  9999757432802760 2015-10-27 21:38:09                     146.37   
294514  9999757432802760 2015-10-29 21:56:30                     195.25   

             Long        Lat        city state  zipcode  credit_card_limit  \
124    -80.176899  40

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data_df['time_diff'].fillna(pd.Timedelta(seconds=0), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data_df['time_diff_seconds'].fillna(0, inplace=True)


In [7]:
# Apply the fraud conditions and assign the fraud labels
merged_data_df['Fraud_Label'] = np.where((abs(merged_data_df['Lat'] - merged_data_df['Lat_most_occurred']) > 30) |
                                         (abs(merged_data_df['Long'] - merged_data_df['Long_most_occurred']) > 40) |
                                      (merged_data_df['transaction_dollar_amount'] > 450) |
                                         ((merged_data_df['time_diff_seconds'] > 0) &
                                      (merged_data_df['time_diff_seconds'] <= 1000)),
                                      1, 0)
print(merged_data_df)


             credit_card                date  transaction_dollar_amount  \
124     1003715054175576 2015-08-01 10:48:03                      51.27   
137     1003715054175576 2015-08-01 17:43:43                     127.99   
80      1003715054175576 2015-08-01 17:44:19                      96.97   
103     1003715054175576 2015-08-01 19:10:09                      97.35   
81      1003715054175576 2015-08-02 19:39:29                     131.43   
...                  ...                 ...                        ...   
294582  9999757432802760 2015-10-23 20:47:23                     216.30   
294503  9999757432802760 2015-10-24 01:12:54                     233.97   
294508  9999757432802760 2015-10-25 21:53:33                     177.51   
294519  9999757432802760 2015-10-27 21:38:09                     146.37   
294514  9999757432802760 2015-10-29 21:56:30                     195.25   

             Long        Lat        city state  zipcode  credit_card_limit  \
124    -80.176899  40

In [8]:
# count no of labels with 1
count_labels_1 = merged_data_df['Fraud_Label'].sum()
print("Number of labels with value 1:", count_labels_1)

Number of labels with value 1: 50885


In [9]:
# design a dataset by eliminating credit card, date, state, city for model building
data_df = merged_data_df[['transaction_dollar_amount','Lat','Long',
                          'credit_card_limit','Fraud_Label']]
print(data_df)

        transaction_dollar_amount        Lat       Long  credit_card_limit  \
124                         51.27  40.313324 -80.176899              20000   
137                        127.99  40.295995 -80.226671              20000   
80                          96.97  40.286490 -80.144866              20000   
103                         97.35  40.203355 -80.163659              20000   
81                         131.43  40.224044 -80.217836              20000   
...                           ...        ...        ...                ...   
294582                     216.30  32.991054 -82.443294               6000   
294503                     233.97  32.934690 -82.410848               6000   
294508                     177.51  32.997676 -82.452819               6000   
294519                     146.37  32.952887 -82.326567               6000   
294514                     195.25  33.061478 -82.440314               6000   

        Fraud_Label  
124               0  
137               0

In [10]:
# checking the datatype and unique value of Fraud_Label
unique_values = data_df['Fraud_Label'].unique()
print(unique_values)
print(data_df['Fraud_Label'].dtype)

[0 1]
int64


In [11]:
# Perform Mann-Whitney U Test to decide the significance of features on Fraud Label
import scipy.stats as stats

fraud_data = data_df[data_df['Fraud_Label'] == 1]
non_fraud_data = data_df[data_df['Fraud_Label'] == 0]

feature = ['transaction_dollar_amount', 'Lat', 'Long', 'credit_card_limit']

for i in feature:
  statistic, p_value = stats.mannwhitneyu(fraud_data[i], non_fraud_data[i])

  print("Mann-Whitney U test results:")
  print(f"Statistic: {statistic}")
  print(f"P-value: {p_value}")

Mann-Whitney U test results:
Statistic: 6925665188.5
P-value: 0.0
Mann-Whitney U test results:
Statistic: 6092647125.0
P-value: 0.2255741046220866
Mann-Whitney U test results:
Statistic: 6646436443.0
P-value: 5.601983823859275e-247
Mann-Whitney U test results:
Statistic: 6929378221.0
P-value: 0.0


In [12]:
# Perform the fraud detection with NN
class FraudDetectionModel:
    def __init__(self, data_df):
        # initializing the data, feature, target, model and train,test, validation data
        self.data_df = data_df
        self.features = ['transaction_dollar_amount',  'Long', 'credit_card_limit']
        self.target = 'Fraud_Label'
        self.X_train, self.X_val, self.X_test, self.y_train, self.y_val, self.y_test = self._prepare_data()
        self.model = self._build_model()

    def _prepare_data(self):
        # Split the data into training and testing sets
        X = self.data_df[self.features]
        y = self.data_df[self.target]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2,
                                                          random_state=42)
        # Scale the features
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.fit_transform(X_val)
        X_test = scaler.transform(X_test)
        class_counts = pd.Series(y_train).value_counts()
        print('class count of y_train:',class_counts)

        return X_train, X_val, X_test, y_train, y_val, y_test

    def _build_model(self):
        # Build a neural network model
        model = tf.keras.models.Sequential([
            tf.keras.layers.Dense(128, activation='relu',
                                  input_shape=(len(self.features),),
                                  kernel_regularizer=l1(0.001)),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])

        # Compile the model
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

        return model

    def train(self, epochs=10, batch_size=64):
        # Define early stopping callback
        early_stopping = EarlyStopping(monitor='val_loss', patience=3,
                                       restore_best_weights=True)

        # Train the model
        self.model.fit(self.X_train, self.y_train, epochs=epochs,
                       validation_data=(self.X_val, self.y_val),
                       batch_size=batch_size,
                       callbacks=[early_stopping],
                       verbose=1)

    def evaluate(self):
        # Evaluate the model
        loss, accuracy = self.model.evaluate(self.X_test, self.y_test, verbose=0)
        print('Test Loss:', loss)
        print('Test Accuracy:', accuracy)
        y_pred_prob = self.model.predict(self.X_test)
        y_pred = (y_pred_prob >= 0.5).astype(int)

        precision = precision_score(self.y_test, y_pred)
        recall = recall_score(self.y_test, y_pred)
        f1 = f1_score(self.y_test, y_pred)
        print('Precision:', precision)
        print('Recall:', recall)
        print('F1 Score:', f1)

        cm = confusion_matrix(self.y_test, y_pred)
        print('Confusion Matrix:')
        print(cm)



# Create an instance of the FraudDetectionModel
model = FraudDetectionModel(data_df)

# Train the model
model.train(epochs=10, batch_size=32)

# Evaluate the model
model.evaluate()



class count of y_train: 0    152756
1     32547
Name: Fraud_Label, dtype: int64
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.4012356698513031
Test Accuracy: 0.8568418622016907
Precision: 0.9974541751527495
Recall: 0.19123389301054275
F1 Score: 0.3209370904325033
Confusion Matrix:
[[47659     5]
 [ 8285  1959]]


As the model got overfitted due to data imbalancy, we are now going with oversampling techniques to balance the 0,1 in target label and apply the NN model

In [17]:
class FraudDetectionModel:
    def __init__(self, data_df):
        self.data_df = data_df
        self.features = ['transaction_dollar_amount', 'Long', 'credit_card_limit']
        self.target = 'Fraud_Label'
        self.X_train, self.X_val, self.X_test, self.y_train, self.y_val, self.y_test = self._prepare_data()
        self.model = self._build_model()

    def _prepare_data(self):
        # Separate features and target
        X = self.data_df[self.features].values
        y = self.data_df[self.target].values

        # Split data into training, validation, and testing sets
        X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

        # Apply ADASYN oversampling
        adasyn = ADASYN(random_state=42)
        X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

        # Scale the features
        scaler = StandardScaler()
        X_train_resampled = scaler.fit_transform(X_train_resampled)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test)
        class_counts = pd.Series(y_train_resampled).value_counts()
        print('class count of y_train:',class_counts)
        return X_train_resampled, X_val, X_test, y_train_resampled, y_val, y_test

    def _build_model(self):
        # Build a neural network model
        model = Sequential()
        model.add(Dense(64, activation='relu', input_shape=(len(self.features),), kernel_regularizer=l1(0.001)))
        model.add(Dropout(0.5))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation='sigmoid'))

        # Compile the model
        model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

        return model

    def train(self, epochs=10, batch_size=64):
        # Train the model
        self.model.fit(self.X_train, self.y_train, epochs=epochs, batch_size=batch_size, validation_data=(self.X_val, self.y_val))

    def evaluate(self):
        # Evaluate the model
        loss, accuracy = self.model.evaluate(self.X_test, self.y_test)
        print("Test Loss:", loss)
        print("Test Accuracy:", accuracy)
        y_pred_prob = self.model.predict(self.X_test)
        y_pred = (y_pred_prob >= 0.5).astype(int)
        precision = precision_score(self.y_test, y_pred)
        recall = recall_score(self.y_test, y_pred)
        f1 = f1_score(self.y_test, y_pred)
        print('Precision:', precision)
        print('Recall:', recall)
        print('F1 Score:', f1)

        cm = confusion_matrix(self.y_test, y_pred)
        print('Confusion Matrix:')
        print(cm)


# Prepare the data
data_df = data_df[['transaction_dollar_amount', 'Long', 'credit_card_limit', 'Fraud_Label']]

# Create an instance of the FraudDetectionModel class
model = FraudDetectionModel(data_df)

# Train the model
model.train()

# Evaluate the model
model.evaluate()


class count of y_train: 1    157495
0    152756
dtype: int64
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.6619413495063782
Test Accuracy: 0.5559853315353394
Precision: 0.23162606704143243
Recall: 0.6516009371339321
F1 Score: 0.3417643745839947
Confusion Matrix:
[[25521 22143]
 [ 3569  6675]]
