In [24]:
# Import necessary libraries
import pandas as pd  # For data manipulation and handling tabular data
from sklearn.model_selection import train_test_split  # To split data into training and testing sets
from sklearn.linear_model import LogisticRegression  # To implement logistic regression for classification
from sklearn.metrics import accuracy_score, confusion_matrix  # To evaluate model performance


In [25]:
# Load the payment-fraud dataset from its public URL.
# Each row is one transaction; 'label' marks it as fraudulent (1) or not (0).
link_to_data = 'https://raw.githubusercontent.com/SIT719/2020-S2/master/data/payment_fraud.csv'
# on_bad_lines='warn' still skips malformed rows, but emits a warning for each one,
# so any dropped records are visible instead of disappearing silently.
dfp = pd.read_csv(link_to_data, on_bad_lines='warn')

In [26]:
# Preview the raw data as loaded. The bare name as the cell's last expression
# makes the notebook render the frame as a table (long frames are shown truncated).
dfp

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,label
0,29,1,4.745402,paypal,28.204861,0
1,725,1,4.742303,storecredit,0.000000,0
2,845,1,4.921318,creditcard,0.000000,0
3,503,1,4.886641,creditcard,0.000000,0
4,2000,1,5.040929,creditcard,0.000000,0
...,...,...,...,...,...,...
39216,986,1,4.836982,creditcard,0.000000,0
39217,1647,1,4.876771,creditcard,377.930556,0
39218,1591,1,4.742303,creditcard,0.000000,0
39219,237,1,4.921318,creditcard,236.082639,0


In [27]:
# One-hot encode the categorical 'paymentMethod' column: it is replaced by one
# boolean (True/False) column per distinct payment method, named paymentMethod_<value>.
# The result is a new frame; dfp itself is left unchanged.
df = pd.get_dummies(data=dfp, columns=['paymentMethod'])

In [28]:
# Preview the encoded data: 'paymentMethod' has been replaced by the
# paymentMethod_* indicator columns produced by get_dummies.
df

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethodAgeDays,label,paymentMethod_creditcard,paymentMethod_paypal,paymentMethod_storecredit
0,29,1,4.745402,28.204861,0,False,True,False
1,725,1,4.742303,0.000000,0,False,False,True
2,845,1,4.921318,0.000000,0,True,False,False
3,503,1,4.886641,0.000000,0,True,False,False
4,2000,1,5.040929,0.000000,0,True,False,False
...,...,...,...,...,...,...,...,...
39216,986,1,4.836982,0.000000,0,True,False,False
39217,1647,1,4.876771,377.930556,0,True,False,False
39218,1591,1,4.742303,0.000000,0,True,False,False
39219,237,1,4.921318,236.082639,0,True,False,False


In [29]:
# Hold out part of the data for evaluation.
# Features (X) are every column except 'label'; the target (y) is the 'label' column.
# test_size=0.33 reserves 33% of the rows for testing and leaves 67% for training.
# random_state=17 seeds the shuffle so the same split is reproduced on every run.
# NOTE(review): the split is not stratified; because fraud rows are rare, consider
# stratify=df['label'] — confirm first, since it would change the results below.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['label']),
    df['label'],
    test_size=0.33,
    random_state=17,
)

In [30]:
# Preview the training features: the 'label' column has been removed and the
# index values are the shuffled row positions from the original frame.
X_train

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethodAgeDays,paymentMethod_creditcard,paymentMethod_paypal,paymentMethod_storecredit
13272,4,1,4.886641,0.000000,True,False,False
22716,1554,1,4.461622,0.256944,False,False,True
30255,2,1,4.742303,1.840278,True,False,False
7676,2000,1,4.748314,0.584028,True,False,False
15105,157,1,5.017904,156.022222,True,False,False
...,...,...,...,...,...,...,...
31044,2000,1,4.921349,0.000000,False,True,False
25631,47,1,4.962055,0.000000,True,False,False
33174,2000,1,4.748314,0.000000,True,False,False
34959,82,1,4.745402,0.000000,True,False,False


In [31]:
# Preview the training labels; they share their index with X_train, so each
# label lines up with the corresponding feature row.
y_train

Unnamed: 0,label
13272,0
22716,0
30255,0
7676,0
15105,0
...,...
31044,0
25631,0
33174,0
34959,0


In [32]:
# Build a logistic-regression classifier using scikit-learn's default settings
clf = LogisticRegression()
# Fit the classifier on the training features and their matching labels
clf.fit(X=X_train, y=y_train)

In [33]:
# Use the trained model to predict a label for every row of the held-out test set
y_pred = clf.predict(X=X_test)

In [34]:
# Print the model's accuracy: the fraction of test rows whose predicted label
# matches the true label.
# accuracy_score expects (y_true, y_pred); the arguments are passed by keyword in
# that order so the call agrees with the confusion_matrix call and cannot be mixed up.
# Fraud rows are a small minority of the test set, so read this number together
# with the confusion matrix rather than on its own.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

1.0


In [35]:
# Print the confusion matrix for the test set.
# Rows are the actual classes and columns the predicted classes, so for the
# 0/1 labels used here the layout is:
#   [[true negatives,  false positives],
#    [false negatives, true positives]]
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[12753     0]
 [    0   190]]
