# Load Card Transactions

This notebook is a direct conversion of the provided script into a Jupyter notebook. It loads the card-transaction dataset, prints basic information about it, drops rows with missing values, and then draws a series of plots relating fraud to the other features.

In [None]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px


In [None]:
# Location of the card-transaction CSV; edit this if the file lives elsewhere.
file_path = "card_transdata.csv"

# Read the whole CSV into the raw (uncleaned) dataframe.
transaction_data_raw = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top of the raw table (rendered as the cell's output).
print("First 5 rows of raw data:")
transaction_data_raw.head(5)

In [None]:
# Overview of the raw dataframe: dimensions, schema, summary statistics, nulls.
print("Shape (rows, columns):", transaction_data_raw.shape)

# .info() writes dtypes and non-null counts to stdout itself.
print("Info:")
transaction_data_raw.info()

# A notebook cell only renders its *final* expression, so a bare
# `.describe()` in the middle of the cell would be computed and then silently
# discarded. Display it explicitly. (`display` is provided as a builtin by the
# IPython kernel that runs this notebook.)
print("Descriptive statistics for transaction_data_raw (numeric columns):")
display(transaction_data_raw.describe())

# Count of missing values in each column (rendered as the cell's output).
print("Missing values per column:")
transaction_data_raw.isnull().sum()

In [None]:
# Build the cleaned dataset by discarding every row that has at least one null.
transaction_data_cleaned = transaction_data_raw.dropna(axis=0, how='any')

# Report how many rows and columns remain after the drop.
print("Shape after dropping rows with nulls:", transaction_data_cleaned.shape)

# Preview the top of the cleaned table (rendered as the cell's output).
transaction_data_cleaned.head(5)

In [None]:
# Bar chart counting the rows of the cleaned dataset per value of `fraud`.
# countplot returns the Axes it drew on, so the title is set on it directly.
sns.countplot(data=transaction_data_cleaned, x='fraud').set_title('Fraud Class Distribution')
plt.show()

In [None]:
# Bar chart counting the rows of the cleaned dataset per value of `used_pin_number`.
# countplot returns the Axes it drew on, so the title is set on it directly.
sns.countplot(data=transaction_data_cleaned, x='used_pin_number').set_title('Used Pin Distribution')
plt.show()

In [None]:
# Bar chart counting the rows of the cleaned dataset per value of `repeat_retailer`.
# countplot returns the Axes it drew on, so the title is set on it directly.
sns.countplot(data=transaction_data_cleaned, x='repeat_retailer').set_title('Repeat Retailer Distribution')
plt.show()

In [None]:
# Bar chart counting the rows of the cleaned dataset per value of `online_order`.
# countplot returns the Axes it drew on, so the title is set on it directly.
sns.countplot(data=transaction_data_cleaned, x='online_order').set_title('Online Order Distribution')
plt.show()

In [None]:
# 30-bin histogram of `ratio_to_median_purchase_price` over the cleaned dataset.
# histplot returns the Axes it drew on, so the title is set on it directly.
sns.histplot(
    data=transaction_data_cleaned,
    x='ratio_to_median_purchase_price',
    bins=30,
).set_title('Ratio to Median Purchase Price Distribution')
plt.show()

In [None]:
# 30-bin histogram of `distance_from_home` over the cleaned dataset.
# histplot returns the Axes it drew on, so the title is set on it directly.
sns.histplot(
    data=transaction_data_cleaned,
    x='distance_from_home',
    bins=30,
).set_title('Distance From Home Distribution')
plt.show()

In [None]:
# ---- Transactions with PIN vs. Fraudulent Transactions (grouped countplot) ----
# One bar per (used_pin_number, fraud) combination; the two palette colours are
# assigned to the fraud levels in seaborn's hue order.
ax = sns.countplot(x='used_pin_number', hue='fraud', data=transaction_data_cleaned, palette=['green', 'red'])
ax.set_title('Transactions with PIN vs. Fraudulent Transactions')
ax.set_xlabel('Used Pin')
ax.set_ylabel('Count')

# Relabel the legend using the handles seaborn registered for each hue level.
# Passing `labels=` alone makes matplotlib pair the labels with whatever
# artists come first on the Axes, which is order-dependent and discouraged.
# Assumes the hue levels sort as 0 (non-fraud) then 1 (fraud) -- TODO confirm
# against the data.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=['Non-Fraudulent', 'Fraudulent'], title='Fraud')
plt.show()

In [None]:
# ---- Percentage of Fraudulent Transactions with PIN Usage (stacked percentage bar) ----
# Within each used_pin_number group, compute the fraction of rows per fraud
# value, pivot the fraud values into columns, and scale to percent.
# fill_value=0 keeps a fraud class that never occurs in a group at 0% instead
# of NaN.
df_pin_fraud = (
    transaction_data_cleaned.groupby('used_pin_number')['fraud']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    * 100
)

ax = df_pin_fraud.plot(kind='bar', stacked=True, color=['green', 'red'], figsize=(5, 3))
ax.set_title('Percentage of Fraudulent Transactions with PIN Usage')
ax.set_xlabel('Used PIN')
ax.set_ylabel('Percentage')

# Relabel the legend with explicit handles (one per fraud column) rather than
# relying on matplotlib's implicit artist order, which `labels=` alone does.
# Assumes the columns are ordered 0 (non-fraud) then 1 (fraud) -- TODO confirm
# against the data.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=['Non-Fraudulent', 'Fraudulent'], title='Fraud')
plt.show()

In [None]:
# ---- Transactions with Chip vs. Fraudulent Transactions (grouped countplot) ----
# One bar per (used_chip, fraud) combination; the two palette colours are
# assigned to the fraud levels in seaborn's hue order.
plt.figure(figsize=(5, 3))
ax = sns.countplot(x='used_chip', hue='fraud', data=transaction_data_cleaned, palette=['green', 'red'])
ax.set_title('Transactions with Chip vs Fraudulent Transactions')
ax.set_xlabel('Used Chip')
ax.set_ylabel('Count')

# Relabel the legend using the handles seaborn registered for each hue level.
# Passing `labels=` alone makes matplotlib pair the labels with whatever
# artists come first on the Axes, which is order-dependent and discouraged.
# Assumes the hue levels sort as 0 (non-fraud) then 1 (fraud) -- TODO confirm
# against the data.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=['Non-Fraudulent', 'Fraudulent'], title='Fraud')
plt.show()

In [None]:
# ---- Transactions with Online Order vs. Fraudulent Transactions (grouped countplot) ----
# One bar per (online_order, fraud) combination; the two palette colours are
# assigned to the fraud levels in seaborn's hue order.
plt.figure(figsize=(5, 3))
ax = sns.countplot(x='online_order', hue='fraud', data=transaction_data_cleaned, palette=['green', 'red'])
ax.set_title('Transactions with Online Order  vs Fraudulent Transactions')
ax.set_xlabel('Online Order')
ax.set_ylabel('Count')

# Relabel the legend using the handles seaborn registered for each hue level.
# Passing `labels=` alone makes matplotlib pair the labels with whatever
# artists come first on the Axes, which is order-dependent and discouraged.
# Assumes the hue levels sort as 0 (non-fraud) then 1 (fraud) -- TODO confirm
# against the data.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=['Non-Fraudulent', 'Fraudulent'], title='Fraud')
plt.show()

In [None]:
# ---- Scatter plot: Distance From Home vs. Ratio to Median Purchase Price (fraud broken down) ----
# Every cleaned transaction is one point, coloured green for fraud == 0 and
# red for fraud == 1.
plt.figure(figsize=(10, 6))
scatter = sns.scatterplot(
    x='distance_from_home',
    y='ratio_to_median_purchase_price',
    hue='fraud',
    palette={0: 'green', 1: 'red'},
    data=transaction_data_cleaned
)
scatter.set_title('Fraudulent vs Non-Fraudulent Transactions: Distance From Home vs Ratio to Median Purchase Price')
scatter.set_xlabel('Distance From Home')
scatter.set_ylabel('Ratio to Median Purchase Price')

# Relabel the legend with the per-level handles seaborn registered.
# `plt.legend(labels=[...])` on its own takes the first artists on the Axes as
# handles; for a scatterplot that starts with the main point collection, so the
# "Non-Fraudulent"/"Fraudulent" labels end up beside the wrong markers.
# get_legend_handles_labels() returns only the labelled legend entries.
# Assumes the hue levels sort as 0 (non-fraud) then 1 (fraud) -- TODO confirm
# against the data.
handles, _ = scatter.get_legend_handles_labels()
scatter.legend(handles=handles, labels=['Non-Fraudulent', 'Fraudulent'], title='Fraud')
plt.show()

In [None]:
# ---- Scatter plot: Distance From Last Transaction vs. Ratio to Median Purchase Price (fraud broken down) ----
# Every cleaned transaction is one point, coloured green for fraud == 0 and
# red for fraud == 1.
plt.figure(figsize=(10, 6))
scatter2 = sns.scatterplot(
    x='distance_from_last_transaction',
    y='ratio_to_median_purchase_price',
    hue='fraud',
    palette={0: 'green', 1: 'red'},
    data=transaction_data_cleaned
)
scatter2.set_title('Fraudulent vs Non-Fraudulent Transactions: Distance From Last Transaction vs Ratio to Median Purchase Price')
scatter2.set_xlabel('Distance from Last Transaction')
scatter2.set_ylabel('Ratio to Median Purchase Price')

# Relabel the legend with the per-level handles seaborn registered.
# `plt.legend(labels=[...])` on its own takes the first artists on the Axes as
# handles; for a scatterplot that starts with the main point collection, so the
# "Non-Fraudulent"/"Fraudulent" labels end up beside the wrong markers.
# get_legend_handles_labels() returns only the labelled legend entries.
# Assumes the hue levels sort as 0 (non-fraud) then 1 (fraud) -- TODO confirm
# against the data.
handles, _ = scatter2.get_legend_handles_labels()
scatter2.legend(handles=handles, labels=['Non-Fraudulent', 'Fraudulent'], title='Fraud')
plt.show()