# Day 11: Payment Fraud Risk Detection in Online Transactions

You are a data analyst in Stripe's risk management team investigating transaction patterns to identify potential fraud. The team needs to develop a systematic approach to screen transactions for financial risks. Your goal is to create an initial risk assessment methodology using transaction characteristics.

In [None]:
import pandas as pd
import numpy as np

# Risk-flag dimension table: one record per flag, linking a risk level to a
# transaction. Rows are written as compact (risk_level, risk_flag_id,
# transaction_id) tuples and expanded into dicts, preserving the key order.
dim_risk_flags_data = [
    dict(zip(("risk_level", "risk_flag_id", "transaction_id"), row))
    for row in (
        ("Low", 1, 2),
        ("Medium", 2, 7),
        ("High", 3, 11),
        ("High", 4, 12),
        ("High", 5, 13),
        ("Medium", 6, 14),
        ("High", 7, 15),
        ("Low", 8, 1),
        ("Medium", 9, 6),
        ("Low", 10, 3),
    )
]
dim_risk_flags = pd.DataFrame(dim_risk_flags_data)

# Transaction fact table: one record per transaction.
# A missing transaction_amount is written as None (Python's null value);
# the JSON literal `null` is not a defined name in Python and would raise
# NameError. pandas stores None as NaN in the resulting float column.
fct_transactions_data = [
  {
    "customer_email": "alice@gmail.com",
    "transaction_id": 1,
    "transaction_date": "2024-10-05",
    "transaction_amount": 120,
    "fraud_detection_score": 10
  },
  {
    "customer_email": "bob@customdomain.com",
    "transaction_id": 2,
    "transaction_date": "2024-10-15",
    "transaction_amount": 250.5,
    "fraud_detection_score": 20
  },
  {
    "customer_email": "charlie@yahoo.com",
    "transaction_id": 3,
    "transaction_date": "2024-10-20",
    "transaction_amount": 75.25,
    "fraud_detection_score": 15
  },
  {
    "customer_email": "dana@hotmail.com",
    "transaction_id": 4,
    "transaction_date": "2024-10-25",
    "transaction_amount": 100,
    "fraud_detection_score": 30
  },
  {
    "customer_email": "eve@biz.org",
    "transaction_id": 5,
    "transaction_date": "2024-10-30",
    "transaction_amount": 300,
    "fraud_detection_score": 40
  },
  {
    "customer_email": "frank@gmail.com",
    "transaction_id": 6,
    "transaction_date": "2024-11-03",
    "transaction_amount": 150.75,
    "fraud_detection_score": 25
  },
  {
    "customer_email": "grace@outlook.com",
    "transaction_id": 7,
    "transaction_date": "2024-11-10",
    "transaction_amount": None,  # amount not recorded for this transaction
    "fraud_detection_score": 50
  },
  {
    "customer_email": "ivan@yahoo.com",
    "transaction_id": 8,
    "transaction_date": "2024-11-15",
    "transaction_amount": 200,
    "fraud_detection_score": 35
  },
  {
    "customer_email": "judy@hotmail.com",
    "transaction_id": 9,
    "transaction_date": "2024-11-21",
    "transaction_amount": 250,
    "fraud_detection_score": 45
  },
  {
    "customer_email": "ken@domain.net",
    "transaction_id": 10,
    "transaction_date": "2024-11-29",
    "transaction_amount": 300,
    "fraud_detection_score": 55
  },
  {
    "customer_email": "laura@riskmail.com",
    "transaction_id": 11,
    "transaction_date": "2024-12-02",
    "transaction_amount": 100,
    "fraud_detection_score": 80
  },
  {
    "customer_email": "mike@securepay.com",
    "transaction_id": 12,
    "transaction_date": "2024-12-03",
    "transaction_amount": 180,
    "fraud_detection_score": 85
  },
  {
    "customer_email": "nina@trusthub.com",
    "transaction_id": 13,
    "transaction_date": "2024-12-09",
    "transaction_amount": 220,
    "fraud_detection_score": 90
  },
  {
    "customer_email": "oscar@fintech.com",
    "transaction_id": 14,
    "transaction_date": "2024-12-16",
    "transaction_amount": 140,
    "fraud_detection_score": 70
  },
  {
    "customer_email": "paula@alertsys.com",
    "transaction_id": 15,
    "transaction_date": "2024-12-23",
    "transaction_amount": 260,
    "fraud_detection_score": 95
  }
]
fct_transactions = pd.DataFrame(fct_transactions_data)


## Question 1

How many transactions in October 2024 have a customer email ending with a domain other than 'gmail.com', 'yahoo.com', or 'hotmail.com'? This metric will help us identify transactions associated with less common email providers that may indicate emerging risk patterns.

In [None]:
# Parse transaction_date in place; values that cannot be parsed become NaT
fct_transactions['transaction_date'] = pd.to_datetime(
    fct_transactions['transaction_date'], errors='coerce'
)

# Select the rows dated in October 2024 and work on an independent copy
oct_mask = (
    (fct_transactions['transaction_date'].dt.month == 10)
    & (fct_transactions['transaction_date'].dt.year == 2024)
)
oct_df = fct_transactions[oct_mask].copy()

# Normalise the addresses: cast to string, trim whitespace, lower-case
oct_df['customer_email'] = (
    oct_df['customer_email'].astype(str).str.strip().str.lower()
)

# The domain is whatever follows the last '@'; addresses without an '@'
# get an empty-string domain instead
has_at_sign = oct_df['customer_email'].str.contains('@')
last_segment = oct_df['customer_email'].str.split('@').str[-1]
oct_df['email_domain'] = last_segment.where(has_at_sign, '')

# Providers considered "common" and therefore excluded from the count
common_domains = {'gmail.com', 'yahoo.com', 'hotmail.com'}

# True for every October row whose domain is not one of the common providers
non_common_mask = ~oct_df['email_domain'].isin(common_domains)
count_non_common = int(non_common_mask.sum())

# Share of October transactions that use a non-common domain
total_october = len(oct_df)
if total_october > 0:
    percent_non_common = count_non_common / total_october * 100
else:
    percent_non_common = 0.0

print(f"Transactions in Oct 2024: {total_october}")
print(f"Transactions with non-common email domains: {count_non_common} ({percent_non_common:.2f}%)")

# Frequency table of the non-common domains, limited to the first 20 entries
top_non_common = oct_df['email_domain'][non_common_mask].value_counts().head(20)
print("\nTop non-common email domains (sample):")
print(top_non_common)

## Question 2

For transactions occurring in November 2024, what is the average transaction amount, using 0 as a default for any missing values? This calculation will help us detect abnormal transaction amounts that could be related to fraudulent activity.

In [None]:
# Parse transaction_date in place; values that cannot be parsed become NaT
fct_transactions['transaction_date'] = pd.to_datetime(
    fct_transactions['transaction_date'], errors='coerce'
)

# Select the rows dated in November 2024 and work on an independent copy
nov_mask = (
    (fct_transactions['transaction_date'].dt.month == 11)
    & (fct_transactions['transaction_date'].dt.year == 2024)
)
nov_df = fct_transactions[nov_mask].copy()

# Missing amounts count as 0 so they still contribute to the denominator
nov_df['transaction_amount'] = nov_df['transaction_amount'].fillna(0)

# Mean over every November transaction, including the zero-filled ones
avg_amount = nov_df['transaction_amount'].sum() / len(nov_df) if len(nov_df) else nov_df['transaction_amount'].mean()

print(f"Average transaction amount for Nov 2024 (with NaNs replaced by 0): {avg_amount:.2f}")

## Question 3

Among transactions flagged as 'High' risk in December 2024, which day of the week recorded the highest number of such transactions? This analysis is intended to pinpoint specific days with concentrated high-risk activity and support the development of our preliminary fraud detection score.

In [None]:
# Parse transaction_date in place; values that cannot be parsed become NaT
fct_transactions['transaction_date'] = pd.to_datetime(fct_transactions['transaction_date'], errors='coerce')

# Attach risk flags to transactions; the inner join keeps only transactions
# that have at least one flag
merged_df = fct_transactions.merge(dim_risk_flags, on='transaction_id', how='inner')

# Keep High-risk rows dated December 2024. The join yields one row per
# (transaction, flag) pair, so a transaction carrying several High flags
# would appear more than once; de-duplicate so each transaction counts once.
dec_high = merged_df[
    (merged_df['risk_level'] == 'High') &
    (merged_df['transaction_date'].dt.year == 2024) &
    (merged_df['transaction_date'].dt.month == 12)
].drop_duplicates(subset='transaction_id').copy()

# Weekday name of each transaction (e.g. 'Monday')
dec_high['day_of_week'] = dec_high['transaction_date'].dt.day_name()

# Number of High-risk transactions per weekday, largest first
day_counts = dec_high['day_of_week'].value_counts()

if day_counts.empty:
    # idxmax() raises on an empty Series, so report the empty result instead
    most_common_day = None
    most_common_count = 0
    print("No High-risk transactions found in Dec 2024.")
else:
    # Weekday with the highest count and that count
    most_common_day = day_counts.idxmax()
    most_common_count = day_counts.max()

    print(f"The day with the highest number of High-risk transactions in Dec 2024 is {most_common_day} with {most_common_count} transactions.")

Made with ❤️ by [Interview Master](https://www.interviewmaster.ai)