# In [14]:
import pandas as pd

# Load the sample transactions and give every row a starting score of 0
# and an empty reason string; the detection rules below add to both.
df = pd.read_csv('https://gist.githubusercontent.com/cloudwalk-tests/76993838e65d7e0f988f40f1b1909c97/raw/9ceae962009236d3570f46e59ce9aa334e4e290f/transactional-sample.csv').assign(
    score=0,
    suspicion_reason='',
)

# Flag amounts that are unusually high for the user who made them:
# more than two standard deviations above that user's own mean amount.
amounts_by_user = df.groupby('user_id')['transaction_amount']
df['user_mean'] = amounts_by_user.transform('mean')
df['user_std'] = amounts_by_user.transform('std')

upper_limit_user = df['user_mean'] + 2 * df['user_std']

# A user with a single transaction has a NaN std, so the comparison is
# False and that transaction is never flagged by this rule.
is_high_for_user = df['transaction_amount'] > upper_limit_user
df.loc[is_high_for_user, 'score'] += 1
df.loc[is_high_for_user, 'suspicion_reason'] += 'Very high value for the same user; '

# Detecting transactions on different devices in a short period
df['transaction_date'] = pd.to_datetime(df['transaction_date'])

time_limit = 3600  # 1-hour limit between transactions for the same user

# Gaps between a user's transactions are only meaningful in chronological
# order. Computing diff() in file order yields negative gaps whenever the
# file is not sorted oldest-first, and every negative gap would pass the
# "< time_limit" test. The sort is stable so ties keep their file order;
# results are aligned back to df through the (unique) row index.
user_timeline = df.sort_values('transaction_date', kind='mergesort').groupby('user_id')
df['time_diff'] = user_timeline['transaction_date'].diff().dt.total_seconds()
previous_device = user_timeline['device_id'].shift().reindex(df.index)

# Calculating the rolling moving average
# NOTE(review): these two columns are kept exactly as before so that any
# later consumer of df still finds them, but they are not used for scoring.
# A rolling mean of device identifiers has no obvious meaning - confirm
# whether they can be removed.
df['moving_average'] = df.groupby('user_id')['device_id'].transform(lambda x: x.rolling(window=5, min_periods=1).mean())

df['device_history'] = df.groupby('user_id')['moving_average'].transform(lambda x: ','.join(x.dropna().astype(str).unique()))

# A device switch needs both devices to be known and to differ. The previous
# check (duplicated on user_id + device_id) flagged devices that were
# *reused* by the user, which is the opposite of what the reason text says.
device_changed = (
    df['device_id'].notna()
    & previous_device.notna()
    & (df['device_id'] != previous_device)
)

# time_diff is NaN for a user's first transaction, which compares as False.
is_fast_device_switch = (df['time_diff'] < time_limit) & device_changed
df.loc[is_fast_device_switch, 'score'] += 1
df.loc[is_fast_device_switch, 'suspicion_reason'] += 'Fast transactions on different devices; '

# Detecting transactions at unusual hours
df['transaction_date'] = pd.to_datetime(df['transaction_date'])
df['hour'] = df['transaction_date'].dt.hour

# Setting a discrepancy limit to consider as suspicious
hour_discrepancy_limit = 4  # You can adjust as needed

# Calculating the average time of the last 10 purchases for each user.
# The window must run over the user's purchases in chronological order;
# rolling in file order would average whichever rows happen to sit above
# the current one (future purchases if the file is sorted newest-first).
# The stable sort keeps file order for identical timestamps, and the result
# is aligned back to df through the row index.
# The window includes the current purchase, as it did before.
# NOTE(review): hours are averaged arithmetically, so activity around
# midnight (e.g. 23h and 1h) averages to midday - confirm this is acceptable.
df['average_last_10_hours'] = (
    df.sort_values('transaction_date', kind='mergesort')
    .groupby('user_id')['hour']
    .transform(lambda hours: hours.rolling(10, min_periods=1).mean())
)

is_unusual_hour = (df['hour'] - df['average_last_10_hours']).abs() > hour_discrepancy_limit
df.loc[is_unusual_hour, 'score'] += 1
df.loc[is_unusual_hour, 'suspicion_reason'] += 'Transaction at an unusual hour compared to the last 10 purchases; '

# Every transaction that ended in a chargeback gets one point and a reason.
# The has_cbk column is used directly as the boolean row selector.
charged_back = df['has_cbk']
df.loc[charged_back, 'score'] += 1
df.loc[charged_back, 'suspicion_reason'] += 'Chargeback detected; '

# Merchant receives from the same card more than once in a short time
merchant_card_time_limit = 3600  # 1-hour limit in seconds

df['transaction_date'] = pd.to_datetime(df['transaction_date'])

# Seconds since the previous transaction of the same card at the same
# merchant. The gap is computed over a chronologically sorted view: in file
# order the gaps are negative whenever the file is not sorted oldest-first,
# and the "> 0" guard below would then discard every row. The stable sort
# keeps file order for identical timestamps; the result is aligned back to
# df through the row index.
df['time_diff_merchant_card'] = (
    df.sort_values('transaction_date', kind='mergesort')
    .groupby(['card_number', 'merchant_id'])['transaction_date']
    .diff()
    .dt.total_seconds()
)

# The first transaction of each card/merchant pair has a NaN gap and is not
# flagged. Gaps of exactly 0 seconds stay excluded, as in the original rule.
is_repeat_charge = (
    (df['time_diff_merchant_card'] > 0)
    & (df['time_diff_merchant_card'] < merchant_card_time_limit)
)
df.loc[is_repeat_charge, 'score'] += 1
df.loc[is_repeat_charge, 'suspicion_reason'] += 'Same card paying the same merchant several times in 1 hour; '

# Identifying merchants with a high chargeback rate
chargeback_merchant_limit = 0.02  # 2% limit

# Chargeback rate over the merchant's 10 most recent transactions (current
# one included), taken in chronological order so that "history" does not
# depend on how the file happens to be sorted. The stable sort keeps file
# order for identical timestamps; the result is aligned back to df through
# the row index.
# With a window of at most 10 rows, a single chargeback already gives a rate
# of at least 10%, so the 2% limit effectively means "any chargeback in the
# window".
df['chargeback_history'] = (
    df.sort_values('transaction_date', kind='mergesort')
    .groupby('merchant_id')['has_cbk']
    .transform(lambda x: x.rolling(window=10, min_periods=1).mean())
)

# The rule flags every transaction at a merchant whose recent rate is above
# the limit, matching the way the per-card rule is applied. It previously
# also required has_cbk on the row itself; since such a row always pushes
# its own window above the limit, that made the threshold meaningless and
# simply counted every chargeback a second time.
is_risky_merchant = df['chargeback_history'] > chargeback_merchant_limit
df.loc[is_risky_merchant, 'score'] += 1
df.loc[is_risky_merchant, 'suspicion_reason'] += 'Merchant with more than 2% chargeback in the history; '

# Identifying cards with more than 2% true chargeback
chargeback_card_limit = 0.02  # 2% limit

# Chargeback rate over the card's 10 most recent transactions (current one
# included), taken in chronological order so that "history" does not depend
# on how the file happens to be sorted. The stable sort keeps file order for
# identical timestamps; the result is aligned back to df through the row
# index.
# With a window of at most 10 rows, a single chargeback already gives a rate
# of at least 10%, so the 2% limit effectively means "any chargeback in the
# window".
df['chargeback_history_card'] = (
    df.sort_values('transaction_date', kind='mergesort')
    .groupby('card_number')['has_cbk']
    .transform(lambda x: x.rolling(window=10, min_periods=1).mean())
)

is_risky_card = df['chargeback_history_card'] > chargeback_card_limit
df.loc[is_risky_card, 'score'] += 1
df.loc[is_risky_card, 'suspicion_reason'] += 'Card with more than 2% chargeback in the history; '

# Keep only the transactions that triggered at least one rule, ordered with
# chargebacks first and, within each group, the highest scores first.
has_any_flag = df['score'] > 0
suspicious_transactions = df[has_any_flag].sort_values(
    by=['has_cbk', 'score'],
    ascending=[False, False],
)

# Write the id, the accumulated reasons and the score of each suspicious
# transaction to a CSV file in the working directory (row index omitted).
report_columns = ['transaction_id', 'suspicion_reason', 'score']
suspicious_transactions.to_csv(
    'suspicious_transactions.csv',
    index=False,
    columns=report_columns,
)
