In [72]:
import pandas as pd



# Load and summarize fraud evaluation results

fraud_evaluation_results_url = 'fraud_evaluation_results.csv'
fraud_evaluation_results = pd.read_csv(fraud_evaluation_results_url)


fraud_chance_counts = fraud_evaluation_results['fraud_chance'].value_counts()


fraud_chance_percentages = (fraud_chance_counts / len(fraud_evaluation_results)) * 100

# Display the counts and percentages
fraud_chance_summary = pd.DataFrame({
    'fraud_chance': fraud_chance_counts.index,
    'count': fraud_chance_counts.values,
    'percentage': fraud_chance_percentages.values
})

print(fraud_chance_summary)



# Identify users representing 51% of total suspicious transactions


transactional_data_url = 'https://gist.githubusercontent.com/cloudwalk-tests/76993838e65d7e0f988f40f1b1909c97/raw/9ceae962009236d3570f46e59ce9aa334e4e290f/transactional-sample.csv'
transactional_data = pd.read_csv(transactional_data_url)[['user_id', 'merchant_id', 'transaction_amount', 'transaction_id']]


entity_summary_url = 'https://raw.githubusercontent.com/luizarnoni/CloudWalk---Risk-Analyst-Case/main/entity_summary.csv'
entity_summary = pd.read_csv(entity_summary_url)[['entity_id', 'entity_type', 'num_suspicious_transactions']]


suspicious_transactions_url = 'https://raw.githubusercontent.com/luizarnoni/CloudWalk---Risk-Analyst-Case/main/suspicious_transactions.csv'
suspicious_transactions = pd.read_csv(suspicious_transactions_url)[['transaction_id', 'score']]


merged_data = pd.merge(transactional_data, suspicious_transactions, on='transaction_id', how='left')


total_suspicious_transactions = merged_data['score'].notnull().sum()
users_20_percent = entity_summary.nlargest(int(0.51 * len(entity_summary)), 'num_suspicious_transactions')['entity_id']


users_20_percent_data = merged_data[merged_data['user_id'].isin(users_20_percent)]

num_returned_users = len(users_20_percent_data['user_id'].unique())

print("Users representing 51% of total suspicious transactions:")
print(users_20_percent_data['user_id'].unique())
print(f"Total number of users returned: {num_returned_users}")



# Identify Merchants representing 51% of total suspicious transactions
import pandas as pd

transactional_data_url = 'https://gist.githubusercontent.com/cloudwalk-tests/76993838e65d7e0f988f40f1b1909c97/raw/9ceae962009236d3570f46e59ce9aa334e4e290f/transactional-sample.csv'
transactional_data = pd.read_csv(transactional_data_url)[['user_id', 'merchant_id', 'transaction_amount', 'transaction_id']]

entity_summary_url = 'https://raw.githubusercontent.com/luizarnoni/CloudWalk---Risk-Analyst-Case/main/entity_summary.csv'
entity_summary = pd.read_csv(entity_summary_url)[['entity_id', 'entity_type', 'num_suspicious_transactions']]

suspicious_transactions_url = 'https://raw.githubusercontent.com/luizarnoni/CloudWalk---Risk-Analyst-Case/main/suspicious_transactions.csv'
suspicious_transactions = pd.read_csv(suspicious_transactions_url)[['transaction_id', 'score']]

merged_data = pd.merge(transactional_data, suspicious_transactions, on='transaction_id', how='left')

total_suspicious_transactions = merged_data['score'].notnull().sum()
merchants_20_percent = entity_summary.nlargest(int(0.51 * len(entity_summary)), 'num_suspicious_transactions')['entity_id']

merchants_20_percent_data = merged_data[merged_data['merchant_id'].isin(merchants_20_percent)]
total_transactions_for_returned_merchants = len(merchants_20_percent_data['merchant_id'].unique())

print("Merchants representing 51% of total suspicious transactions:")
print(merchants_20_percent_data['merchant_id'].unique())
print(f"Total number of unique transactions for returned merchants: {total_transactions_for_returned_merchants}")


# Identifying suspicious transactions involving users and merchants  as pair linked more than once

transactional_data_url = 'https://gist.githubusercontent.com/cloudwalk-tests/76993838e65d7e0f988f40f1b1909c97/raw/9ceae962009236d3570f46e59ce9aa334e4e290f/transactional-sample.csv'
transactional_data = pd.read_csv(transactional_data_url)[['user_id', 'merchant_id', 'transaction_amount', 'transaction_id']]

entity_summary_url = 'https://raw.githubusercontent.com/luizarnoni/CloudWalk---Risk-Analyst-Case/main/entity_summary.csv'
entity_summary = pd.read_csv(entity_summary_url)[['entity_id', 'entity_type', 'num_suspicious_transactions']]

suspicious_transactions_url = 'https://raw.githubusercontent.com/luizarnoni/CloudWalk---Risk-Analyst-Case/main/suspicious_transactions.csv'
suspicious_transactions = pd.read_csv(suspicious_transactions_url)[['transaction_id', 'score']]

merged_data = pd.merge(transactional_data, suspicious_transactions, on='transaction_id', how='left')

duplicated_entities = merged_data[merged_data.duplicated(subset=['user_id', 'merchant_id'], keep=False)][['user_id', 'merchant_id']].drop_duplicates()

print("Users and merchants appearing in suspicious transactions and linked more than once in the same row:")
print(duplicated_entities)



# Identifying suspicious transactions involving users linked more than once
transactional_data_url = 'https://gist.githubusercontent.com/cloudwalk-tests/76993838e65d7e0f988f40f1b1909c97/raw/9ceae962009236d3570f46e59ce9aa334e4e290f/transactional-sample.csv'
transactional_data = pd.read_csv(transactional_data_url)[['user_id', 'transaction_id']]

suspicious_transactions_url = 'https://raw.githubusercontent.com/luizarnoni/CloudWalk---Risk-Analyst-Case/main/suspicious_transactions.csv'
suspicious_transactions = pd.read_csv(suspicious_transactions_url)[['transaction_id', 'score']]

merged_data = pd.merge(transactional_data, suspicious_transactions, on='transaction_id', how='inner')

total_users_in_suspicious_transactions = merged_data['user_id'].nunique()

percentage_of_36_users = (num_returned_users / total_users_in_suspicious_transactions) * 100

print("Total number of users in the list of suspicious transactions:", total_users_in_suspicious_transactions)
print("Percentage of num_returned_users in relation to the total:", round(percentage_of_36_users, 2), "%")


# Identifying suspicious transactions involving users linked more than once
transactional_data_url = 'https://gist.githubusercontent.com/cloudwalk-tests/76993838e65d7e0f988f40f1b1909c97/raw/9ceae962009236d3570f46e59ce9aa334e4e290f/transactional-sample.csv'
transactional_data = pd.read_csv(transactional_data_url)[['merchant_id', 'transaction_id']]

suspicious_transactions_url = 'https://raw.githubusercontent.com/luizarnoni/CloudWalk---Risk-Analyst-Case/main/suspicious_transactions.csv'
suspicious_transactions = pd.read_csv(suspicious_transactions_url)[['transaction_id', 'score']]

merged_data = pd.merge(transactional_data, suspicious_transactions, on='transaction_id', how='inner')
# Get the total number of unique merchants in the list of suspicious transactions
total_merchants_in_suspicious_transactions = merged_data['merchant_id'].nunique()

percentage_of_39_merchants = (total_transactions_for_returned_merchants / total_merchants_in_suspicious_transactions) * 100

print("Total number of merchants in the list of suspicious transactions:", total_merchants_in_suspicious_transactions)
print("Percentage of total_transactions_for_returned_merchants merchants in relation to the total:", round(percentage_of_39_merchants, 2), "%")


# Calculate chargeback total
transactional_data_url = 'https://gist.githubusercontent.com/cloudwalk-tests/76993838e65d7e0f988f40f1b1909c97/raw/9ceae962009236d3570f46e59ce9aa334e4e290f/transactional-sample.csv'
transactional_data = pd.read_csv(transactional_data_url)


merged_data = pd.merge(transactional_data, suspicious_transactions, on='transaction_id', how='left')

chargeback_counts = merged_data.groupby('has_cbk').size().reset_index(name='count')

total_count = chargeback_counts['count'].sum()
chargeback_counts['percentage'] = (chargeback_counts['count'] / total_count) * 100

print("Chargeback counts and percentage:")
print(chargeback_counts[['has_cbk', 'count', 'percentage']])



#Calculate chargeback high risk transactions percentage
merged_data = pd.merge(fraud_evaluation_results, transactional_data, on='transaction_id', how='left')

high_risk_data = merged_data[merged_data['fraud_chance'] == 'high']

total_high_risk_transactions = high_risk_data.shape[0]

if total_high_risk_transactions > 0:
    # Calculate the count of chargeback_true for High-risk transactions
    chargeback_true_count_high_risk = high_risk_data[high_risk_data['has_cbk'] == True].shape[0]

    # Calculate the percentage of chargeback_true in relation to the total count of High-risk transactions
    percentage_chargeback_true_high_risk = (chargeback_true_count_high_risk / total_high_risk_transactions) * 100

    # Display the results
    print("Percentage of High-risk transactions with chargeback True:")
    print(f"{percentage_chargeback_true_high_risk:.2f}%")
else:
    print("No High-risk transactions found.")


#Calculate chargeback medium-high risk transactions
merged_data = pd.merge(fraud_evaluation_results, transactional_data, on='transaction_id', how='left')

high_risk_data = merged_data[merged_data['fraud_chance'] == 'medium-high']

total_high_risk_transactions = high_risk_data.shape[0]

if total_high_risk_transactions > 0:
    chargeback_true_count_high_risk = high_risk_data[high_risk_data['has_cbk'] == True].shape[0]

    percentage_chargeback_true_high_risk = (chargeback_true_count_high_risk / total_high_risk_transactions) * 100

    print("Percentage of medium-High-risk transactions with chargeback True:")
    print(f"{percentage_chargeback_true_high_risk:.2f}%")
else:
    print("No medium-High-risk transactions found.")


# Identify pairs that appear 5 times or more together
    
top_users_url = 'https://raw.githubusercontent.com/luizarnoni/CloudWalk---Risk-Analyst-Case/main/top_users_with_merchant_suspicious_transactions.csv'
top_users_df = pd.read_csv(top_users_url)

suspicious_pairs = top_users_df[top_users_df['count'] >= 5][['user_id', 'merchant_id']]


# Calculate the sum of each pair (user and merchant)

merged_data = pd.merge(transactional_data, suspicious_pairs, on=['user_id', 'merchant_id'], how='inner')

sum_by_pair = merged_data.groupby(['user_id', 'merchant_id']).agg({'transaction_amount': 'sum'}).reset_index()

# Calculate the total sum
total_sum = sum_by_pair['transaction_amount'].sum()

print("Sum of each pair (user and merchant):")
print(sum_by_pair)
print("\nTotal sum:", total_sum)





# Filter transactional_data for transactions with chargeback True
chargeback_true_data = transactional_data[transactional_data['has_cbk'] == True]

total_sum_chargeback_true = chargeback_true_data['transaction_amount'].sum()

print("Total sum of transactions with chargeback True:", total_sum_chargeback_true)


# Identify users that appear more than once as suspicious pair

top_users_merchants_url = 'https://raw.githubusercontent.com/luizarnoni/CloudWalk---Risk-Analyst-Case/main/top_users_with_merchant_suspicious_transactions.csv'
top_users_merchants_df = pd.read_csv(top_users_merchants_url)

user_duplicates_df = top_users_merchants_df[top_users_merchants_df.duplicated(subset=['user_id'], keep=False)]

user_count_df = user_duplicates_df.groupby(['user_id']).size().reset_index(name='count')

print("Users that appear more than once with the count:")
print(user_count_df)


#Calculate the most frequent hour for high-risk transactions


transactional_data_url = 'https://gist.githubusercontent.com/cloudwalk-tests/76993838e65d7e0f988f40f1b1909c97/raw/9ceae962009236d3570f46e59ce9aa334e4e290f/transactional-sample.csv'
transactional_data = pd.read_csv(transactional_data_url)

fraud_evaluation_url = 'https://raw.githubusercontent.com/luizarnoni/CloudWalk---Risk-Analyst-Case/main/fraud_evaluation_results.csv'
fraud_evaluation_df = pd.read_csv(fraud_evaluation_url)

merged_data = pd.merge(transactional_data, fraud_evaluation_df, on='transaction_id')

high_risk_data = merged_data[merged_data['fraud_chance'] == 'high'].copy()

high_risk_data['transaction_date'] = pd.to_datetime(high_risk_data['transaction_date'])
high_risk_data['hour'] = high_risk_data['transaction_date'].dt.hour
high_risk_data['day_of_week'] = high_risk_data['transaction_date'].dt.dayofweek  # Monday is 0 and Sunday is 6

most_frequent_hour = high_risk_data['hour'].mode().iloc[0]

print("Most frequent hour for high-risk transactions:", most_frequent_hour)

weekdays_high_risk = high_risk_data[high_risk_data['day_of_week'] < 5].copy()  # Monday to Friday
weekends_high_risk = high_risk_data[high_risk_data['day_of_week'] >= 5].copy()  # Saturday and Sunday

# Calculate the hourly distribution for weekdays and weekends
hourly_distribution_weekdays = weekdays_high_risk['hour'].value_counts(normalize=True).sort_index()
hourly_distribution_weekends = weekends_high_risk['hour'].value_counts(normalize=True).sort_index()

# Display the results
print("\nHourly distribution for weekdays:")
print(hourly_distribution_weekdays)

print("\nHourly distribution for weekends:")
print(hourly_distribution_weekends)




  fraud_chance  count  percentage
0  no evidence   2768   86.527040
1  medium-high    251    7.846202
2       medium    108    3.376055
3         high     72    2.250703
Users representing 51% of total suspicious transactions:
[ 7725 10241 21768 11750  7695 40779 77959 17807 50643 99396 56877 71424
 69588 96025 79054 91637 17929  9853  3584 27555 86411 42677 73271 34548
 83722 78262 67519 27657 11452 58905 28218 91972 57594 49106 75710 30874]
Total number of users returned: 36
Merchants representing 51% of total suspicious transactions:
[73271 68953 42356 66876 17275 30121 81795 15326 29214 63050 38568 44927
 99510 91972 48126 97291  4153  1308 36617  4705 18267 98272 26981 55854
 89943 77130 33192 20663 65330 75917 18344 39253 36700 38337 53041 36929
 71378 49919  8942]
Total number of unique transactions for returned merchants: 39
Users and merchants appearing in suspicious transactions and linked more than once in the same row:
      user_id  merchant_id
1        2708        92895
7