In [10]:
import pandas as pd

# Pairs of (label written to the 'entity_type' column, id column to group on).
ENTITY_COLUMNS = (('user', 'user_id'), ('merchant', 'merchant_id'))


def find_repeated_entity_rows(transactions):
    """Return the rows whose user_id or merchant_id occurs more than once.

    ``keep=False`` flags *every* occurrence of a repeated id. The pandas
    default (``keep='first'``) leaves the first occurrence unflagged, which
    undercounts each entity by one transaction and can drop entities that
    appear exactly twice.

    Args:
        transactions: DataFrame with 'user_id' and 'merchant_id' columns.

    Returns:
        The subset of ``transactions`` (original order and index) in which
        the row's user_id or merchant_id is shared with at least one other row.
    """
    repeated_user = transactions['user_id'].duplicated(keep=False)
    repeated_merchant = transactions['merchant_id'].duplicated(keep=False)
    return transactions[repeated_user | repeated_merchant]


def summarize_entities(repeated_rows):
    """Aggregate per-entity statistics for entities seen in more than one row.

    Users are processed first, then merchants, each in order of first
    appearance in ``repeated_rows``.

    Args:
        repeated_rows: DataFrame with 'user_id', 'merchant_id',
            'transaction_id', 'has_cbk' and 'transaction_amount' columns.

    Returns:
        A dict of equal-length lists keyed by 'entity_id', 'entity_type',
        'num_suspicious_transactions', 'num_chargebacks',
        'sum_transaction_amount' and 'transaction_ids' (comma-separated
        string of the entity's distinct transaction ids).
    """
    result_dict = {
        'entity_id': [],
        'entity_type': [],
        'num_suspicious_transactions': [],
        'num_chargebacks': [],
        'sum_transaction_amount': [],
        'transaction_ids': [],
    }

    for entity_type, entity_id_col in ENTITY_COLUMNS:
        # sort=False keeps groups in order of first appearance.
        for entity_id, entity_data in repeated_rows.groupby(entity_id_col, sort=False):
            num_suspicious_transactions = len(entity_data)

            # A row can be present only because its *other* id is repeated,
            # so single-row groups still have to be skipped here.
            if num_suspicious_transactions <= 1:
                continue

            # Join the distinct transaction ids into a comma-separated string.
            transaction_ids = ','.join(
                entity_data['transaction_id'].astype(str).unique()
            )

            result_dict['entity_id'].append(entity_id)
            result_dict['entity_type'].append(entity_type)
            result_dict['num_suspicious_transactions'].append(num_suspicious_transactions)
            # NOTE(review): assumes has_cbk is boolean (or 0/1) so that the sum
            # is a chargeback count - confirm against the source data.
            result_dict['num_chargebacks'].append(entity_data['has_cbk'].sum())
            result_dict['sum_transaction_amount'].append(
                entity_data['transaction_amount'].sum()
            )
            result_dict['transaction_ids'].append(transaction_ids)

    return result_dict


# The pipeline stays at module level (no main() wrapper) so that, when run as
# a script or notebook cell, every intermediate name remains a global.
if __name__ == "__main__":
    # Load suspicious transactions from CSV
    suspicious_transactions = pd.read_csv('suspicious_transactions.csv')

    # Load the full transactional data from the provided URL
    transactional_data_url = 'https://gist.githubusercontent.com/cloudwalk-tests/76993838e65d7e0f988f40f1b1909c97/raw/9ceae962009236d3570f46e59ce9aa334e4e290f/transactional-sample.csv'
    full_transaction_data = pd.read_csv(transactional_data_url)

    # Attach the full transaction details to each suspicious transaction.
    # NOTE(review): assumes the two files share no column other than
    # 'transaction_id'; otherwise merge suffixes the overlaps with _x/_y and
    # the column lookups in the helpers would fail - confirm.
    merged_data = pd.merge(suspicious_transactions, full_transaction_data, on='transaction_id', how='inner')

    # Keep every suspicious transaction whose user or merchant is repeated
    duplicate_entities = find_repeated_entity_rows(merged_data)

    # Per-entity statistics for users/merchants with more than one transaction
    result_dict = summarize_entities(duplicate_entities)

    # Create a DataFrame from the dictionary
    result_df = pd.DataFrame(result_dict)

    # Sort by number of suspicious transactions, then by total amount (both descending)
    result_df = result_df.sort_values(by=['num_suspicious_transactions', 'sum_transaction_amount'], ascending=[False, False])

    # Save the sorted results to a CSV file
    result_df.to_csv('entity_summary.csv', index=False)
