## 1. Dataset Module

In [1]:
def read_transactions_file(file_path):
    """Parse a colon-delimited transaction file into a dictionary.

    Every non-blank line is expected to hold at least seven ':'-separated
    fields, in this order: user id, transaction id, description, amount,
    x coordinate, y coordinate, fraud flag.

    Args:
        file_path: Path of the text file to read.

    Returns:
        dict: Maps each transaction id (str) to a dict with the keys
        'user_id', 'transaction_id', 'description', 'amount',
        'x_coordinate', 'y_coordinate' and 'is_fraudulent'. If a
        transaction id appears on several lines, the last line wins.

    Raises:
        ValueError: If amount or a coordinate is not a valid float, or the
            fraud flag is not a recognisable boolean.
        IndexError: If a non-blank line has fewer than seven fields.
    """

    def parse_flag(raw):
        # bool() of any non-empty string is True (even "0" or "False"), so
        # the textual flag has to be interpreted explicitly.
        token = raw.strip().lower()
        if token in ('1', 'true', 't', 'yes', 'y'):
            return True
        if token in ('', '0', 'false', 'f', 'no', 'n'):
            return False
        try:
            # Accept other numeric spellings such as "0.0" or "1.0".
            return float(token) != 0
        except ValueError:
            raise ValueError('Unrecognised fraud flag: {!r}'.format(raw)) from None

    transactions_data = {}

    with open(file_path, 'r') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue

            components = stripped.split(':')
            transaction_id = components[1]

            transactions_data[transaction_id] = {
                'user_id': components[0],
                'transaction_id': transaction_id,
                'description': components[2],
                'amount': float(components[3]),
                'x_coordinate': float(components[4]),
                'y_coordinate': float(components[5]),
                'is_fraudulent': parse_flag(components[6]),
            }

    return transactions_data


In [2]:
# Load the dataset file into a dictionary keyed by transaction id
transactions_data = read_transactions_file('datasets/Transaction.txt')
# Bare last expression so the notebook displays the parsed dictionary
transactions_data


{'500000': {'user_id': '21',
  'transaction_id': '500000',
  'description': 'HMS HOST-BOS AIRPT',
  'amount': 36.55,
  'x_coordinate': 884.0,
  'y_coordinate': 755.0,
  'is_fraudulent': True},
 '500001': {'user_id': '21',
  'transaction_id': '500001',
  'description': 'H & M',
  'amount': 72.85,
  'x_coordinate': 727.0,
  'y_coordinate': 827.0,
  'is_fraudulent': True},
 '500002': {'user_id': '21',
  'transaction_id': '500002',
  'description': 'VERIZON WEB',
  'amount': 61.17,
  'x_coordinate': 926.0,
  'y_coordinate': 841.0,
  'is_fraudulent': True},
 '500003': {'user_id': '21',
  'transaction_id': '500003',
  'description': "MARIO'S ITALIAN RSTR",
  'amount': 47.02,
  'x_coordinate': 860.0,
  'y_coordinate': 886.0,
  'is_fraudulent': True},
 '500004': {'user_id': '21',
  'transaction_id': '500004',
  'description': 'LORD & TAYLOR',
  'amount': 10.12,
  'x_coordinate': 910.0,
  'y_coordinate': 949.0,
  'is_fraudulent': True},
 '500005': {'user_id': '21',
  'transaction_id': '500005',

## 2. Distance Module

In [3]:
import math

def distance_between_transactions(transaction1, transaction2):
    """Return the Euclidean distance between the locations of two transactions.

    Both arguments are transaction dicts providing 'x_coordinate' and
    'y_coordinate'.
    """
    start = (transaction1['x_coordinate'], transaction1['y_coordinate'])
    end = (transaction2['x_coordinate'], transaction2['y_coordinate'])
    # Squared gap along each axis, x first and then y
    squared_gaps = [(b - a) ** 2 for a, b in zip(start, end)]
    return math.sqrt(squared_gaps[0] + squared_gaps[1])


def distance_between_users_transactions(user1_transactions, user2_transactions):
    """Return the distances for every pairing of the two users' transactions.

    The result is a flat list: for each transaction of the first user (outer
    order) the distance to each transaction of the second user (inner order),
    as computed by distance_between_transactions.
    """
    return [
        distance_between_transactions(first, second)
        for first in user1_transactions
        for second in user2_transactions
    ]


In [4]:
# This cell relies on the transactions_data dictionary built earlier in the notebook,
# i.e. transactions_data = read_transactions_file('datasets/Transaction.txt')

# Collect every transaction whose user_id is '21' (called user1 below)
user1_transactions = [transaction_data for transaction_data in transactions_data.values() if transaction_data['user_id'] == '21']

# Collect every transaction whose user_id is '22' (called user2 below)
user2_transactions = [transaction_data for transaction_data in transactions_data.values() if transaction_data['user_id'] == '22']

# Distance between the first two transactions found for user1
distance = distance_between_transactions(user1_transactions[0], user1_transactions[1])

# Distances for every (user1 transaction, user2 transaction) pair
distances = distance_between_users_transactions(user1_transactions, user2_transactions)



In [5]:
# Report the single user1 distance and the full list of user1/user2 pair distances computed above
print('The distance between the first two transactions of user1 is: {}'.format(distance))
print('The distances between the transactions of user1 and user2 are: {}'.format(distances))

The distance between the first two transactions of user1 is: 172.72232050317064
The distances between the transactions of user1 and user2 are: [197.9520143873257, 86.33075929238663, 106.53168542738823, 238.83885781003056, 61.326992425847855, 156.02884348735012, 181.37254478007415, 260.71632093139084, 111.16204388189342, 130.23056476879765, 47.50789408087881, 111.83022847155415, 56.00892785976178, 33.24154027718932, 261.5893728728291, 125.29964086141668, 70.00714249274856, 185.5693940282179, 143.69411957348845, 120.70211265756701, 73.06161783043132, 71.50524456289902, 149.11069713471264, 107.35455276791944, 202.89159667172024, 130.06152390311286, 45.09988913511872, 178.39843048636948, 241.49741199441456, 96.08329719571451, 255.8378392654222, 209.40391591371923, 196.16319736382766, 251.66843266488547, 88.61715409558128, 127.57742747053649, 202.7732724004818, 130.29965464267354, 175.60182231400674, 141.8731828077456, 185.776747737708, 194.99743588057768, 47.29693436154187, 267.15164233071

## 3. statistics_module
You will implement a statistics_module. This module will contain the following functions:

a) A function that returns the average transactions of any user and of all users.

b) A function that returns the mode of transactions of any user and that of all users.

c) A function that returns the median of all transactions of a user and that of all users.

d) A function that returns the interquartile range of any user’s transactions and of all users.

e) A function that returns the location centroid of any user, based on their transaction locations.

f) A function that computes the standard deviation of any specific user’s transactions.

g) A function that determines whether a transaction is fraudulent or not. It should provide details of such transactions.

h) A function that returns an abnormal transaction for any given user.

i) A function that computes the Z score of any user’s transactions and for all users’ transactions.

j) A function that computes the frequency of transactions at any given location.

k) A function that returns the outlier of any location and of any user.

l) A function that returns the nth percentiles of transactions of any user and of all users

## a) A function that returns the average transactions of any user and of all users.

In [6]:
def average_transactions(transaction_data, user_id=None):
    """
    Compute the mean transaction amount.

    With user_id left as None every transaction in transaction_data is
    averaged; otherwise only the transactions whose 'user_id' equals
    user_id contribute. Raises ZeroDivisionError when no transaction is
    selected.
    """
    selected_amounts = []
    for record in transaction_data.values():
        # No filtering at all when user_id is None
        if user_id is None or record['user_id'] == user_id:
            selected_amounts.append(record['amount'])
    return sum(selected_amounts) / len(selected_amounts)


## b) A function that returns the mode of transactions of any user and that of all users.


In [7]:
def mode_transactions(user_transactions, all_transactions):
    """Return the most frequent transaction amount of one user and of all users.

    Args:
        user_transactions: Iterable of transaction dicts of a single user.
        all_transactions: Iterable of transaction dicts of every user.

    Returns:
        dict: {'user_mode': <amount>, 'all_mode': <amount>}. When several
        amounts share the highest count, the one that occurs first in the
        input is returned, so the result does not depend on hash order.

    Raises:
        ValueError: If either collection contains no transactions.
    """
    from collections import Counter

    def most_frequent(amounts):
        if not amounts:
            raise ValueError('cannot compute the mode of an empty collection')
        # One counting pass instead of calling list.count() once per distinct
        # value; the Counter keeps first-seen order and max() returns the
        # first key with the highest count.
        counts = Counter(amounts)
        return max(counts, key=counts.get)

    user_amounts = [transaction['amount'] for transaction in user_transactions]
    all_amounts = [transaction['amount'] for transaction in all_transactions]

    return {'user_mode': most_frequent(user_amounts), 'all_mode': most_frequent(all_amounts)}

## c) A function that returns the median of all transactions of a user and that of all users.


In [8]:
def median_transactions(user_transactions, all_transactions):
    """Return the median transaction amount of one user and of all users.

    The result is {'user_median': ..., 'all_median': ...}. For an even number
    of amounts the median is the mean of the two middle values.
    """

    def middle_value(ordered):
        # 'ordered' must already be sorted in ascending order
        count = len(ordered)
        half = count // 2
        if count % 2:
            return ordered[half]
        return (ordered[half] + ordered[half - 1]) / 2

    ordered_user = sorted(record['amount'] for record in user_transactions)
    ordered_all = sorted(record['amount'] for record in all_transactions)

    return {'user_median': middle_value(ordered_user), 'all_median': middle_value(ordered_all)}



## d) A function that returns the interquartile range of any user’s transactions and of all users.


In [9]:
def interquartile_range(user_transactions, all_transactions):
    """Return the interquartile range of amounts for one user and for all users.

    Quartiles are taken without interpolation: Q1 is the sorted amount at
    index n // 4 and Q3 the one at index 3 * n // 4. The result is
    {'user_iqr': Q3 - Q1 for the user, 'all_iqr': Q3 - Q1 for everyone}.
    """

    def quartile_spread(ordered):
        # 'ordered' must already be sorted in ascending order
        size = len(ordered)
        first_quartile = ordered[size // 4]
        third_quartile = ordered[3 * size // 4]
        return third_quartile - first_quartile

    ordered_user = sorted(record['amount'] for record in user_transactions)
    ordered_all = sorted(record['amount'] for record in all_transactions)

    return {'user_iqr': quartile_spread(ordered_user), 'all_iqr': quartile_spread(ordered_all)}



## e) A function that returns the location centroid of any user, based on their transaction locations.


In [10]:
def location_centroid(user_transactions):
    """Return the mean location of a user's transactions.

    The result is {'user_centroid': (mean_x, mean_y)}, where each component
    is the arithmetic mean of the corresponding coordinate.
    """

    def axis_mean(key):
        values = [record[key] for record in user_transactions]
        return sum(values) / len(values)

    return {'user_centroid': (axis_mean('x_coordinate'), axis_mean('y_coordinate'))}



## f) A function that computes the standard deviation of any specific user’s transactions.


In [11]:
import numpy as np
def compute_std_dev(user_id, transactions):
    """Return the population standard deviation of one user's amounts.

    Args:
        user_id: Value compared against each transaction's 'user_id'.
        transactions: Mapping of transaction id -> transaction dict.

    Returns:
        float: sqrt(sum((x - mean) ** 2) / n) over the user's amounts.

    Raises:
        ZeroDivisionError: If the user has no transactions.
    """
    # Get a list of transaction amounts for the user
    amounts = [t['amount'] for t in transactions.values() if t['user_id'] == user_id]
    if not amounts:
        # Same exception type the bare division by len(amounts) produced,
        # raised before numpy is asked for the mean of an empty list.
        raise ZeroDivisionError('no transactions found for user {!r}'.format(user_id))
    # The mean is loop-invariant: compute it once instead of once per amount
    mean_amount = np.mean(amounts)
    variance = sum((x - mean_amount) ** 2 for x in amounts) / len(amounts)
    return math.sqrt(variance)


## g) A function that determines whether a transaction is fraudulent or not. It should provide details of such transactions.


In [12]:
def is_fraudulent(transaction_data, transaction_id):
    """Flag a transaction as suspicious based on its amount or its location.

    Returns a (flag, transaction) tuple. The flag is True when the amount
    exceeds the owning user's mean amount by more than three standard
    deviations, or when the x or y coordinate lies more than two standard
    deviations from the mean location. Raises ValueError when
    transaction_id is not present in transaction_data.
    """
    transaction = transaction_data.get(transaction_id)
    if not transaction:
        raise ValueError(f"Transaction ID {transaction_id} not found.")

    # Amount statistics are restricted to the user who owns the transaction
    owner_records = [t for t in transaction_data.values() if t['user_id'] == transaction['user_id']]
    owner_amounts = [t['amount'] for t in owner_records]
    amount_mean = np.mean(owner_amounts)
    amount_std = np.std(owner_amounts)

    # NOTE(review): location statistics are computed over every user's
    # transactions, unlike the per-user amount statistics - confirm intended.
    all_points = [(t['x_coordinate'], t['y_coordinate']) for t in transaction_data.values()]
    center_x, center_y = np.mean(all_points, axis=0)
    spread_x, spread_y = np.std(all_points, axis=0)

    # Only unusually large amounts are flagged (one-sided test)
    if (transaction['amount'] - amount_mean) > (3 * amount_std):
        return True, transaction

    if np.abs((transaction['x_coordinate'] - center_x) / spread_x) > 2:
        return True, transaction

    if np.abs((transaction['y_coordinate'] - center_y) / spread_y) > 2:
        return True, transaction

    return False, transaction


## h) A function that returns an abnormal transaction for any given user.


In [13]:

def get_abnormal_transaction(user_id, transactions=None):
    """Return the transactions of a user whose amount is an IQR outlier.

    An amount is abnormal when it lies below Q1 - 1.5 * IQR or above
    Q3 + 1.5 * IQR, with the quartiles taken from the user's own amounts
    via np.percentile.

    Args:
        user_id: Value compared against each transaction's 'user_id'.
        transactions: Optional mapping of transaction id -> transaction
            dict. Defaults to the notebook-level ``transactions_data`` so
            existing single-argument calls keep working.

    Returns:
        list: The abnormal transaction dicts, in mapping order; empty when
        the user has no transactions or none is abnormal.
    """
    if transactions is None:
        transactions = transactions_data

    user_records = [t for t in transactions.values() if t['user_id'] == user_id]
    if not user_records:
        # Quartiles of an empty sample are undefined; nothing can be abnormal
        return []

    q1, q3 = np.percentile([t['amount'] for t in user_records], [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)

    return [t for t in user_records if t['amount'] < lower_bound or t['amount'] > upper_bound]

## i) A function that computes the Z score of any user’s transactions and for all users’ transactions.


In [14]:
def compute_z_score(user_id=None, transactions=None):
    """Return the z-score of each transaction amount.

    Args:
        user_id: When truthy, only that user's transactions are scored and
            the mean/standard deviation come from that user's amounts.
            When None (or any falsy value) all transactions are used.
        transactions: Optional mapping of transaction id -> transaction
            dict. Defaults to the notebook-level ``transactions_data`` so
            existing calls keep working.

    Returns:
        list: (transaction_id, z_score) tuples in mapping order, where
        z_score = (amount - mean) / population standard deviation. Empty
        when no transaction is selected. There is no guard against a zero
        standard deviation; the division is left to numpy in that case.
    """
    if transactions is None:
        transactions = transactions_data

    selected = list(transactions.values())
    if user_id:
        selected = [t for t in selected if t['user_id'] == user_id]
    if not selected:
        # Avoid asking numpy for the mean of an empty sample
        return []

    amounts = [t['amount'] for t in selected]
    mean = np.mean(amounts)
    std = np.std(amounts)
    return [(t['transaction_id'], (t['amount'] - mean) / std) for t in selected]


## j) A function that computes the frequency of transactions at any given location.


In [15]:
def compute_location_frequency(x, y, transactions=None):
    """Return the fraction of transactions located exactly at (x, y).

    Args:
        x: x coordinate to match (compared with ==).
        y: y coordinate to match (compared with ==).
        transactions: Optional mapping of transaction id -> transaction
            dict. Defaults to the notebook-level ``transactions_data`` so
            existing two-argument calls keep working.

    Returns:
        float: matching transactions divided by all transactions, or 0.0
        when the dataset is empty.
    """
    if transactions is None:
        transactions = transactions_data

    records = transactions.values()
    total = len(records)
    if total == 0:
        # No transactions at all: report 0.0 instead of dividing by zero
        return 0.0

    matches = sum(1 for t in records if t['x_coordinate'] == x and t['y_coordinate'] == y)
    return matches / total


## k) A function that returns the outlier of any location and of any user.


In [16]:
def find_outliers(user_id, transactions):
    """Return a user's transactions that lie unusually far from their centroid.

    The centroid is the mean (x, y) of the user's transaction locations. A
    transaction is an outlier when its distance to the centroid is greater
    than median + 1.5 * IQR of all such distances for that user.
    """
    user_records = [t for t in transactions.values() if t['user_id'] == user_id]
    points = [(t['x_coordinate'], t['y_coordinate']) for t in user_records]

    # Mean position of the user's transactions
    center_x = sum(p[0] for p in points) / len(points)
    center_y = sum(p[1] for p in points) / len(points)

    # One distance per transaction, aligned with user_records
    spans = [math.sqrt((px - center_x) ** 2 + (py - center_y) ** 2) for px, py in points]

    middle = np.median(spans)
    lower_quartile = np.percentile(spans, 25)
    upper_quartile = np.percentile(spans, 75)
    spread = upper_quartile - lower_quartile
    cutoff = middle + 1.5 * spread

    return [record for record, span in zip(user_records, spans) if span > cutoff]



## l) A function that returns the nth percentiles of transactions of any user and of all users

In [17]:
def compute_percentiles(user_id, transactions, percentiles):
    """Return the requested percentiles of transaction amounts.

    Args:
        user_id: User whose amounts are analysed. Pass None to use the
            amounts of all users.
        transactions: Mapping of transaction id -> transaction dict.
        percentiles: Iterable of fractions (e.g. 0.25 for the 25th
            percentile); each value is multiplied by 100 before being
            handed to np.percentile.

    Returns:
        dict: Maps each requested fraction to the corresponding amount.
    """
    # user_id=None selects every transaction, mirroring average_transactions
    amounts = [
        t['amount']
        for t in transactions.values()
        if user_id is None or t['user_id'] == user_id
    ]
    # Compute the specified percentiles of the transaction amounts
    return {p: np.percentile(amounts, p * 100) for p in percentiles}


In [18]:
# Exercise the statistics functions on user '21', using transactions_data and
# user1_transactions created in earlier cells.
print('average_transactions', average_transactions(transactions_data, "21"))
print("mode of transaction", mode_transactions(user1_transactions, transactions_data.values()))
print("median of transaction", median_transactions(user1_transactions, transactions_data.values()))
print("compute_std_dev", compute_std_dev("21", transactions_data))
# NOTE: the label printed below is "find_abnormal_transaction" but the function called is is_fraudulent
print("find_abnormal_transaction", is_fraudulent(transactions_data, "500105"))
print("get_abnormal_transaction", get_abnormal_transaction("21"))
print("compute_z_score", compute_z_score("21"))
print("compute_location_frequency", compute_location_frequency(133, 191.0))
print("find_outliers", find_outliers("21", transactions_data))
# Percentiles are passed as fractions (0.25 -> 25th percentile)
print("compute_percentiles", compute_percentiles("21", transactions_data, [0.25, 0.5, 0.75]))



average_transactions 408.64609090909096
mode of transaction {'user_mode': 3.43, 'all_mode': 38.85}
median of transaction {'user_median': 54.745, 'all_median': 92.69999999999999}
compute_std_dev 1129.6828600429055
find_abnormal_transaction (True, {'user_id': '21', 'transaction_id': '500105', 'description': 'HOTEL PLEASURE', 'amount': 3976.08, 'x_coordinate': 133.0, 'y_coordinate': 191.0, 'is_fraudulent': True})
get_abnormal_transaction [{'user_id': '21', 'transaction_id': '500100', 'description': 'HOTEL PLEASURE', 'amount': 4042.34, 'x_coordinate': 189.0, 'y_coordinate': 81.0, 'is_fraudulent': True}, {'user_id': '21', 'transaction_id': '500101', 'description': 'CAFE QWERTY', 'amount': 3949.97, 'x_coordinate': 68.0, 'y_coordinate': 154.0, 'is_fraudulent': True}, {'user_id': '21', 'transaction_id': '500102', 'description': 'whole flash', 'amount': 4134.79, 'x_coordinate': 191.0, 'y_coordinate': 96.0, 'is_fraudulent': True}, {'user_id': '21', 'transaction_id': '500103', 'description': 'AMA