# IBM Anti-Money-Laundering ETL Project (Team Epsilon)
###### Contributors: Nathon Burwick, Toyin Olaye, Cole Valentyn, Ariel Richardson, Talita Urzeda, Taylor Gibson

In [1]:
# Import Dependencies
import pandas as pd
import numpy as np
import os
import sqlalchemy
from pymongo import MongoClient
from sqlalchemy import Column, Integer, String, Float
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
import datetime as dt
from pprint import pprint
import re

In [2]:
# Define a read_file function that transforms data into a Pandas DataFrame
def read_file(filename):
    """Parse a laundering-patterns text file into a typed Pandas DataFrame.

    The file is read from the ``Resources`` directory. Transaction rows are
    comma-separated and grouped between marker lines that start with ``BEGIN``
    and ``END``. Every row inside a group is extended with two extra fields:
    ``pattern_type`` (the text after ``"- "`` on the BEGIN line, cut at the
    first ``:``) and ``group_id`` (``"<file_type>_<n>"`` where ``n`` counts
    BEGIN lines seen so far). Marker lines and blank lines are dropped.

    Parameters
    ----------
    filename : str
        Name of the file inside ``Resources``. It must contain a
        ``-<file_type>_`` segment (e.g. ``HI-Large_Patterns.txt`` gives
        ``Large``), which is used as the ``group_id`` prefix.

    Returns
    -------
    pandas.DataFrame
        One row per transaction with the 13 columns listed in ``cols`` and
        the dtypes listed in ``dtypes_dict``.

    Raises
    ------
    FileNotFoundError
        If the file does not exist under ``Resources``.
    ValueError
        If ``filename`` has no ``-<file_type>_`` segment.
    """
    # Establish Column Names
    cols = ['timestamp', 'from_bank', 'from_account','to_bank','to_account',
            'amount_received', 'rec_currency', 'amount_paid', 'payment_currency',
            'payment_format', 'is_laundering', 'pattern_type', 'group_id']

    # Read File (context manager guarantees the handle is closed)
    with open(os.path.join('Resources', filename), 'r') as file:
        file_data = [l.strip() for l in file]

    # Filter out blank lines. Building a new list avoids mutating the list
    # while iterating over it, which skipped consecutive blank lines.
    file_data = [l for l in file_data if l != '']

    # Create basis for group_id (file_ref + grouping number) using regex
    file_match = re.search(r"-(.+)_", filename)
    if file_match is None:
        raise ValueError(
            f"Cannot derive a group prefix from {filename!r}; "
            "expected a name like 'HI-Large_Patterns.txt'"
        )
    file_type = file_match.group(1)
    group_id = 0

    # Single pass: tag rows with pattern_type & group_id and drop marker lines.
    # Markers are matched at the start of the line only, so tagging and
    # removal agree on what counts as a BEGIN/END line.
    rows = []
    group_suffix = None  # None while outside a BEGIN ... END group
    for l in file_data:
        if l.startswith('BEGIN'):
            group_id += 1
            pattern_type = re.findall(r"- (.+)",l)[0]
            if ':' in pattern_type:
                pattern_type = pattern_type.split(':')[0]
            group_suffix = f",{pattern_type},{file_type}_{group_id}"
        elif l.startswith('END'):
            group_suffix = None
        elif group_suffix is not None:
            # NOTE: rows of a group with no closing END line are still tagged.
            rows.append(l + group_suffix)
        else:
            # Rows outside any group are kept untagged.
            rows.append(l)

    # Use .split() to convert data rows into list of lists
    rows = [r.split(',') for r in rows]

    # Convert Data into DataFrame with established column names
    file_df = pd.DataFrame(rows, columns=cols)

    # Establish Data Types. The datetime unit is explicit because pandas 2.x
    # rejects the unit-less 'datetime64' in astype().
    dtypes_dict = {'timestamp': 'datetime64[ns]', 'from_bank': 'string', 'from_account': 'string' ,
                   'to_bank': 'string', 'to_account': 'string', 'amount_received': 'float64',
                   'rec_currency': 'string', 'amount_paid': 'float64', 'payment_currency': 'string',
                   'payment_format': 'string', 'is_laundering': 'int64', 'pattern_type': 'string', 'group_id': 'string'}

    # Convert Data Types
    file_df = file_df.astype(dtypes_dict)

    # Return DataFrame to variable
    return file_df

In [3]:
# Read the Large, Medium and Small HI pattern files into DataFrames (in that order)
hi_large, hi_med, hi_small = (
    read_file(pattern_file)
    for pattern_file in (
        'HI-Large_Patterns.txt',
        'HI-Medium_Patterns.txt',
        'HI-Small_Patterns.txt',
    )
)

In [4]:
# Display Preview of Large DataFrame
# (bare last expression, so the notebook renders the frame; the output below
# elides the middle rows with "...")
hi_large

Unnamed: 0,timestamp,from_bank,from_account,to_bank,to_account,amount_received,rec_currency,amount_paid,payment_currency,payment_format,is_laundering,pattern_type,group_id
0,2022-08-09 05:14:00,00952,8139F54E0,0111632,8062C56E0,5331.44,US Dollar,5331.44,US Dollar,ACH,1,STACK,Large_1
1,2022-08-13 13:09:00,0111632,8062C56E0,008456,81363F620,5602.59,US Dollar,5602.59,US Dollar,ACH,1,STACK,Large_1
2,2022-08-15 07:40:00,0118693,823D5EB90,013729,801CF2E60,1400.54,US Dollar,1400.54,US Dollar,ACH,1,STACK,Large_1
3,2022-08-15 14:19:00,013729,801CF2E60,0123621,81A7090F0,1467.94,US Dollar,1467.94,US Dollar,ACH,1,STACK,Large_1
4,2022-08-13 12:40:00,0024750,81363F410,0213834,808757B00,16898.29,US Dollar,16898.29,US Dollar,ACH,1,STACK,Large_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
137931,2022-11-08 09:12:00,01538,8008279E0,012883,801345490,5698.52,Euro,5698.52,Euro,ACH,1,FAN-OUT,Large_16467
137932,2022-11-20 18:34:00,01538,8008279E0,01226,8001672F0,6043.82,Euro,6043.82,Euro,ACH,1,FAN-OUT,Large_16467
137933,2022-11-25 11:27:00,01538,8008279E0,01213,8006E6420,9483.76,Euro,9483.76,Euro,ACH,1,FAN-OUT,Large_16467
137934,2022-11-28 09:15:00,01538,8008279E0,001827,80117E140,11919.51,Euro,11919.51,Euro,ACH,1,FAN-OUT,Large_16467


In [5]:
# Display Data Types (column dtypes and non-null counts) of the Large DataFrame
hi_large.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137936 entries, 0 to 137935
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   timestamp         137936 non-null  datetime64[ns]
 1   from_bank         137936 non-null  string        
 2   from_account      137936 non-null  string        
 3   to_bank           137936 non-null  string        
 4   to_account        137936 non-null  string        
 5   amount_received   137936 non-null  float64       
 6   rec_currency      137936 non-null  string        
 7   amount_paid       137936 non-null  float64       
 8   payment_currency  137936 non-null  string        
 9   payment_format    137936 non-null  string        
 10  is_laundering     137936 non-null  int64         
 11  pattern_type      137936 non-null  string        
 12  group_id          137936 non-null  string        
dtypes: datetime64[ns](1), float64(2), int64(1), string(9)
memor

In [6]:
# Display Preview of Medium DataFrame
# (bare last expression, so the notebook renders the frame; the output below
# elides the middle rows with "...")
hi_med

Unnamed: 0,timestamp,from_bank,from_account,to_bank,to_account,amount_received,rec_currency,amount_paid,payment_currency,payment_format,is_laundering,pattern_type,group_id
0,2022-09-01 05:14:00,00952,8139F54E0,0111632,8062C56E0,5331.44,US Dollar,5331.44,US Dollar,ACH,1,STACK,Medium_1
1,2022-09-03 13:09:00,0111632,8062C56E0,008456,81363F620,5602.59,US Dollar,5602.59,US Dollar,ACH,1,STACK,Medium_1
2,2022-09-01 07:40:00,0118693,823D5EB90,013729,801CF2E60,1400.54,US Dollar,1400.54,US Dollar,ACH,1,STACK,Medium_1
3,2022-09-01 14:19:00,013729,801CF2E60,0123621,81A7090F0,1467.94,US Dollar,1467.94,US Dollar,ACH,1,STACK,Medium_1
4,2022-09-02 12:40:00,0024750,81363F410,0213834,808757B00,16898.29,US Dollar,16898.29,US Dollar,ACH,1,STACK,Medium_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22738,2022-09-16 23:44:00,01818,80D7BB990,024842,8028966B0,4570.11,US Dollar,4570.11,US Dollar,ACH,1,CYCLE,Medium_2756
22739,2022-09-17 03:20:00,024842,8028966B0,0172269,82A986A10,4410.67,US Dollar,4410.67,US Dollar,ACH,1,CYCLE,Medium_2756
22740,2022-09-19 17:12:00,0172269,82A986A10,025644,81DB33B40,4410.67,US Dollar,4410.67,US Dollar,ACH,1,CYCLE,Medium_2756
22741,2022-09-20 16:18:00,025644,81DB33B40,0295233,8230CE9C0,3414.74,UK Pound,3414.74,UK Pound,ACH,1,CYCLE,Medium_2756


In [7]:
# Display Preview of Small DataFrame
# (bare last expression, so the notebook renders the frame; the output below
# elides the middle rows with "...")
hi_small

Unnamed: 0,timestamp,from_bank,from_account,to_bank,to_account,amount_received,rec_currency,amount_paid,payment_currency,payment_format,is_laundering,pattern_type,group_id
0,2022-09-01 00:06:00,021174,800737690,012,80011F990,2848.96,Euro,2848.96,Euro,ACH,1,FAN-OUT,Small_1
1,2022-09-01 04:33:00,021174,800737690,020,80020C5B0,8630.40,Euro,8630.40,Euro,ACH,1,FAN-OUT,Small_1
2,2022-09-01 09:14:00,021174,800737690,020,80006A5E0,35642.49,Yuan,35642.49,Yuan,ACH,1,FAN-OUT,Small_1
3,2022-09-01 09:56:00,021174,800737690,00220,8007A5B70,5738987.96,US Dollar,5738987.96,US Dollar,ACH,1,FAN-OUT,Small_1
4,2022-09-01 11:28:00,021174,800737690,001244,80093C0D0,7254.53,US Dollar,7254.53,US Dollar,ACH,1,FAN-OUT,Small_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3204,2022-09-14 14:10:00,028827,806DEA990,0041407,80F454910,14141.57,Swiss Franc,14141.57,Swiss Franc,ACH,1,FAN-IN,Small_368
3205,2022-09-14 15:49:00,016643,80762FE70,0041407,80F454910,2860.78,Swiss Franc,2860.78,Swiss Franc,ACH,1,FAN-IN,Small_368
3206,2022-09-14 19:51:00,0013917,80E16D0B0,0041407,80F454910,7413.71,Swiss Franc,7413.71,Swiss Franc,ACH,1,FAN-IN,Small_368
3207,2022-09-10 22:42:00,0049365,812A09D40,0119,812A09CF0,41927.21,Saudi Riyal,41927.21,Saudi Riyal,ACH,1,RANDOM,Small_369
