# IMPORT DATA, PACKAGES, AND FUNCTIONS

## import the relevant functions and packages

In [1]:
import pandas as pd
import numpy as np

## download the OFAC list from the web

In [2]:
ofac_list_download = pd.read_csv('https://www.treasury.gov/ofac/downloads/sdn.csv', header=None)

In [3]:
ofac_list = ofac_list_download[[0,1,2]]
ofac_list.columns = ['uid', 'name', 'entity_type']
ofac_list = ofac_list.dropna()
ofac_list

Unnamed: 0,uid,name,entity_type
0,36,AEROCARIBBEAN AIRLINES,-0-
1,173,"ANGLO-CARIBBEAN CO., LTD.",-0-
2,306,BANCO NACIONAL DE CUBA,-0-
3,424,BOUTIQUE LA MAISON,-0-
4,475,CASA DE CUBA,-0-
...,...,...,...
11029,39709,BUSHEHR PRISON,-0-
11030,39710,"OSTAD, Mohammad Reza",individual
11031,39711,"MIRHEYDARY, Mohammad Reza",individual
11032,39783,"LATORTUE, Youri",individual


In [4]:
final_test_cases = pd.read_csv('final_test_cases.csv', encoding = 'gbk')
final_test_cases

Unnamed: 0.1,Unnamed: 0,UID,Theme,Category,Sub-category,Entity-Type,Test Case ID,OFAC List UID,Original Name,Test Case Name
0,0,UID-3,Positive Control,Exact Match,100% true match,Entity,UID-3 - 217,6936,JAM'YAH TA'AWUN AL-ISLAMIA,JAM'YAH TA'AWUN AL-ISLAMIA
1,1,UID-3,Positive Control,Exact Match,100% true match,Entity,UID-3 - 6782,26214,"GLOBOVISION TELE CA, CORP.","GLOBOVISION TELE CA, CORP."
2,2,UID-3,Positive Control,Exact Match,100% true match,Entity,UID-3 - 5237,22284,KOREAN PEOPLE'S ARMY,KOREAN PEOPLE'S ARMY
3,3,UID-3,Positive Control,Exact Match,100% true match,Entity,UID-3 - 2858,15188,RANCHO LA HERRADURA,RANCHO LA HERRADURA
4,4,UID-3,Positive Control,Exact Match,100% true match,Entity,UID-3 - 7239,27233,"TECHNO ENERGY, S.A.","TECHNO ENERGY, S.A."
...,...,...,...,...,...,...,...,...,...,...
4622,814,UID-541,Name Additions,Name Part in the Middle,2 name parts added,Vessel,UID-541 - 4411,19589,CHONG RIM 2,CHONG RIM priority cushion 2
4623,815,UID-541,Name Additions,Name Part in the Middle,2 name parts added,Vessel,UID-541 - 5475,23162,KUM SONG 7,KUM reservists transmittal SONG 7
4624,816,UID-541,Name Additions,Name Part in the Middle,2 name parts added,Vessel,UID-541 - 5476,23163,KUM UN SAN 3,KUM seesaw UN SAN buckles 3
4625,817,UID-541,Name Additions,Name Part in the Middle,2 name parts added,Vessel,UID-541 - 5787,23733,SAM JONG 1,SAM energizer comment JONG 1


In [5]:
final_test_cases = final_test_cases[['Original Name', 'Test Case Name']]
final_test_cases['Test Type'] = 'Fuzzy Match'
final_test_cases

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_test_cases['Test Type'] = 'Fuzzy Match'


Unnamed: 0,Original Name,Test Case Name,Test Type
0,JAM'YAH TA'AWUN AL-ISLAMIA,JAM'YAH TA'AWUN AL-ISLAMIA,Fuzzy Match
1,"GLOBOVISION TELE CA, CORP.","GLOBOVISION TELE CA, CORP.",Fuzzy Match
2,KOREAN PEOPLE'S ARMY,KOREAN PEOPLE'S ARMY,Fuzzy Match
3,RANCHO LA HERRADURA,RANCHO LA HERRADURA,Fuzzy Match
4,"TECHNO ENERGY, S.A.","TECHNO ENERGY, S.A.",Fuzzy Match
...,...,...,...
4622,CHONG RIM 2,CHONG RIM priority cushion 2,Fuzzy Match
4623,KUM SONG 7,KUM reservists transmittal SONG 7,Fuzzy Match
4624,KUM UN SAN 3,KUM seesaw UN SAN buckles 3,Fuzzy Match
4625,SAM JONG 1,SAM energizer comment JONG 1,Fuzzy Match


# FILTER FOR THE REQUIREMENTS OF THE TEST CASE TYPE

## randomly choose 4627 rows

In [6]:
ofac_list_sampled = ofac_list.sample(n = final_test_cases.shape[0]).reset_index(drop = True)
ofac_list_sampled

Unnamed: 0,uid,name,entity_type
0,30702,KGB ALPHA,-0-
1,21914,"ISMA'IL, Ayham",individual
2,19941,MUNSA INTERNATIONAL INVESMENTS S.A.,-0-
3,24750,EKT SMART TECHNOLOGY,-0-
4,19188,"INMOBILIARIA MULTI-TIENDAS, S.A.",-0-
...,...,...,...
4622,11601,"ALTUN, Ali Riza",individual
4623,30552,"HASNAIN, Syed Johar",individual
4624,33997,FARMA IZVORI B.I.,-0-
4625,10593,"LOAN, Waseem Rauf",individual


# CREATE THE EXACT MATCH TEST CASES

## create blank test cases table

In [7]:
exact_match_test_cases = pd.DataFrame(columns=['Original Name','Test Case Name', 'Test Type'])
exact_match_test_cases

Unnamed: 0,Original Name,Test Case Name,Test Type


In [8]:
exact_match_test_cases['Original Name'] = ofac_list_sampled['name']
exact_match_test_cases['Test Case Name'] = ofac_list_sampled['name']
exact_match_test_cases['Test Type'] = 'Exact Match'
exact_match_test_cases

Unnamed: 0,Original Name,Test Case Name,Test Type
0,KGB ALPHA,KGB ALPHA,Exact Match
1,"ISMA'IL, Ayham","ISMA'IL, Ayham",Exact Match
2,MUNSA INTERNATIONAL INVESMENTS S.A.,MUNSA INTERNATIONAL INVESMENTS S.A.,Exact Match
3,EKT SMART TECHNOLOGY,EKT SMART TECHNOLOGY,Exact Match
4,"INMOBILIARIA MULTI-TIENDAS, S.A.","INMOBILIARIA MULTI-TIENDAS, S.A.",Exact Match
...,...,...,...
4622,"ALTUN, Ali Riza","ALTUN, Ali Riza",Exact Match
4623,"HASNAIN, Syed Johar","HASNAIN, Syed Johar",Exact Match
4624,FARMA IZVORI B.I.,FARMA IZVORI B.I.,Exact Match
4625,"LOAN, Waseem Rauf","LOAN, Waseem Rauf",Exact Match


## concat match test cases

In [9]:
final_test_cases = pd.concat([final_test_cases, exact_match_test_cases])
final_test_cases['Label'] = 'Match'
final_test_cases

Unnamed: 0,Original Name,Test Case Name,Test Type,Label
0,JAM'YAH TA'AWUN AL-ISLAMIA,JAM'YAH TA'AWUN AL-ISLAMIA,Fuzzy Match,Match
1,"GLOBOVISION TELE CA, CORP.","GLOBOVISION TELE CA, CORP.",Fuzzy Match,Match
2,KOREAN PEOPLE'S ARMY,KOREAN PEOPLE'S ARMY,Fuzzy Match,Match
3,RANCHO LA HERRADURA,RANCHO LA HERRADURA,Fuzzy Match,Match
4,"TECHNO ENERGY, S.A.","TECHNO ENERGY, S.A.",Fuzzy Match,Match
...,...,...,...,...
4622,"ALTUN, Ali Riza","ALTUN, Ali Riza",Exact Match,Match
4623,"HASNAIN, Syed Johar","HASNAIN, Syed Johar",Exact Match,Match
4624,FARMA IZVORI B.I.,FARMA IZVORI B.I.,Exact Match,Match
4625,"LOAN, Waseem Rauf","LOAN, Waseem Rauf",Exact Match,Match


# CREATE THE NON-MATCH TEST CASES

## create blank test cases table

In [10]:
non_match_test_cases = pd.DataFrame(columns=['Original Name','Test Case Name'])
non_match_test_cases

Unnamed: 0,Original Name,Test Case Name


## run loop to generate the test cases

In [11]:
ofac_list_sampled = ofac_list.sample(n = final_test_cases.shape[0]).reset_index(drop = True)

for index, row in ofac_list_sampled.iterrows():
    original_name = row['name']

    final_test_name = np.random.choice(ofac_list_sampled['name'])
    while (final_test_name == original_name):
        final_test_name = np.random.choice(ofac_list_sampled['name'])
    
    non_match_test_cases.loc[len(non_match_test_cases)] = [row['name'], final_test_name] # append to the dataframe

non_match_test_cases['Test Type'] = 'Non-Match'
non_match_test_cases['Label'] = 'Not Match'
non_match_test_cases

Unnamed: 0,Original Name,Test Case Name,Test Type,Label
0,"VALDEZ BENITES, Joel","AL-ASSAD, Karam",Non-Match,Not Match
1,CARLOTA C,"SCUADRA FORTIA, S.A. DE C.V.",Non-Match,Not Match
2,RAMENSKOYE DESIGN COMPANY JOINT STOCK COMPANY,AMD CO. LTD AGENCY,Non-Match,Not Match
3,"NAJIB, Ahmad","WAKED HATUM, Nidal Ahmed",Non-Match,Not Match
4,"GLEBOVA, Lyubov Nikolayevna","AKHAEI, Shaghayegh",Non-Match,Not Match
...,...,...,...,...
9249,GLOBAL AGE LIMITED,"SHAABAN, Bouthaina",Non-Match,Not Match
9250,OBSHCHESTVO S OGRANICHENNOI OTVETSTVENNOSTYU M...,TAMIN KALAYE SABZ ARAS COMPANY,Non-Match,Not Match
9251,"AHMED, Qassim Abdullah Ali",KOREA HAEGUMGANG TRADING CORPORATION,Non-Match,Not Match
9252,JINGHO TECHNOLOGY CO. LIMITED,COMITE' DE BIENFAISANCE ET DE SECOURS AUX PALE...,Non-Match,Not Match


# CONCAT FINAL TEST CASES

In [12]:
final_test_cases = pd.concat([final_test_cases, non_match_test_cases])
final_test_cases

Unnamed: 0,Original Name,Test Case Name,Test Type,Label
0,JAM'YAH TA'AWUN AL-ISLAMIA,JAM'YAH TA'AWUN AL-ISLAMIA,Fuzzy Match,Match
1,"GLOBOVISION TELE CA, CORP.","GLOBOVISION TELE CA, CORP.",Fuzzy Match,Match
2,KOREAN PEOPLE'S ARMY,KOREAN PEOPLE'S ARMY,Fuzzy Match,Match
3,RANCHO LA HERRADURA,RANCHO LA HERRADURA,Fuzzy Match,Match
4,"TECHNO ENERGY, S.A.","TECHNO ENERGY, S.A.",Fuzzy Match,Match
...,...,...,...,...
9249,GLOBAL AGE LIMITED,"SHAABAN, Bouthaina",Non-Match,Not Match
9250,OBSHCHESTVO S OGRANICHENNOI OTVETSTVENNOSTYU M...,TAMIN KALAYE SABZ ARAS COMPANY,Non-Match,Not Match
9251,"AHMED, Qassim Abdullah Ali",KOREA HAEGUMGANG TRADING CORPORATION,Non-Match,Not Match
9252,JINGHO TECHNOLOGY CO. LIMITED,COMITE' DE BIENFAISANCE ET DE SECOURS AUX PALE...,Non-Match,Not Match


In [13]:
final_test_cases['Label'].value_counts()

Match        9254
Not Match    9254
Name: Label, dtype: int64

In [14]:
final_test_cases['Test Type'].value_counts()

Non-Match      9254
Fuzzy Match    4627
Exact Match    4627
Name: Test Type, dtype: int64

In [15]:
final_test_cases.to_csv('ml_dataset.csv', index = False)