# IMPORT DATA, PACKAGES, AND FUNCTIONS

## import the relevant functions and packages

In [1]:
import pandas as pd
import numpy as np

## download the OFAC list from the web

In [2]:
ofac_list_download = pd.read_csv('https://www.treasury.gov/ofac/downloads/sdn.csv', header=None)

In [3]:
ofac_list = ofac_list_download[[0,1,2]]
ofac_list.columns = ['uid', 'name', 'entity_type']

In [4]:
final_test_cases = pd.read_csv('final_test_cases.csv')
final_test_cases

Unnamed: 0.1,Unnamed: 0,UID,Theme,Category,Sub-category,Entity-Type,Test Case ID,OFAC List UID,Original Name,Test Case Name
0,0,UID-3,Positive Control,Exact Match,100% true match,Entity,UID-3 - 217,6936,JAM'YAH TA'AWUN AL-ISLAMIA,JAM'YAH TA'AWUN AL-ISLAMIA
1,1,UID-3,Positive Control,Exact Match,100% true match,Entity,UID-3 - 6782,26214,"GLOBOVISION TELE CA, CORP.","GLOBOVISION TELE CA, CORP."
2,2,UID-3,Positive Control,Exact Match,100% true match,Entity,UID-3 - 5237,22284,KOREAN PEOPLE'S ARMY,KOREAN PEOPLE'S ARMY
3,3,UID-3,Positive Control,Exact Match,100% true match,Entity,UID-3 - 2858,15188,RANCHO LA HERRADURA,RANCHO LA HERRADURA
4,4,UID-3,Positive Control,Exact Match,100% true match,Entity,UID-3 - 7239,27233,"TECHNO ENERGY, S.A.","TECHNO ENERGY, S.A."
...,...,...,...,...,...,...,...,...,...,...
4622,814,UID-541,Name Additions,Name Part in the Middle,2 name parts added,Vessel,UID-541 - 4411,19589,CHONG RIM 2,CHONG RIM priority cushion 2
4623,815,UID-541,Name Additions,Name Part in the Middle,2 name parts added,Vessel,UID-541 - 5475,23162,KUM SONG 7,KUM reservists transmittal SONG 7
4624,816,UID-541,Name Additions,Name Part in the Middle,2 name parts added,Vessel,UID-541 - 5476,23163,KUM UN SAN 3,KUM seesaw UN SAN buckles 3
4625,817,UID-541,Name Additions,Name Part in the Middle,2 name parts added,Vessel,UID-541 - 5787,23733,SAM JONG 1,SAM energizer comment JONG 1


In [5]:
final_test_cases = final_test_cases[['Original Name', 'Test Case Name']]
final_test_cases['Test Type'] = 'Fuzzy Match'
final_test_cases

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_test_cases['Test Type'] = 'Fuzzy Match'


Unnamed: 0,Original Name,Test Case Name,Test Type
0,JAM'YAH TA'AWUN AL-ISLAMIA,JAM'YAH TA'AWUN AL-ISLAMIA,Fuzzy Match
1,"GLOBOVISION TELE CA, CORP.","GLOBOVISION TELE CA, CORP.",Fuzzy Match
2,KOREAN PEOPLE'S ARMY,KOREAN PEOPLE'S ARMY,Fuzzy Match
3,RANCHO LA HERRADURA,RANCHO LA HERRADURA,Fuzzy Match
4,"TECHNO ENERGY, S.A.","TECHNO ENERGY, S.A.",Fuzzy Match
...,...,...,...
4622,CHONG RIM 2,CHONG RIM priority cushion 2,Fuzzy Match
4623,KUM SONG 7,KUM reservists transmittal SONG 7,Fuzzy Match
4624,KUM UN SAN 3,KUM seesaw UN SAN buckles 3,Fuzzy Match
4625,SAM JONG 1,SAM energizer comment JONG 1,Fuzzy Match


# FILTER FOR THE REQUIREMENTS OF THE TEST CASE TYPE

## randomly choose 4627 rows

In [6]:
ofac_list_sampled = ofac_list.sample(n = final_test_cases.shape[0]).reset_index(drop = True)
ofac_list_sampled

Unnamed: 0,uid,name,entity_type
0,20283,AO ABR MANAGEMENT,-0-
1,4708,PALESTINE LIBERATION FRONT - ABU ABBAS FACTION,-0-
2,19586,KOREA OCEAN SHIPPING AGENCY,-0-
3,21169,"MINAEV, Oleg Aleksandrovich",individual
4,11009,AGRICOLA GAXIOLA S.A. DE C.V.,-0-
...,...,...,...
4622,28047,"TRY PHEAP GRAND ROYAL CAMBODIA CO., LTD.",-0-
4623,37111,"KUZNETSOV, Stanislav Konstantinovich",individual
4624,16021,IRAN & SHARGH LEASING COMPANY,-0-
4625,13357,"MOHAMMED, Aboud Rogo",individual


# CREATE THE EXACT MATCH TEST CASES

## create blank test cases table

In [7]:
exact_match_test_cases = pd.DataFrame(columns=['Original Name','Test Case Name', 'Test Type'])
exact_match_test_cases

Unnamed: 0,Original Name,Test Case Name,Test Type


In [8]:
exact_match_test_cases['Original Name'] = ofac_list_sampled['name']
exact_match_test_cases['Test Case Name'] = ofac_list_sampled['name']
exact_match_test_cases['Test Type'] = 'Exact Match'
exact_match_test_cases

Unnamed: 0,Original Name,Test Case Name,Test Type
0,AO ABR MANAGEMENT,AO ABR MANAGEMENT,Exact Match
1,PALESTINE LIBERATION FRONT - ABU ABBAS FACTION,PALESTINE LIBERATION FRONT - ABU ABBAS FACTION,Exact Match
2,KOREA OCEAN SHIPPING AGENCY,KOREA OCEAN SHIPPING AGENCY,Exact Match
3,"MINAEV, Oleg Aleksandrovich","MINAEV, Oleg Aleksandrovich",Exact Match
4,AGRICOLA GAXIOLA S.A. DE C.V.,AGRICOLA GAXIOLA S.A. DE C.V.,Exact Match
...,...,...,...
4622,"TRY PHEAP GRAND ROYAL CAMBODIA CO., LTD.","TRY PHEAP GRAND ROYAL CAMBODIA CO., LTD.",Exact Match
4623,"KUZNETSOV, Stanislav Konstantinovich","KUZNETSOV, Stanislav Konstantinovich",Exact Match
4624,IRAN & SHARGH LEASING COMPANY,IRAN & SHARGH LEASING COMPANY,Exact Match
4625,"MOHAMMED, Aboud Rogo","MOHAMMED, Aboud Rogo",Exact Match


## concat match test cases

In [9]:
final_test_cases = pd.concat([final_test_cases, exact_match_test_cases])
final_test_cases['Label'] = 'Match'
final_test_cases

Unnamed: 0,Original Name,Test Case Name,Test Type,Label
0,JAM'YAH TA'AWUN AL-ISLAMIA,JAM'YAH TA'AWUN AL-ISLAMIA,Fuzzy Match,Match
1,"GLOBOVISION TELE CA, CORP.","GLOBOVISION TELE CA, CORP.",Fuzzy Match,Match
2,KOREAN PEOPLE'S ARMY,KOREAN PEOPLE'S ARMY,Fuzzy Match,Match
3,RANCHO LA HERRADURA,RANCHO LA HERRADURA,Fuzzy Match,Match
4,"TECHNO ENERGY, S.A.","TECHNO ENERGY, S.A.",Fuzzy Match,Match
...,...,...,...,...
4622,"TRY PHEAP GRAND ROYAL CAMBODIA CO., LTD.","TRY PHEAP GRAND ROYAL CAMBODIA CO., LTD.",Exact Match,Match
4623,"KUZNETSOV, Stanislav Konstantinovich","KUZNETSOV, Stanislav Konstantinovich",Exact Match,Match
4624,IRAN & SHARGH LEASING COMPANY,IRAN & SHARGH LEASING COMPANY,Exact Match,Match
4625,"MOHAMMED, Aboud Rogo","MOHAMMED, Aboud Rogo",Exact Match,Match


# CREATE THE NON-MATCH TEST CASES

## create blank test cases table

In [18]:
non_match_test_cases = pd.DataFrame(columns=['Original Name','Test Case Name'])
non_match_test_cases

Unnamed: 0,Original Name,Test Case Name


## run loop to generate the test cases

In [19]:
ofac_list_sampled = ofac_list.sample(n = final_test_cases.shape[0]).reset_index(drop = True)

for index, row in ofac_list_sampled.iterrows():
    original_name = row['name']

    final_test_name = np.random.choice(ofac_list_sampled['name'])
    while final_test_name == original_name:
        final_test_name = np.random.choice(ofac_list_sampled['name'])
    
    non_match_test_cases.loc[len(non_match_test_cases)] = [row['name'], final_test_name] # append to the dataframe

non_match_test_cases['Test Type'] = 'Non-Match'
non_match_test_cases['Label'] = 'Not Match'
non_match_test_cases

Unnamed: 0,Original Name,Test Case Name,Test Type,Label
0,SANOVERA PHARM COMPANY SARL,FEDERAL STATE FINANCED INSTITUTION OF SCIENCE ...,Non-Match,Not Match
1,PARSIAN BANK,"CHEMEZOV, Stanislav Sergeyevich",Non-Match,Not Match
2,ELLA TOURISM COMPANY,POTRICO CORP.,Non-Match,Not Match
3,"MAYOROV, Alexei Petrovich",INTERNATIONAL INVESTMENTS HOLDING S.A. DE C.V.,Non-Match,Not Match
4,"ABU MARZOOK, Mousa Mohammed","AL-TIKRITI, Rukan Razuki abd-al-Ghafur Sulaiman",Non-Match,Not Match
...,...,...,...,...
9249,"PASANDIDEH, Heidar","EL HADI, Mustapha Nasri Ben Abdul Kader Ait",Non-Match,Not Match
9250,"ZAYDAN, Mustafa",EP-IEF,Non-Match,Not Match
9251,"AL-RAWI, Mushtaq Talib Zughayr",TOUMEH INTERNATIONAL GROUP,Non-Match,Not Match
9252,"SHAPOVALOV, Oleg Georgievich",FOREST,Non-Match,Not Match


# CONCAT FINAL TEST CASES

In [20]:
final_test_cases = pd.concat([final_test_cases, non_match_test_cases])
final_test_cases

Unnamed: 0,Original Name,Test Case Name,Test Type,Label
0,JAM'YAH TA'AWUN AL-ISLAMIA,JAM'YAH TA'AWUN AL-ISLAMIA,Fuzzy Match,Match
1,"GLOBOVISION TELE CA, CORP.","GLOBOVISION TELE CA, CORP.",Fuzzy Match,Match
2,KOREAN PEOPLE'S ARMY,KOREAN PEOPLE'S ARMY,Fuzzy Match,Match
3,RANCHO LA HERRADURA,RANCHO LA HERRADURA,Fuzzy Match,Match
4,"TECHNO ENERGY, S.A.","TECHNO ENERGY, S.A.",Fuzzy Match,Match
...,...,...,...,...
9249,"PASANDIDEH, Heidar","EL HADI, Mustapha Nasri Ben Abdul Kader Ait",Non-Match,Not Match
9250,"ZAYDAN, Mustafa",EP-IEF,Non-Match,Not Match
9251,"AL-RAWI, Mushtaq Talib Zughayr",TOUMEH INTERNATIONAL GROUP,Non-Match,Not Match
9252,"SHAPOVALOV, Oleg Georgievich",FOREST,Non-Match,Not Match


In [22]:
final_test_cases['Label'].value_counts()

Match        9254
Not Match    9254
Name: Label, dtype: int64

In [23]:
final_test_cases['Test Type'].value_counts()

Non-Match      9254
Fuzzy Match    4627
Exact Match    4627
Name: Test Type, dtype: int64

In [24]:
final_test_cases.to_csv('ml_dataset.csv', index = False)