In [191]:
import pandas as pd
import requests
import json
from datetime import datetime
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [192]:
def name_match(name, sdn_name, match_type): # 4 match types: 'EXACT', 'STRONG', 'MEDIUM', 'LIGHT'
    """Decide whether two names match under the requested match type.

    Args:
        name: Name being screened.
        sdn_name: Name taken from the SDN list.
        match_type: One of 'EXACT', 'STRONG', 'MEDIUM', 'LIGHT' (upper case).

    Returns:
        bool: For 'EXACT', True only when ``fuzz.ratio`` is 100. For the
        other types, True when ``fuzz.partial_ratio`` reaches the type's
        threshold (85 / 70 / 55). An unknown ``match_type`` prints an error
        and returns False, so the function always returns a bool.
    """
    # Minimum partial-ratio score required by each fuzzy match type.
    partial_thresholds = {'STRONG': 85, 'MEDIUM': 70, 'LIGHT': 55}

    if match_type not in ('EXACT', 'STRONG', 'MEDIUM', 'LIGHT'):
        print('ERROR: match_type does not exist - Only "EXACT", "STRONG", "MEDIUM", "LIGHT"')
        # Callers test the result with `== True`; False keeps that working
        # while honouring the documented bool return type.
        return False

    if match_type == 'EXACT':
        return fuzz.ratio(name, sdn_name) == 100

    # Only the score that is actually needed gets computed.
    return fuzz.partial_ratio(name, sdn_name) >= partial_thresholds[match_type]

In [193]:
def indiv_name_match(last_name, first_name, dataframe, match_type):
    """Filter an SDN individuals frame down to rows matching both names.

    Last names are compared first; first names are then compared only on
    the rows whose last name matched.

    Args:
        last_name: Last name being screened.
        first_name: First name being screened.
        dataframe: DataFrame with at least the columns '#', 'Last Name'
            and 'First Name'.
        match_type: Match type forwarded to ``name_match``.

    Returns:
        DataFrame: The rows matching on both names. As before, the frame
        carries an extra leading 'index' column (the input frame's index
        labels) produced by ``reset_index()``; ``Individual.check_SDN``
        drops that column. The result is an independent copy, so callers may
        modify it without pandas copy-of-a-slice warnings.
    """
    # Pass 1: collect the SDN numbers whose last name matches. Iterating the
    # columns directly avoids label-based `series[count]` lookups, which only
    # work when the frame has a default 0..n-1 index.
    last_name_matches = [
        sdn_number
        for sdn_number, sdn_last_name in zip(dataframe['#'], dataframe['Last Name'])
        if name_match(last_name, sdn_last_name, match_type)
    ]

    # Non-inplace reset_index() returns a fresh frame, so no
    # SettingWithCopyWarning is raised on the filtered slice.
    last_name_match_df = dataframe[dataframe['#'].isin(last_name_matches)].reset_index()

    # Pass 2: check first names only on the last-name matches.
    first_name_matches = [
        sdn_number
        for sdn_number, sdn_first_name in zip(last_name_match_df['#'],
                                              last_name_match_df['First Name'])
        if name_match(first_name, sdn_first_name, match_type)
    ]

    both_name_match_df = last_name_match_df[last_name_match_df['#'].isin(first_name_matches)]
    return both_name_match_df.copy()

In [194]:
def _month_start(year, month, offset=0):
    """Return midnight on the first day of the month `offset` months from (year, month)."""
    month_index = year * 12 + (month - 1) + offset
    return datetime(month_index // 12, month_index % 12 + 1, 1)


def DOB_match(indiv_dob, sdn_dob):
    """Check an individual's date of birth against an SDN-list DOB string.

    The SDN format is recognised by the string's length (and by whether it
    contains 'circa'), exactly as the formats listed below.

    Args:
        indiv_dob: Individual's DOB formatted like '01 Jan 1999' ("%d %b %Y").
        sdn_dob: DOB string from the SDN list. Handled formats:
            '-0-' (no DOB), '1999', 'Jan 1999', '01 Jan 1999',
            '1998 to 1999', '01 Jan 1999 to 31 Dec 1999',
            'circa 1999' (+/- 3 years), 'circa 01 Jan 1999' (from the start
            of the month 3 months earlier to the end of the month 3 months
            later; the day itself is not used), 'circa 1979-1982'
            (+/- 3 years around the range).

    Returns:
        bool: True when the individual's DOB falls inside the SDN date or
        range. Also True (with a printed note/warning) when the SDN entry has
        no DOB or uses an unrecognised format, so the entry is not silently
        ruled out.

    Raises:
        ValueError: If ``indiv_dob`` (or a recognised-length ``sdn_dob``)
            cannot be parsed.
    """
    indiv_dob = datetime.strptime(indiv_dob, "%d %b %Y") # convert individual's DOB into datetime format
    month_numbers = {'JAN': 1, 'FEB': 2, 'MAR': 3, 'APR': 4, 'MAY': 5, 'JUN': 6,
                     'JUL': 7, 'AUG': 8, 'SEP': 9, 'OCT': 10, 'NOV': 11, 'DEC': 12}

    if sdn_dob == '-0-':
        # Return "MATCH" or True, because the SDN List individual does not have DOB accessible
        print('NOTE: No DOB on SDN List for this Individual')
        return True

    sdn_dob_len = len(sdn_dob) # Length of SDN DOB tells us which format we are parsing

    if 'circa' not in sdn_dob: # Exact DOB or explicit range
        if sdn_dob_len == 4:
            # sdn_dob Format is '1999'
            year = int(sdn_dob)
            return datetime(year, 1, 1) <= indiv_dob <= datetime(year, 12, 31)
        if sdn_dob_len == 8:
            # sdn_dob Format is 'Jan 1999': month name first, year second
            month_name, year = sdn_dob.split()[0], sdn_dob.split()[1]
            start = _month_start(int(year), month_numbers[month_name.upper()])
            # Comparing against the start of the NEXT month handles every
            # month length, including 29 Feb in leap years.
            return start <= indiv_dob < _month_start(start.year, start.month, 1)
        if sdn_dob_len == 11:
            # sdn_dob Format is '01 Jan 1999'
            return indiv_dob == datetime.strptime(sdn_dob, "%d %b %Y")
        if sdn_dob_len == 12:
            # sdn_dob Format is '1998 to 1999'
            parts = sdn_dob.split()
            return datetime(int(parts[0]), 1, 1) <= indiv_dob <= datetime(int(parts[2]), 12, 31)
        if sdn_dob_len == 26:
            # sdn_dob Format is '01 Jan 1999 to 31 Dec 1999'
            bounds = sdn_dob.split(' to ')
            start = datetime.strptime(bounds[0], "%d %b %Y")
            end = datetime.strptime(bounds[1], "%d %b %Y")
            return start <= indiv_dob <= end
        print('WARNING: Not an expected DOB Format from SDN List --> {}'.format(sdn_dob))
        return True # For now, if there are any other DOB formats, just return True instead of False

    # 'circa' is in SDN DOB and now we have to check against a wider range
    if sdn_dob_len == 10:
        # sdn_dob Format is 'circa 1999' --> +/- 3 years
        year = int(sdn_dob.split()[1])
        return datetime(year - 3, 1, 1) <= indiv_dob <= datetime(year + 3, 12, 31)
    if sdn_dob_len == 17:
        # sdn_dob Format is 'circa 01 Jan 1999' --> +/- 3 months, whole months
        parts = sdn_dob.split('circa ')[1].split()
        month = month_numbers[parts[1].upper()]
        year = int(parts[2])
        start = _month_start(year, month, -3)
        # Exclusive upper bound: first day of the month after (month + 3).
        end_exclusive = _month_start(year, month, 4)
        return start <= indiv_dob < end_exclusive
    if sdn_dob_len == 15:
        # sdn_dob Format is 'circa 1979-1982' --> +/- 3 years around the range
        first_year, last_year = sdn_dob.split()[1].split('-')
        return datetime(int(first_year) - 3, 1, 1) <= indiv_dob <= datetime(int(last_year) + 3, 12, 31)
    print('WARNING: Not an expected DOB Format from SDN List --> {}'.format(sdn_dob))
    return True # For now, if there are any other DOB formats, just return True instead of False

In [195]:
def get_individual_SDN():
    """Download the pipe-delimited SDN list and return its individuals.

    Only rows whose third field is exactly '"individual"' are kept.

    Returns:
        DataFrame: One row per individual with the columns '#', 'Last Name',
        'First Name', 'Ind/Entity', 'Global Tag', 'Note', 'Extra' and 'DOB'.
        'DOB' holds the text following 'DOB ' in the 'Extra' field, or '-0-'
        when 'Extra' does not mention a DOB.

    Raises:
        requests.HTTPError: If the download returns an error status, so an
            error page is never mistaken for an empty SDN list.
        requests.Timeout: If the server does not respond in time.
    """
    # An explicit timeout keeps a stalled connection from hanging the kernel.
    response = requests.get('https://www.treasury.gov/ofac/downloads/sdn.pip', timeout=60)
    response.raise_for_status()

    records = []
    for line in response.text.split("\r\n"):
        fields = line.split("|")
        # Lines without a '|' (e.g. the trailing empty line) have one field.
        if len(fields) <= 1 or fields[2] != '"individual"':
            continue

        # The name field is split on its first comma into last / first name;
        # a name without a comma has an empty first name.
        name_parts = fields[1].split(',')
        last_name = name_parts[0].strip('"')
        first_name = name_parts[1].strip('"') if len(name_parts) > 1 else ""

        extra = fields[11].strip('"')
        dob = '-0-'
        dob_at = extra.find('DOB')
        if dob_at != -1:
            # Text after 'DOB ' up to the next ';'. When no ';' follows, the
            # original sliced with find()'s -1 and dropped the final
            # character; here only a trailing '.' is removed.
            # NOTE(review): assumes a DOB value never ends in '.' itself.
            dob = extra[dob_at + 4:].split(';', 1)[0].strip().rstrip('.')

        records.append([
            int(fields[0]),
            last_name,
            first_name[1:],  # drop the space that follows the comma
            fields[2].strip('"'),
            fields[3].strip('"'),
            fields[4].strip('"'),
            extra,
            dob,
        ])

    return pd.DataFrame(records,
                        columns=["#", "Last Name", "First Name",
                                 "Ind/Entity", "Global Tag", "Note", "Extra", "DOB"])

In [196]:
def get_entity_SDN():
    """Download the pipe-delimited SDN list and return its non-individual rows.

    Every row whose third field is not exactly '"individual"' is kept.

    Returns:
        DataFrame: One row per entity with the columns '#', 'Name', 'Type',
        'Country' and 'Extra' (fields 0, 1, 2, 3 and 11 of each row, with
        surrounding double quotes removed).

    Raises:
        requests.HTTPError: If the download returns an error status, so an
            error page is never mistaken for an empty SDN list.
        requests.Timeout: If the server does not respond in time.
    """
    # An explicit timeout keeps a stalled connection from hanging the kernel.
    response = requests.get('https://www.treasury.gov/ofac/downloads/sdn.pip', timeout=60)
    response.raise_for_status()

    records = []
    for line in response.text.split("\r\n"):
        fields = line.split("|")
        # Lines without a '|' (e.g. the trailing empty line) have one field.
        if len(fields) > 1 and fields[2] != '"individual"':
            records.append([
                int(fields[0]),
                fields[1].strip('"'),
                fields[2].strip('"'),
                fields[3].strip('"'),
                fields[11].strip('"'),
            ])

    return pd.DataFrame(records, columns=["#", "Name", "Type", "Country", "Extra"])

In [197]:
class Entity:
    """An entity (non-individual) name to screen against the SDN list."""

    def __init__(self, entity_name):
        """Store the entity name and download the SDN entity list.

        Args:
            entity_name: Name of the entity to screen.
        """
        self.entity_list = get_entity_SDN() # df of entities on SDN List
        self.name = entity_name

    def get_self(self):
        """Return the entity name."""
        return self.name

    def update_name(self, new_entity_name):
        """Replace the entity name."""
        self.name = new_entity_name

    def check_SDN(self, match_type):
        """Find SDN-list entities whose name matches this entity.

        Args:
            match_type: 'EXACT', 'STRONG', 'MEDIUM' or 'LIGHT'
                (case-insensitive).

        Returns:
            DataFrame of the matching SDN rows (index reset to 0..n-1) when
            there is at least one match; False when nothing matches; True
            when ``match_type`` is not one of the accepted strings (a value
            no normal result produces).
        """
        types = ['EXACT', 'STRONG', 'MEDIUM', 'LIGHT']
        match_type = match_type.upper()
        if match_type not in types:
            print('ERROR: string used for match_type does not exist')
            print('Only 4 strings accepted: "{}", "{}", "{}", and "{}"'.format(
                  types[0], types[1], types[2], types[3]))
            return True # Return True (different response than anything normal)

        name_matches = [
            sdn_number
            for sdn_number, sdn_name in zip(self.entity_list['#'], self.entity_list['Name'])
            if name_match(self.name, sdn_name, match_type)
        ]
        if len(name_matches) == 0:
            return False

        # reset_index(drop=True) renumbers the rows without first creating an
        # 'index' column on a filtered slice, so there is no
        # SettingWithCopyWarning and nothing to drop afterwards.
        matched_rows = self.entity_list[self.entity_list['#'].isin(name_matches)]
        return matched_rows.reset_index(drop=True)

In [198]:
class Individual:
    """A person (name plus date of birth) to screen against the SDN list."""

    def __init__(self, last_name, first_name, dob):
        """Download the SDN individuals list and store the person's details.

        Args:
            last_name: Individual's last name.
            first_name: Individual's first name.
            dob: Date of birth formatted like '01 Jan 1966'. Only the length
                (11 characters) is checked here; when it is wrong an error is
                printed and the name/DOB attributes are left unset.
        """
        # Checking format of DOB before initializing variables for object
        self.individual_list = get_individual_SDN() # df of individuals on SDN List
        if len(dob) == 11:
            self.last_name = last_name
            self.first_name = first_name
            self.dob = dob
        else:
            # The hint shows the format DOB_match actually parses ("%d %b %Y").
            print('FormatError: dob argument was not formatted correctly '
                  '(Correct Format: "DD Mon YYYY", e.g. "01 Jan 1966")')

    def get_self(self):
        """Return (last_name, first_name, dob), or None if they were never set."""
        try:
            return (self.last_name, self.first_name, self.dob)
        except AttributeError:
            # Attributes are missing when __init__ rejected the DOB format.
            return None

    def update_first_name(self, first_name):
        """Replace the individual's first name."""
        self.first_name = first_name

    def update_last_name(self, last_name):
        """Replace the individual's last name."""
        self.last_name = last_name

    def update_dob(self, dob):
        """Replace the individual's date of birth (not re-validated)."""
        self.dob = dob

    def check_SDN(self, match_type): # 4 match types: 'EXACT', 'STRONG', 'MEDIUM', 'LIGHT'
        """Find SDN-list individuals matching this person.

        'EXACT', 'STRONG' and 'MEDIUM' require both a name match (at the
        given strictness) and a DOB match; 'LIGHT' requires the name match
        only.

        Args:
            match_type: 'EXACT', 'STRONG', 'MEDIUM' or 'LIGHT'
                (case-insensitive).

        Returns:
            DataFrame of the matching SDN rows when there is at least one
            match; False when nothing matches; True when ``match_type`` is
            not one of the accepted strings (a value no normal result
            produces).
        """
        types = ['EXACT', 'STRONG', 'MEDIUM', 'LIGHT']
        match_type = match_type.upper()
        if match_type not in types:
            print('ERROR: string used for match_type does not exist')
            print('Only 4 strings accepted: "{}", "{}", "{}", and "{}"'.format(
                  types[0], types[1], types[2], types[3]))
            return True # Return True (different response than anything normal)

        # Every match type uses the same name filter. The fuzzy branches used
        # to call `both_name_match`, which is not defined anywhere in the
        # notebook and raised NameError on a fresh kernel.
        both_name_match_df = indiv_name_match(self.last_name, self.first_name,
                                              self.individual_list, match_type)
        # Renumber rows 0..n-1 without modifying the helper's frame in place.
        both_name_match_df = both_name_match_df.reset_index(drop=True)
        if len(both_name_match_df) == 0:
            return False

        if match_type == types[3]:
            # LIGHT MATCH == DOB match not needed
            return_list = list(both_name_match_df['#'])
        else:
            # EXACT / STRONG / MEDIUM == DOB match needed
            return_list = [
                sdn_number
                for sdn_number, sdn_dob in zip(both_name_match_df['#'],
                                               both_name_match_df['DOB'])
                if DOB_match(self.dob, sdn_dob)
            ]

        if len(return_list) == 0:
            return False

        return_df = both_name_match_df[both_name_match_df['#'].isin(return_list)]
        # indiv_name_match leaves a helper 'index' column behind; hide it.
        return return_df.drop(columns=['index'], errors='ignore')

In [199]:
# Build a sample individual to screen (last name, first name, DOB as 'DD Mon YYYY').
# The constructor also downloads the SDN individuals list.
indiv = Individual('ABBUD', 'Hikmat', '01 Jan 1966')

In [200]:
# Screen the sample individual with the 'STRONG' match type (fuzzy name match plus DOB check).
indiv.check_SDN('STRONG')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_name_match_df.sort_values('First Name', inplace=True)


Unnamed: 0,#,Last Name,First Name,Ind/Entity,Global Tag,Note,Extra,DOB
0,22084,'ABBUD,Hikmat,individual,SYRIA,Scientific Studies and Research Center Employee,DOB 01 Jan 1966; nationality Syria; Scientific...,01 Jan 1966


In [201]:
# Build a sample entity to screen; the constructor also downloads the SDN entity list.
entity = Entity('CUBACANCUN')

In [202]:
# Screen the sample entity with the 'STRONG' match type (fuzzy partial-name match).
entity.check_SDN('STRONG')

Unnamed: 0,#,Name,Type,Country,Extra
0,590,CUBACANCUN CIGARS AND GIFT SHOPS,-0-,CUBA,-0-
