In [1]:
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz



In [5]:
# Read the two input CSV files from the local ./Data directory.
path1, path2 = "./Data/withEOD.csv", "./Data/metaData_IB_Inv.csv"

df_IB_Inv = pd.read_csv(path2)  # must provide 'ISIN' and 'Name_197' (selected in a later cell)
df_eod = pd.read_csv(path1)  # must provide 'ISIN' and 'Name-EOD' (selected in a later cell)

# Display the EOD table as this cell's output.
df_eod

Unnamed: 0,Code-EOD,Country-EOD,Currency-EOD,Exchange-EOD,ISIN,Name-EOD,Type-EOD,previousClose-EOD,previousCloseDate-EOD,country-Inv,...,symbol-Inv,asset_class-Inv,Name_197,industry-IB,longName-IB,marketName-IB,Symbol-IB,Country-IB,Exchange-IB,Currency-IB
0,TWOU,USA,USD,US,US90214J1016,2U Inc,Common Stock,9.200,2022-08-02,united states,...,TWOU,equity,2U INC,"Consumer, Non-cyclical",2U INC,NMS,TWOU,,,
1,88E,UK,GBX,LSE,AU00000088E2,88 Energy Ltd,Common Stock,0.625,2022-08-03,united kingdom,...,88E,equity,88 ENERGY LTD,Energy,88 ENERGY LTD,88E,88E,United Kingdom,LSE,GBP
2,7063,Japan,JPY,TSE,JP3160590000,A Dot Co Ltd,Common Stock,2064.000,2022-08-03,japan,...,7063,equity,A DOT CO LTD,,,,,,,
3,2018,Hong Kong,HKD,HK,KYG2953R1149,AAC Technologies Holdings Inc,Common Stock,14.580,2022-08-03,hong kong,...,2018,equity,AAC TECHNOLOGIES HOLDINGS IN,Industrial,AAC TECHNOLOGIES HOLDINGS IN,2018,2018,,,
4,2686,Hong Kong,HKD,HK,KYG000371040,AAG Energy Holdings Ltd,Common Stock,1.410,2022-08-03,hong kong,...,2686,equity,AAG ENERGY HOLDINGS LTD,Energy,AAG ENERGY HOLDINGS LTD,2686,2686,Hong Kong,SEHK,HKD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17506,,,,,US98979J1097,,,,,,...,,,ZOE'S KITCHEN INC,"Consumer, Cyclical",ZOE'S KITCHEN INC,ZOES,ZOES,,,
17507,,,,,CA98977K1084,,,,,,...,,,ZOLOTO RESOURCES LTD,Basic Materials,ZOLOTO RESOURCES LTD,ZR,ZR,,,
17508,,,,,CA98977R1038,,,,,,...,,,ZOOMMED INC,Technology,ZOOMMED INC,ZMD.H,ZMD.H,,,
17509,,,,,GB00BMHTHT14,,,,,,...,,,ZPG PLC,Communications,ZPG PLC,ZPG,ZPG,,,


In [6]:
# Select the ISIN Column for merging and the Name Column for matching.
# Rows lacking either value are dropped on BOTH sides:
#   * pandas merge matches null keys against each other, so rows with a missing
#     ISIN in both tables would otherwise be paired up as if they were the same
#     security;
#   * a missing name cannot be meaningfully fuzzy-matched.
df_eodName = df_eod[['ISIN', 'Name-EOD']].dropna(subset=['ISIN', 'Name-EOD'])
df_name197 = df_IB_Inv[['ISIN', 'Name_197']].dropna(subset=['ISIN', 'Name_197'])

df_eodName

Unnamed: 0,ISIN,Name-EOD
0,US90214J1016,2U Inc
1,AU00000088E2,88 Energy Ltd
2,JP3160590000,A Dot Co Ltd
3,KYG2953R1149,AAC Technologies Holdings Inc
4,KYG000371040,AAG Energy Holdings Ltd
...,...,...
13025,INE768C01010,Zydus Wellness Limited
13026,CA98985W1023,Zymeworks Inc
13027,US98986X1090,Zynerba Pharmaceuticals Inc
13028,US98986M1036,Zynex Inc


In [7]:
# Keep only the ISINs present in both tables, pairing each Name_197 with its Name-EOD.
# The resulting frame is the sample input for the name matcher.
df_name = df_name197.merge(df_eodName, how='inner', on='ISIN')

df_name

Unnamed: 0,ISIN,Name_197,Name-EOD
0,US90214J1016,2U INC,2U Inc
1,AU00000088E2,88 ENERGY LTD,88 Energy Ltd
2,JP3160590000,A DOT CO LTD,A Dot Co Ltd
3,KYG2953R1149,AAC TECHNOLOGIES HOLDINGS IN,AAC Technologies Holdings Inc
4,KYG000371040,AAG ENERGY HOLDINGS LTD,AAG Energy Holdings Ltd
...,...,...,...
13153,INE768C01010,ZYDUS WELLNESS LTD,Zydus Wellness Limited
13154,CA98985W1023,ZYMEWORKS INC,Zymeworks Inc
13155,US98986X1090,ZYNERBA PHARMACEUTICALS INC,Zynerba Pharmaceuticals Inc
13156,US98986M1036,ZYNEX INC,Zynex Inc


In [23]:
# Match by Name (The Name columns must be 2nd and 3rd columns of the input dataframe)

def matchByName(df_name, scorer=None):
    """Score the similarity of the two name columns of ``df_name``, row by row.

    The names are read by position: the 2nd column is compared with the 3rd
    column. The frame must also contain an ``'ISIN'`` column, which is used to
    drop duplicate rows from the result.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Input frame with at least three columns; columns 2 and 3 hold the
        names to compare. It is not modified.
    scorer : callable, optional
        Function ``scorer(a, b)`` returning the similarity score for one pair
        of values. Defaults to ``fuzz.token_sort_ratio``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_name`` with an added ``'fuzz_score1'`` column, keeping
        only the first row for each ISIN. The original index labels are kept.

    Raises
    ------
    ValueError
        If ``df_name`` has fewer than three columns.
    KeyError
        If ``df_name`` has no ``'ISIN'`` column.
    """
    if df_name.shape[1] < 3:
        raise ValueError(
            "matchByName expects the name columns to be the 2nd and 3rd "
            "columns of the input dataframe"
        )
    if scorer is None:
        scorer = fuzz.token_sort_ratio

    n_rows = len(df_name)
    # Report roughly every 10%; never let the step reach 0, which would make
    # the modulo below raise ZeroDivisionError on frames with < 10 rows.
    step = max(1, n_rows // 10)

    # Explicit positional access: integer keys on a label-indexed row are deprecated.
    left_names = df_name.iloc[:, 1]
    right_names = df_name.iloc[:, 2]

    scores = []
    # Track the row position (not the index label) so progress stays correct
    # for filtered or otherwise non-sequential indexes.
    for pos, (left, right) in enumerate(zip(left_names, right_names)):
        # calculate nlp scores
        scores.append(scorer(left, right))

        # print progress
        if pos % step == 0:
            print("progress: %.2f" % (100*pos/n_rows) + "%")

    # attach the scores to a copy (the caller's frame is left untouched) and drop duplicates
    df_match = df_name.copy()
    df_match['fuzz_score1'] = scores

    return df_match.drop_duplicates(subset=['ISIN'])

In [24]:
# Score every merged name pair; the result carries an added 'fuzz_score1' column.
df_nameMatch = matchByName(df_name) # Company name columns must be the 2nd and 3rd columns of the input dataframe

df_nameMatch

progress: 0.00%
progress: 9.99%
progress: 19.99%
progress: 29.98%
progress: 39.98%
progress: 49.97%
progress: 59.96%
progress: 69.96%
progress: 79.95%
progress: 89.95%
progress: 99.94%


Unnamed: 0,ISIN,Name_197,Name-EOD,fuzz_score1
0,US90214J1016,2U INC,2U Inc,100
1,AU00000088E2,88 ENERGY LTD,88 Energy Ltd,100
2,JP3160590000,A DOT CO LTD,A Dot Co Ltd,100
3,KYG2953R1149,AAC TECHNOLOGIES HOLDINGS IN,AAC Technologies Holdings Inc,98
4,KYG000371040,AAG ENERGY HOLDINGS LTD,AAG Energy Holdings Ltd,100
...,...,...,...,...
13153,INE768C01010,ZYDUS WELLNESS LTD,Zydus Wellness Limited,90
13154,CA98985W1023,ZYMEWORKS INC,Zymeworks Inc,100
13155,US98986X1090,ZYNERBA PHARMACEUTICALS INC,Zynerba Pharmaceuticals Inc,100
13156,US98986M1036,ZYNEX INC,Zynex Inc,100


In [25]:
# Keep only the pairs whose fuzzy score is strictly above 80.

df_nameMatch.loc[df_nameMatch['fuzz_score1'].gt(80)]

Unnamed: 0,ISIN,Name_197,Name-EOD,fuzz_score1
0,US90214J1016,2U INC,2U Inc,100
1,AU00000088E2,88 ENERGY LTD,88 Energy Ltd,100
2,JP3160590000,A DOT CO LTD,A Dot Co Ltd,100
3,KYG2953R1149,AAC TECHNOLOGIES HOLDINGS IN,AAC Technologies Holdings Inc,98
4,KYG000371040,AAG ENERGY HOLDINGS LTD,AAG Energy Holdings Ltd,100
...,...,...,...,...
13150,CNE100001TL0,ZUOLI KECHUANG MICRO-FINAN-H,Zuoli Kechuang Micro-finance Co Ltd,83
13153,INE768C01010,ZYDUS WELLNESS LTD,Zydus Wellness Limited,90
13154,CA98985W1023,ZYMEWORKS INC,Zymeworks Inc,100
13155,US98986X1090,ZYNERBA PHARMACEUTICALS INC,Zynerba Pharmaceuticals Inc,100


In [None]:
# Keep only the pairs whose fuzzy score is strictly above 80.
# NOTE(review): this cell repeats the filter from the earlier cell and was never executed.

df_nameMatch.loc[df_nameMatch['fuzz_score1'].gt(80)]

In [27]:
# Print a completion marker.
print("Done")

Done
