In [1]:
#%pip install fuzzywuzzy

In [2]:
import pandas as pd
import numpy as np
import re
from itertools import product, combinations
from fuzzywuzzy import fuzz, process
import difflib



In [3]:
def clean_string(word):
    '''Normalise a store/POI name into a compact key for fuzzy matching.

    Steps, in order:
      1. lower-case the text;
      2. keep only the part that precedes the first standalone word "of",
         which drops a location suffix
         ("ShopRite Of Brookfield" -> "shoprite");
      3. remove every character that is not an ASCII letter, i.e. digits,
         punctuation and all whitespace.

    The split only fires when "of" is a word of its own. Names that merely
    contain those two letters ("Coffee", "Office") are kept whole.

    Parameters
    ----------
    word : str
        Raw name. It must be a string; missing values have to be dropped
        or filled by the caller beforehand.

    Returns
    -------
    str
        Lower-case text consisting of letters only; may be empty.
    '''

    word = word.lower()

    # \b anchors match "of" as a whole word only, never as a substring.
    word = re.split(r"\bof\b", word, maxsplit=1)[0]

    # Removing every non-letter also removes leading, trailing and inner
    # whitespace, so a separate strip() is not required.
    word = re.sub("[^A-Za-z]", "", word)

    return word

In [4]:
# Load the master list of grocery store names; its "Store Name" column is the
# reference that POI names are matched against.
storename_df = pd.read_csv("Top Grocery Stores List.csv")
# Preview the first rows to check the columns that were read.
storename_df.head()

Unnamed: 0,Number,Store Name,Owner
0,1,A & P Food Stores,
1,2,ACME Markets,
2,3,Albertsons,Albertsons
3,4,ALDI,ALDI
4,5,Amazon,Amazon


In [5]:
# Load the point-of-interest records; "POI Name" holds the free-text names
# that need to be mapped onto the master store list.
poiname_df = pd.read_csv("POI_Name.csv")
# Preview the first rows to check the columns that were read.
poiname_df.head()

Unnamed: 0,Record,Country,Country Secondary Subdivision,Country Subdivision,Subdivision Name,Postal Code,Address,Local Name,Municipality,Municipality Subdivision,entryPoints_0_functions_0,Latitude,Longtitude,poi_brands_1_name,POI Name,URL,Source,Store Name
0,813850,United States,Fairfield,CT,Connecticut,06830-7232,"1 Windrose Way, Greenwich, CT 06830",Greenwich,Greenwich,Downtown Greenwich,,41.01518,-73.61,,Uranium Markets,www.uraniummarkets.com,Azure Maps,
1,883850,United States,Fairfield,CT,Connecticut,06830-7138,"4 Davenport Avenue, Greenwich, CT 06830",Greenwich,Greenwich,Downtown Greenwich,,41.01705,-73.62224,,Arguimbau & Co,www.arguimbau-co.com,Azure Maps,
2,923850,United States,Fairfield,CT,Connecticut,06830-6650,"140 Hamilton Avenue, Greenwich, CT 06830",Greenwich,Greenwich,Downtown West-Chickahominy,,41.01875,-73.63877,,Corner Market,cornermarketgreenwich.business.site,Azure Maps,
3,863850,United States,Fairfield,CT,Connecticut,06830-6523,"372 Greenwich Avenue, Greenwich, CT 06830",Greenwich,Greenwich,Downtown Greenwich,,41.0228,-73.62484,,Harvest,,Azure Maps,
4,853850,United States,Fairfield,CT,Connecticut,06830-6507,"244 Greenwich Avenue, Greenwich, CT 06830",Greenwich,Greenwich,Downtown Greenwich,,41.02657,-73.62596,,Lucky Brand,www.luckybrand.com,Azure Maps,


### Aim: match the `POI Name` column of `poiname_df` to the `Store Name` column of `storename_df`

In [6]:
# Build the normalised matching key for every master store name.
# Series.map runs clean_string on the one column it needs instead of building
# a full row object per record, and it still returns an assignable Series
# when the frame has no rows.
storename_df["clean_name"] = storename_df["Store Name"].map(clean_string)

In [7]:
# Drop POI rows that have no name: clean_string() calls .lower() on the value
# and therefore needs a real string.
# The frame is modified in place and keeps its original index labels (no
# reset), so later label-based lookups still refer to the raw row labels.
poiname_df.dropna(axis=0, subset=["POI Name"], inplace=True)

In [8]:
# Build the normalised matching key for every POI name.
# Series.map runs clean_string on the one column it needs instead of building
# a full row object per record, and it still returns an assignable Series
# when the frame has no rows.
poiname_df["clean_name"] = poiname_df["POI Name"].map(clean_string)

In [9]:
# Unique cleaned store names that serve as the candidate pool for matching.
# A bare set of strings iterates in a different order on each kernel start,
# so the names are sorted to keep the list reproducible between runs.
correct_list = sorted(set(storename_df["clean_name"].tolist()))

In [10]:
# Cleaned POI names as a plain Python list, one entry per remaining POI row.
poiname_list = list(poiname_df["clean_name"])

In [11]:
# Sanity check: every remaining POI row should carry a distinct Record id.
poiname_df["Record"].is_unique

True

In [12]:
#df1 = poiname_df[poiname_df["clean_name"].isin(correct_list)][["Record", "POI Name", "clean_name"]]

In [13]:
# Spot-check the cleaned name stored under index label 106.
poiname_df.loc[106, "clean_name"]

'stopshoppharmacy'

In [14]:
# For each cleaned POI name keep at most one master store name whose
# similarity ratio reaches 0.70; the value is an empty list when none does.
# Repeated POI names collapse onto a single dictionary key.
d = {
    poi: difflib.get_close_matches(poi, correct_list, n=1, cutoff=0.70)
    for poi in poiname_list
}

In [15]:
# Flatten the lookup into a two-column frame: the cleaned POI name and its
# best match, with an empty string where no candidate passed the cutoff.
df_match = pd.DataFrame({
    "poi_name": list(d),
    "matched_name": [hits[0] if hits else "" for hits in d.values()],
})
df_match

Unnamed: 0,poi_name,matched_name
0,uraniummarkets,acmemarkets
1,arguimbauco,
2,cornermarket,centralmarket
3,harvest,
4,luckybrand,
...,...,...
378,indianfoodspice,
379,pricechopper,pricechopper
380,blackgoldfoodmart,
381,americaneurofoods,


In [16]:
# Step 1: attach the best match to every POI row through its cleaned name.
mid_df1 = poiname_df.merge(
    df_match, how="left", left_on="clean_name", right_on="poi_name"
)
# Step 2: pull in the master store record for the matched name. Column names
# present on both sides receive the default _x / _y suffixes.
mid_df2 = mid_df1.merge(
    storename_df, how="left", left_on="matched_name", right_on="clean_name"
)

In [17]:
# Inspect the merged column names; those present in both inputs carry an
# _x (POI side) or _y (store side) suffix.
mid_df2.columns

Index(['Record', 'Country', 'Country Secondary Subdivision',
       'Country Subdivision', 'Subdivision Name', 'Postal Code', 'Address',
       'Local Name', 'Municipality', 'Municipality Subdivision',
       'entryPoints_0_functions_0', 'Latitude', 'Longtitude',
       'poi_brands_1_name', 'POI Name', 'URL', 'Source', 'Store Name_x',
       'clean_name_x', 'poi_name', 'matched_name', 'Number', 'Store Name_y',
       'Owner', 'clean_name_y'],
      dtype='object')

In [18]:
#mid_df2

In [19]:
# Give the store-side name column a descriptive label.
mid_df2 = mid_df2.rename(columns={"Store Name_y": "Master Store Name"})

In [20]:
# Show only the POI rows that were paired with a master store name.
mid_df2.loc[mid_df2["Master Store Name"].notna(), ["POI Name", "Master Store Name"]]

Unnamed: 0,POI Name,Master Store Name
0,Uranium Markets,ACME Markets
2,Corner Market,Central Market
5,ACME Markets,ACME Markets
6,Super Stop & Shop,Stop & Shop
10,Fresh Fields Market,Fresh Thyme Market
...,...,...
515,Price Chopper,Price Chopper
516,ShopRite Of Brookfield,ShopRite
517,ShopRite Of Brookfield,ShopRite
520,Shop Rite,ShopRite


In [21]:
# Prefer the master store name; fall back to the raw POI name when no match
# was found.
mid_df2["Correct Name"] = mid_df2["Master Store Name"].fillna(mid_df2["POI Name"])
mid_df2.head()

Unnamed: 0,Record,Country,Country Secondary Subdivision,Country Subdivision,Subdivision Name,Postal Code,Address,Local Name,Municipality,Municipality Subdivision,...,Source,Store Name_x,clean_name_x,poi_name,matched_name,Number,Master Store Name,Owner,clean_name_y,Correct Name
0,813850,United States,Fairfield,CT,Connecticut,06830-7232,"1 Windrose Way, Greenwich, CT 06830",Greenwich,Greenwich,Downtown Greenwich,...,Azure Maps,,uraniummarkets,uraniummarkets,acmemarkets,2.0,ACME Markets,,acmemarkets,ACME Markets
1,883850,United States,Fairfield,CT,Connecticut,06830-7138,"4 Davenport Avenue, Greenwich, CT 06830",Greenwich,Greenwich,Downtown Greenwich,...,Azure Maps,,arguimbauco,arguimbauco,,,,,,Arguimbau & Co
2,923850,United States,Fairfield,CT,Connecticut,06830-6650,"140 Hamilton Avenue, Greenwich, CT 06830",Greenwich,Greenwich,Downtown West-Chickahominy,...,Azure Maps,,cornermarket,cornermarket,centralmarket,64.0,Central Market,H-E-B,centralmarket,Central Market
3,863850,United States,Fairfield,CT,Connecticut,06830-6523,"372 Greenwich Avenue, Greenwich, CT 06830",Greenwich,Greenwich,Downtown Greenwich,...,Azure Maps,,harvest,harvest,,,,,,Harvest
4,853850,United States,Fairfield,CT,Connecticut,06830-6507,"244 Greenwich Avenue, Greenwich, CT 06830",Greenwich,Greenwich,Downtown Greenwich,...,Azure Maps,,luckybrand,luckybrand,,,,,,Lucky Brand


In [22]:
# Remove the helper columns that were only needed to perform the matching.
final_df = mid_df2.drop(
    ["poi_name", "matched_name", "clean_name_x", "clean_name_y"],
    axis="columns",
)

In [23]:
#final_df.to_csv("final_matched.csv", index = False)