In [2]:
import pandas as pd
import numpy as np
from fuzzywuzzy import process, fuzz

In [3]:
df = pd.read_excel("KOF Input - remarks_output.xlsx")
df.head(5)

Unnamed: 0,ID,Bank Name,A/C No.,A/C Type,DATE,Particulars,CHQ.NO./REF.NO.,WITHDRAWALS,DEPOSITS,BALANCE,Remarks
0,2,J&K Bank A/c,249,CC Scheme,2015-10-20,B/F,,250,250,250,B F
1,3,J&K Bank A/c,249,CC Scheme,2015-10-20,RTGS-NILKAMAL LIMITEDJAKAH15293031129,618607 JAKA,250,250,250,Nilkamal
2,4,J&K Bank A/c,249,CC Scheme,2015-10-20,RTGS-CHARGES-JAKAH15293031129CJAKAH15293031129,CJAKAH152930,250,250,250,Charges
3,5,J&K Bank A/c,249,CC Scheme,2015-10-20,FRUIT MASTER,618608,250,250,250,Fruit Master Agro
4,6,J&K Bank A/c,249,CC Scheme,2015-10-20,FRUIT MASTER,618609,250,250,250,Fruit Master Agro


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 974 entries, 0 to 973
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   ID               974 non-null    int64         
 1   Bank Name        974 non-null    object        
 2   A/C No.          974 non-null    int64         
 3   A/C Type         974 non-null    object        
 4   DATE             974 non-null    datetime64[ns]
 5   Particulars      973 non-null    object        
 6   CHQ.NO./REF.NO.  669 non-null    object        
 7   WITHDRAWALS      974 non-null    int64         
 8   DEPOSITS         974 non-null    int64         
 9   BALANCE          974 non-null    int64         
 10  Remarks          974 non-null    object        
dtypes: datetime64[ns](1), int64(5), object(5)
memory usage: 83.8+ KB


In [5]:
for col in df[['Remarks']]:
    df[col] = df[col].str.strip()
    print('Number of unique values in ' + str(col) +' is ' + str(df[col].nunique()))

Number of unique values in Remarks is 252


In [6]:
unique_remarks = df['Remarks'].unique().tolist()
sorted(unique_remarks)[:20]

['- Unidentified -',
 'Ab Gaffar',
 'Ab Gaffar Parray',
 'Ab Gani Parray',
 'Ab Majeed',
 'Ab Majid',
 'Ab Rehman',
 'Agro Fresh',
 'Ahad',
 'Ahad Hotel',
 'Ahad Hotels',
 'Ahm',
 'Ajaz',
 'Ajaz Ah Mir',
 'Alim',
 'Aliya',
 'Amb Charges',
 'Amrapali',
 'Anami Jewellers Private Limited',
 'Anil']

In [7]:
process.extract('Amb', unique_remarks, scorer=fuzz.token_sort_ratio)[:100]

[('Ahm', 67), ('Alim', 57), ('Samina', 44), ('Shamim', 44), ('Ghulam', 44)]

In [8]:
process.extract("Fruit Master Agro", unique_remarks, scorer=fuzz.token_sort_ratio)

[('Fruit Master Agro', 100),
 ('Frruit Master', 80),
 ('Fruiut Master', 80),
 ('Fruitmaster', 79),
 ('Agro Fresh', 59)]

In [9]:
score_sort = [(x,) + i
             for x in unique_remarks 
             for i in process.extract(x, unique_remarks, scorer=fuzz.token_sort_ratio)]

In [10]:
similarity_sort = pd.DataFrame(score_sort, columns=['remarks_sort','match_sort','score_sort'])
similarity_sort.head(100)

Unnamed: 0,remarks_sort,match_sort,score_sort
0,B F,B F,100
1,B F,Ab Gaffar,50
2,B F,Faisal Burza,40
3,B F,Min Bal,40
4,B F,Ab Majid,36
...,...,...,...
95,Himalyan,Himalyan,100
96,Himalyan,Aliya,62
97,Himalyan,Hassan,57
98,Himalyan,Salia Ikram,53


In [11]:
similarity_sort['sorted_remarks_sort'] = np.minimum(similarity_sort['remarks_sort'], similarity_sort['match_sort'])
similarity_sort.head(100)

Unnamed: 0,remarks_sort,match_sort,score_sort,sorted_remarks_sort
0,B F,B F,100,B F
1,B F,Ab Gaffar,50,Ab Gaffar
2,B F,Faisal Burza,40,B F
3,B F,Min Bal,40,B F
4,B F,Ab Majid,36,Ab Majid
...,...,...,...,...
95,Himalyan,Himalyan,100,Himalyan
96,Himalyan,Aliya,62,Aliya
97,Himalyan,Hassan,57,Hassan
98,Himalyan,Salia Ikram,53,Himalyan


In [12]:
high_score_sort = similarity_sort[(similarity_sort['score_sort'] >= 80) &
                                      (similarity_sort['remarks_sort'] != similarity_sort['match_sort']) &
                                      (similarity_sort['sorted_remarks_sort'] != similarity_sort['match_sort'])]

In [13]:
high_score_sort.groupby(['remarks_sort','score_sort']).agg(
                        {'match_sort': ', '.join}).sort_values(
                        ['score_sort'], ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,match_sort
remarks_sort,score_sort,Unnamed: 2_level_1
M M,100,M&M
M & M,100,"M M, M&M"
Lands End Automotive,98,Lands End Automotives
Mohammad Siddique,97,Mohammad Sidique
Kashmir Orchard,97,"Kashmirorchard, Kashmir Orchards"
...,...,...
M & M,80,Mm
M M,80,Mm
M&M,80,Mm
Mh Sidiq,80,Mohd Sidique


In [14]:
score_set = [(x,) + i
             for x in unique_remarks
             for i in process.extract(x, unique_remarks, scorer=fuzz.token_set_ratio)]

In [15]:
similarity_set = pd.DataFrame(score_set, columns=['remarks_set','match_set','score_set'])
similarity_set['sorted_remarks_set'] = np.minimum(similarity_set['remarks_set'], similarity_set['match_set'])

#Pick values
high_score_set = similarity_set[(similarity_set['score_set'] >= 80) & 
                                    (similarity_set['remarks_set'] != similarity_set['match_set']) & 
                                    (similarity_set['sorted_remarks_set'] != similarity_set['match_set'])]

#Drop the representative value column
high_score_set = high_score_set.drop('sorted_remarks_set',axis=1).copy()

In [16]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
high_score_set.groupby(['match_set','score_set']).agg(
                       {'remarks_set': ', '.join}).sort_values(
                       ['score_set'], ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,remarks_set
match_set,score_set,Unnamed: 2_level_1
Ab Gaffar Parray,100,Ab Gaffar
Hassan Energy,100,Hassan
Mudasir Bashir Mir,100,"Bashir, Mudasir Bashir, Mudasir"
Mudasir Bashir,100,"Bashir, Mudasir"
Mudasir Bashi\n R Mir,100,Mudasir
Met Kashmir,100,Kashmir
Majid,100,Ab Majid
Maintainence Charges,100,Charges
M&M Cottage,100,"M M, M&M, M & M"
M&M,100,"M M, M Mchq, M Yousuf, M & M"
