In [37]:
#%%timeit
import os
import re

import numpy as np
import pandas as pd
from fuzzywuzzy import process, fuzz

# Words removed from the cleaned 'Particulars' text (matched as whole words further down).
# Each entry appears exactly once; order follows first appearance in the original list.
stop_words = [
    # transaction / direction words
    'BY', 'FOR', 'OF', 'REVERSAL', 'RETURN', 'TRANSFER', 'NEFT', 'FROM', 'AGAINST', 'TO', 'DEBIT', 'THROUGH',
    'CHEQUE', 'FOREIGN', 'NO', 'RTGS', 'UTR', 'INB',
    # month names and abbreviations
    'JAN', 'JANUARY', 'FEB', 'FEBRUARY', 'MAR', 'MARCH', 'APR', 'APRIL', 'MAY', 'JUN', 'JUNE', 'JUL',
    'JULY', 'AUG', 'AUGUST', 'SEP', 'SEPT', 'SEPTEMBER', 'OCT', 'OCTOBER', 'NOV', 'NOVEMBER', 'DEC', 'DECEMBER',
    # misc. banking abbreviations
    'CLOSURE', 'NRTGS', 'IN', 'MR', 'MRS', 'C', 'AC', 'FEES', 'CASH', 'WITHDRAWAL', 'CLG', 'TRF',
    'NEFT_IN', 'NEFT_OUT', 'NEFT_CHRG', 'BILLDESK', 'CREDIT', 'TFR', 'TT', 'TR', 'TFRR', 'TF', 'TL',
    'MARGIN', 'ETFR', 'B/F', 'T/F', 'BILL ID', 'IMPS', 'DR', 'TXT',
    # bank / file codes
    'SFMS', 'SCBL', 'SBIN', 'ICIC', 'ICICI', 'HDFC', 'ORBC', 'MAHB', 'PUNB', 'BARB', 'UTIB', 'XLSX', 'BULK',
]

#import spreadsheet to work on
f_name = input("Enter file name: ")

# Read the workbook once (first row is the header) and keep it as the untouched reference copy.
df_def = pd.read_excel(f_name, header=0)
#add ID column and make it the index
# ID = zero-based row position + 2 (presumably the spreadsheet row number with the
# header on row 1 -- confirm).
df_def.insert(0, 'ID', df_def.index + 2)
df_def = df_def.set_index('ID')

# Working frame: only the 'Particulars' column, sharing the same ID index.
# .copy() keeps later edits to df from touching df_def.
df = df_def[['Particulars']].copy()

# Normalise the raw 'Particulars' text. The .str methods leave missing values as NaN,
# so blanks are filled in only after all string steps have run.

#remove whitespaces, convert everything to uppercase
particulars = df['Particulars'].str.strip().str.upper()

#replace special characters with a space
# Anything that is not a word character, whitespace or '&' becomes a single space,
# so text on either side of the character stays separated.
particulars = particulars.str.replace(r'[^&\w\s]', ' ', regex=True)

#remove alphanumeric and numeric
# First drop runs of word characters that end in a digit, then any digits still left.
particulars = particulars.str.replace(r'\w+\d+', '', regex=True)
particulars = particulars.str.replace(r'\d+', '', regex=True)

#replace na values with single space
df['Particulars'] = particulars.fillna(" ")

#modify for specific keywords
# Each rule is (keyword_groups, label). A row matches when EVERY group has at least one
# of its keywords somewhere in the text (case-insensitive substring search); the whole
# cell is then replaced by the label. Rules run top to bottom, and later rules see the
# labels written by earlier ones.
# NOTE(review): matching is by substring, so short keys such as 'INT' or 'CAR' also hit
# longer words that contain them -- confirm this is intended.
keyword_rules = [
    ([('INT', 'INTEREST')], 'Interest'),
    ([('INB', 'EOD')], 'Interbank Transfer'),
    ([('GST',)], 'GST Refund'),
    ([('SMS',)], 'SMS Charges'),
    ([('SALARY', 'WAGES', 'WAGE')], 'Salary & Wages'),
    ([('FOREX',)], 'Foreign Currency Conversion Tax'),
    ([('CAR',)], 'Maintainence Charges'),
    # Both words are required: a bare 'DEPOSIT' is not a cash deposit.
    ([('CASH',), ('DEPOSIT',)], 'Cash Deposit'),
    ([('WCL',)], 'Repayment of WDCL'),
    ([('FRUIT MASTER', 'FRUIT MSTR')], 'Fruit Master Agro'),
    ([('ANAMI JEWELLERS',)], 'Anami Jewellers Private Limited'),
    ([('BCCALC',)], 'Bccalc Recovery Charges'),
]
for keyword_groups, label in keyword_rules:
    matches = pd.Series(True, index=df.index)
    for group in keyword_groups:
        matches &= df['Particulars'].str.contains('|'.join(group), case=False)
    df.loc[matches, 'Particulars'] = label

# Strip the stop words, trim the result and apply title case.
# Every stop word is wrapped in \b...\b so only whole words are removed; the match is
# case-sensitive, so only text spelled exactly like the list entries is affected.
pat = '|'.join([r"\b" + word + r"\b" for word in stop_words])
df['Particulars'] = (
    df['Particulars']
    .str.replace(pat, '', regex=True)   # remove specific unwanted words
    .str.strip()                        # remove whitespaces left at the ends
    .str.title()                        # text formatting
)

#define all unidentified cases
# A remark is unusable when nothing meaningful is left after cleaning: an empty string,
# a single character, or whitespace only. Non-string cells are left untouched.
# Done on the column itself rather than with DataFrame.applymap, which recent pandas
# versions deprecate.
unusable = df['Particulars'].map(
    lambda text: isinstance(text, str) and (len(text) <= 1 or text.isspace())
).astype(bool)
df.loc[unusable, 'Particulars'] = '- Unidentified -'

# Rows whose text contains 'Does not' (any case) are treated as unidentified as well.
df.loc[df['Particulars'].str.contains('Does not', case=False), 'Particulars'] = '- Unidentified -'

# The cleaned column is exported under the name 'Remarks'; join it onto the original
# sheet data by ID so each row carries both its original columns and its remark.
df = df.rename(columns={'Particulars': 'Remarks'})
remark_df = df_def.merge(df, on='ID')

# Name used for the export step.
# NOTE(review): this is a second name for the same DataFrame, not an independent copy,
# so later changes made through remark_df are visible through final_remark_df too.
final_remark_df = remark_df

# --- entry resolution --- #

# Minimum token-sort similarity for two remarks to be reported as likely variants.
resolution_threshold = 90
similar_list = []  # not used anywhere in this cell
unique_remarks = remark_df['Remarks'].unique().tolist()

# Score every ordered pair of unique remarks (so each pair is seen in both directions).
scored_pairs = (
    (left, right, fuzz.token_sort_ratio(left, right))
    for left in unique_remarks
    for right in unique_remarks
)

# Keep pairs at or above the threshold, formatted as "left,right,score".
# A score of exactly 100 is skipped -- presumably to drop a remark paired with itself;
# it also hides any two different remarks that happen to score 100.
resolved_list = sorted(
    str(left + ',' + right + ',' + str(score))
    for left, right, score in scored_pairs
    if score >= resolution_threshold and score != 100
)
print(resolved_list)


# --- totals sheet --- #


#clean the numerical columns
def _to_amount(column):
    """Return *column* as floats, with anything that is not a number turned into 0.

    Text cells have thousands separators (',') and surrounding blanks removed before
    parsing, so '1,234.50' becomes 1234.5. Cells that still cannot be parsed as a
    number (words, blanks, missing values) become 0. Numeric cells are kept as they are.
    """
    cleaned = column.map(
        lambda cell: cell.replace(',', '').strip() if isinstance(cell, str) else cell
    )
    # errors='coerce' turns unparseable cells into NaN, which are then zeroed.
    return pd.to_numeric(cleaned, errors='coerce').fillna(0).astype(float)


for amount_col in ('WITHDRAWALS', 'DEPOSITS'):
    remark_df[amount_col] = _to_amount(remark_df[amount_col])

#total deposits/withdrawal
# Positions of the needed columns inside each row of remark_df.values.
remark_pos = remark_df.columns.get_loc("Remarks")
withdrawal_pos = remark_df.columns.get_loc('WITHDRAWALS')
deposit_pos = remark_df.columns.get_loc('DEPOSITS')

# remark -> running totals, in order of first appearance of each remark.
sum_dict = {}
for row in remark_df.values:
    bucket = sum_dict.setdefault(row[remark_pos], {"Total withdrawal": 0, "Total deposit": 0})
    bucket['Total withdrawal'] += row[withdrawal_pos]
    bucket['Total deposit'] += row[deposit_pos]

#totals dataframe
# One row per remark, with columns 'Total withdrawal' and 'Total deposit'.
final_totals_df = pd.DataFrame.from_dict(sum_dict, orient='index')


# Output file: the input path with its extension replaced by ' - processed.xlsx'.
# splitext only removes the final extension, so dots elsewhere in the path are kept.
out_path = os.path.splitext(f_name)[0] + ' - processed.xlsx'

# The context manager saves and closes the workbook on exit; it replaces the explicit
# writer.save() call, which newer pandas releases no longer provide.
with pd.ExcelWriter(out_path, engine='xlsxwriter') as writer:
    final_remark_df.to_excel(writer, sheet_name='Remarks')
    final_totals_df.to_excel(writer, sheet_name='Totals')

#totals_df.head(50)
#remark_df.head(50)

Enter file name: KOF Input.xlsx
['Ahad Hotel,Ahad Hotels,95', 'Ahad Hotels,Ahad Hotel,95', 'Faiasa,Faiasal,92', 'Faiasal,Faiasa,92', 'Faiasal,Faisal,92', 'Faisaal,Faisal,92', 'Faisal,Faiasal,92', 'Faisal,Faisaal,92', 'Fayaz Ahmad,Fiyaz Ahmad,91', 'Fiyaz Ahmad,Fayaz Ahmad,91', 'Frruit Master,Fruitmaster,92', 'Frruit Master,Fruiut Master,92', 'Fruitmaster,Frruit Master,92', 'Fruitmaster,Fruiut Master,92', 'Fruiut Master,Frruit Master,92', 'Fruiut Master,Fruitmaster,92', 'Gh Mohamamd,Gh Mohammad,91', 'Gh Mohammad,Gh Mohamamd,91', 'Green Valeey,Green Valey,96', 'Green Valeey,Green Valley,92', 'Green Valey,Green Valeey,96', 'Green Valey,Green Valley,96', 'Green Valley,Green Valeey,92', 'Green Valley,Green Valey,96', 'Kahmir,Kashmir,92', 'Kashmir Orchard,Kashmir Orchards,97', 'Kashmir Orchard,Kashmirorchard,97', 'Kashmir Orchards,Kashmir Orchard,97', 'Kashmir Orchards,Kashmirorchard,93', 'Kashmir,Kahmir,92', 'Kashmirorchard,Kashmir Orchard,97', 'Kashmirorchard,Kashmir Orchards,93', 'Lands En