In [18]:
import pandas as pd
import numpy as np
import re

# Noise tokens removed from the cleaned, upper-cased 'Particulars' text before
# it is used as a remark.
# Every entry must be followed by a comma: Python silently concatenates
# adjacent string literals, which is how 'OF', 'SEPT' and 'OCT' used to be lost
# (they were stored as 'OFREVERSAL' and 'SEPTOCT').
# NOTE(review): 'B/F' and 'T/F' contain punctuation, and the cleaning step
# strips punctuation from the text before stop words are removed, so these two
# entries presumably never match (the text reads 'B F' / 'T F' by then) -
# confirm and adjust if those tokens are meant to be dropped.
stop_words = [
    'BY', 'FOR', 'OF', 'REVERSAL', 'RETURN', 'TRANSFER', 'NEFT', 'FROM',
    'AGAINST', 'TO', 'DEBIT', 'THROUGH', 'CHEQUE', 'FOREIGN', 'NO', 'RTGS',
    'UTR', 'INB',
    'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUNE', 'JUL', 'JULY', 'AUG',
    'SEP', 'SEPT', 'OCT', 'NOV', 'DEC',
    'CLOSURE', 'NRTGS', 'IN', 'MR', 'MRS', 'C', 'AC', 'FEES', 'CASH',
    'WITHDRAWAL', 'CLG', 'TRF', 'NEFT_IN',
    'BILLDESK', 'CREDIT', 'TFR', 'TT', 'TR', 'TFRR', 'TF', 'TL', 'MARGIN',
    'ETFR', 'B/F', 'T/F', 'BILL ID',
]

#import spreadsheet to work on
# input() already returns a str; surrounding whitespace is dropped so a stray
# space typed after the name does not break the file lookup.
f_name = input("Enter file name: ").strip()

# Read the workbook once. df_def keeps every original column untouched so the
# generated remarks can be merged back onto it later.
df_def = pd.read_excel(f_name, header=0)
#add ID column and make it the index
# (row position + 2 - presumably the spreadsheet row number: 1-based rows plus
# one header row; confirm against the input layout)
df_def.insert(0, 'ID', df_def.index + 2)
df_def = df_def.set_index('ID')

# Working frame: only the 'Particulars' column, sharing the same ID index.
# The explicit copy keeps later edits to df from touching df_def.
df = df_def[['Particulars']].copy()

# Normalise the free-text 'Particulars' column.
# Every replacement passes regex=True explicitly: pandas 2.0 changed the
# default of Series.str.replace to regex=False, under which these patterns are
# treated as literal text and nothing gets cleaned. The patterns are raw
# strings so that '\w' / '\d' are not parsed as (invalid) string escapes.
df['Particulars'] = (
    df['Particulars']
    #remove whitespaces
    .str.strip()
    #convert everything to uppercase
    .str.upper()
    #add spaces next to special characters (so removing them cannot glue
    #two words together); '&' is deliberately kept
    .str.replace(r'([^&\w\s])', r' \1', regex=True)
    #remove special characters
    .str.replace(r'([^\w\s\&])', '', regex=True)
    #remove alphanumeric and numeric
    .str.replace(r'\w+\d+', '', regex=True)
    .str.replace(r'\d+', '', regex=True)
    #replace na values with single space (missing cells, and cells the .str
    #methods could not process, are NaN at this point)
    .fillna(" ")
)

#modify for specific keywords
# Each rule is (required groups, remark). A row matches a rule when, for EVERY
# group, its text contains at least one of that group's keywords
# (case-insensitive substring match). Matching rows have their whole text
# replaced by the remark.
# Rules run in order and each one looks at the current text, so a later rule
# can overwrite the remark set by an earlier one.
# NOTE(review): matching is by substring, so short keywords such as 'INT' or
# 'CAR' also hit longer words that contain them - confirm this is intended.
keyword_rules = [
    ([('INT', 'INTEREST')], 'Interest'),
    ([('INB', 'EOD')], 'Interbank Transfer'),
    ([('GST',)], 'GST Refund'),
    ([('SMS',)], 'SMS Charges'),
    ([('SALARY', 'WAGES', 'WAGE')], 'Salary & Wages'),
    ([('FOREX',)], 'Foreign Currency Conversion Tax'),
    ([('CAR',)], 'Maintainence Charges'),
    # Both words are required. The previous `'CASH' and 'DEPOSIT'` expression
    # evaluated to just 'DEPOSIT', so any deposit was labelled a cash deposit.
    ([('CASH',), ('DEPOSIT',)], 'Cash Deposit'),
    ([('WCL',)], 'Repayment of WDCL'),
    ([('FRUIT MASTER', 'FRUIT MSTR')], 'Fruit Master Agro'),
    ([('ANAMI JEWELLERS',)], 'Anami Jewellers Private Limited'),
    ([('BCCALC',)], 'Bccalc Recovery Charges'),
]

for required_groups, remark in keyword_rules:
    # One boolean mask per group; keywords are escaped so they are always
    # matched literally, and na=False treats missing cells as "no match".
    group_masks = [
        df['Particulars'].str.contains(
            '|'.join(re.escape(word) for word in alternatives),
            case=False, regex=True, na=False)
        for alternatives in required_groups
    ]
    # A row qualifies only when all of the rule's groups matched.
    matched = np.logical_and.reduce(group_masks)
    df.loc[matched, 'Particulars'] = remark

#remove specific unwanted words
# Whole-word, case-sensitive match: the mixed-case remarks assigned by the
# keyword rules are therefore left alone. Words are escaped so punctuation in
# a stop word is matched literally, and regex=True is explicit because pandas
# 2.0 made literal replacement the default for Series.str.replace.
pat = '|'.join(r"\b{}\b".format(re.escape(x)) for x in stop_words)
df['Particulars'] = df['Particulars'].str.replace(pat, '', regex=True)

#remove whitespaces
# Removing words leaves runs of spaces inside the text; collapse them so the
# same name does not end up as several distinct remarks, then trim the ends.
df['Particulars'] = (
    df['Particulars'].str.replace(r'\s+', ' ', regex=True).str.strip()
)

#text formatting
df['Particulars'] = df['Particulars'].str.title()

#define all unidentified cases
# A remark is unidentified when nothing useful is left (empty or a single
# character) or when it contains 'Does not' (case-insensitive).
# This replaces DataFrame.applymap (deprecated since pandas 2.1) with column
# operations. The old comparison against the literal text '(.) ' was dropped:
# it ends in a space, so no value could equal it after the strip above.
unidentified = (
    (df['Particulars'].str.len() <= 1)
    | df['Particulars'].str.contains('Does not', case=False, na=False)
)
df.loc[unidentified, 'Particulars'] = '- Unidentified -'

#change edited columns name to Remarks
df = df.rename(columns={'Particulars': 'Remarks'})

#export remarks as excel
# Attach the generated remarks to the untouched original columns via the
# shared 'ID' index.
remark_df = pd.merge(df_def, df, on='ID')

# Drop only the final extension. Splitting at the FIRST dot truncated names
# such as 'report.v2.xlsx' to 'report' and turned './book.xlsx' into an empty
# stem.
output_stem = f_name.rsplit(".", 1)[0]
remark_df.to_excel(output_stem + ' - remarks_output.xlsx')

#unique remarks and total deposits/withdrawal
# One row per distinct remark, in order of first appearance (sort=False), with
# the summed 'WITHDRAWALS' and 'DEPOSITS' columns.
# groupby(...).sum() skips blank (NaN) amounts, so a single empty cell no
# longer turns a remark's whole total into NaN as the row-by-row `+=` did.
totals_df = (
    remark_df
    .groupby('Remarks', sort=False)[['WITHDRAWALS', 'DEPOSITS']]
    .sum()
    .rename(columns={'WITHDRAWALS': 'Total withdrawal',
                     'DEPOSITS': 'Total deposit'})
    # keep the index unnamed, as it was when built from a plain dict
    .rename_axis(None)
)

# Same totals as {remark: {'Total withdrawal': ..., 'Total deposit': ...}};
# kept so anything that reads sum_dict after this point still works.
sum_dict = totals_df.to_dict(orient='index')

totals_df.head(50)
#remark_df.head(50)

Enter file name: KOF Input.xlsx


Unnamed: 0,Total withdrawal,Total deposit
B F,250,250
Nilkamal,250,250
Charges,17750,17750
Fruit Master Agro,13250,13250
Lpc,250,250
Interest,24000,24000
Bashir,500,500
Tax,250,250
Fruit,5750,5750
Nilkamal Ltd,250,250
