In [1]:
! pip install transformers

[0m

# Import Dependencies 

In [2]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from transformers import pipeline

## Set up pipeline 

In [3]:
classifier = pipeline("zero-shot-classification", model='facebook/bart-large-mnli', device=0 )

Downloading:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

### Import Dataset

In [4]:
headlines = pd.read_csv('../input/personal-transaction-data/HDFC_COMMON_CLEAN_V1 (1) (1) (1).csv')

In [5]:
df_head = headlines.head(100)
headlines.head()

Unnamed: 0,TRANSACTION DATE,PARTICULARS,WITHDRAWAL AMT,DEPOSIT AMT,BALANCE AMT
0,1/1/2019,POS 541919XXXXXX4335 ASTER MEDICITY POS DEBIT,700.0,,438245.77
1,1/1/2019,POS 541919XXXXXX4335 WESTSIDE POS DEBIT,7291.0,,430954.77
2,1/1/2019,POS 541919XXXXXX4335 EASY DAY POS DEBIT,208.0,,430746.77
3,2/1/2019,20190102032039502112/PAYTMWALLETLOADING,2000.0,,428746.77
4,2/1/2019,50100034192891-TPT-RENT,11838.0,,416908.77


In [6]:
samples = headlines[1:100]

In [7]:
strings = samples.PARTICULARS.values
strings[:3]

array(['POS 541919XXXXXX4335 WESTSIDE POS DEBIT ',
       'POS 541919XXXXXX4335 EASY DAY POS DEBIT ',
       '20190102032039502112/PAYTMWALLETLOADING '], dtype=object)

In [8]:
sequence = list(strings)
sequence[:3]

['POS 541919XXXXXX4335 WESTSIDE POS DEBIT ',
 'POS 541919XXXXXX4335 EASY DAY POS DEBIT ',
 '20190102032039502112/PAYTMWALLETLOADING ']

### Categories

In [9]:
candidate_labels = ['Miscellaneous', 'Transport', 'Entertainment', 'Food', 'Groceries', 'Health', 'Housing', 
                    'Personal Care', 'Phone & Internet', 'Utilities', 'Investment', 'Overdraft Repayment', 
                    'Savings', 'Transfer', 'Atm withdrawal', 'Gift & Donation', 'Online Transaction', 
                    'Payment link', 'Shopping', 'Travel', 'Education', 'Betting', 'Tax', 'Bank Charges']

In [10]:
df_head.head()

Unnamed: 0,TRANSACTION DATE,PARTICULARS,WITHDRAWAL AMT,DEPOSIT AMT,BALANCE AMT
0,1/1/2019,POS 541919XXXXXX4335 ASTER MEDICITY POS DEBIT,700.0,,438245.77
1,1/1/2019,POS 541919XXXXXX4335 WESTSIDE POS DEBIT,7291.0,,430954.77
2,1/1/2019,POS 541919XXXXXX4335 EASY DAY POS DEBIT,208.0,,430746.77
3,2/1/2019,20190102032039502112/PAYTMWALLETLOADING,2000.0,,428746.77
4,2/1/2019,50100034192891-TPT-RENT,11838.0,,416908.77


In [11]:
hypothesis_template = "This text is for financial transaction {}."
output = classifier(sequence, candidate_labels, multi_label = True, hypothesis_template=hypothesis_template)

In [12]:
output[0]

{'sequence': 'POS 541919XXXXXX4335 WESTSIDE POS DEBIT ',
 'labels': ['Transfer',
  'Payment link',
  'Travel',
  'Atm withdrawal',
  'Housing',
  'Investment',
  'Transport',
  'Savings',
  'Betting',
  'Utilities',
  'Bank Charges',
  'Shopping',
  'Health',
  'Online Transaction',
  'Education',
  'Miscellaneous',
  'Overdraft Repayment',
  'Tax',
  'Entertainment',
  'Phone & Internet',
  'Groceries',
  'Gift & Donation',
  'Food',
  'Personal Care'],
 'scores': [0.9694666266441345,
  0.9528462886810303,
  0.9031482934951782,
  0.896047055721283,
  0.8867105841636658,
  0.8551644682884216,
  0.8338189125061035,
  0.8053407073020935,
  0.7865632176399231,
  0.7635946273803711,
  0.7514654994010925,
  0.701482892036438,
  0.6928688287734985,
  0.6047329306602478,
  0.4976731538772583,
  0.3006778061389923,
  0.27034661173820496,
  0.2637624144554138,
  0.2633073925971985,
  0.2603875994682312,
  0.24869772791862488,
  0.11175412684679031,
  0.07313081622123718,
  0.07170102000236511]}

In [13]:
op = pd.DataFrame(output)
op.tail(10)

Unnamed: 0,sequence,labels,scores
89,SALARY JAN 19 WEBENZA INDIA PRIVATE LIMI TED,"[Online Transaction, Housing, Shopping, Phone ...","[0.9359395503997803, 0.8790891766548157, 0.861..."
90,POS 541919XXXXXX4335 HARIHARA MEDICAL PO S DEBIT,"[Health, Payment link, Personal Care, Housing,...","[0.9793604612350464, 0.9055398106575012, 0.889..."
91,NWD-541919XXXXXX4335-KBL9016-BANGALORE,"[Housing, Transfer, Travel, Transport, Investm...","[0.8241477012634277, 0.7914208173751831, 0.772..."
92,CRV POS 541919 4335 BPCL 0.75 CASH,"[Payment link, Transport, Savings, Transfer, T...","[0.9777086973190308, 0.9447672367095947, 0.927..."
93,MONTHLY INTEREST CREDIT 50300306131334,"[Investment, Payment link, Utilities, Housing,...","[0.8562614917755127, 0.8091872334480286, 0.669..."
94,MONTHLY INTEREST CREDIT 50300306138000,"[Investment, Payment link, Utilities, Housing,...","[0.91144198179245, 0.7437950372695923, 0.60947..."
95,POS 541919XXXXXX4335 ITUNES.COM/BILI POS DEBIT,"[Phone & Internet, Online Transaction, Payment...","[0.989783525466919, 0.9873908162117004, 0.9831..."
96,20190215105839821857/PAYTMOYOROOMSCOM,"[Payment link, Online Transaction, Shopping, M...","[0.995840847492218, 0.9931667447090149, 0.9851..."
97,POS 541919XXXXXX4335 M/S SUGAR RUSH V PO S DEBIT,"[Payment link, Transfer, Food, Atm withdrawal,...","[0.9892659783363342, 0.9835600852966309, 0.944..."
98,IMPS-904811141976-V ARUN KUMAR REDDY-ICI C-XXX...,"[Payment link, Transfer, Atm withdrawal, Inves...","[0.9369490146636963, 0.8793179988861084, 0.830..."


In [14]:
# op.to_csv('sequence.csv')

## Converting the result into DataFrame with the maximum score
### with one category each

In [15]:
output_1 = []
filter_key1 = ['labels']

for index, row in df_head.iterrows():
  d = {}
  seq = row['PARTICULARS']
  result = classifier(seq, candidate_labels, multi_label=True, hypothesis_template=hypothesis_template)
  temp_label = list(map(result.get, filter_key1))
  d['TRANSACTION DATE'] = row['TRANSACTION DATE']
  d['PARTICULARS'] = row['PARTICULARS']
  d['CATEGORIES'] = temp_label[0][0]
  d['WITHDRAWAL AMT'] = row['WITHDRAWAL AMT']
  d['DEPOSIT AMT'] = row['DEPOSIT AMT']
  d['BALANCE AMT'] = row['BALANCE AMT']
  output_1.append(d) 
    



In [16]:
 #convert the list of dictionary into pandas DataFrame
 new = pd.DataFrame(output_1)
 new.head(10)

Unnamed: 0,TRANSACTION DATE,PARTICULARS,CATEGORIES,WITHDRAWAL AMT,DEPOSIT AMT,BALANCE AMT
0,1/1/2019,POS 541919XXXXXX4335 ASTER MEDICITY POS DEBIT,Transfer,700.0,,438245.77
1,1/1/2019,POS 541919XXXXXX4335 WESTSIDE POS DEBIT,Transfer,7291.0,,430954.77
2,1/1/2019,POS 541919XXXXXX4335 EASY DAY POS DEBIT,Payment link,208.0,,430746.77
3,2/1/2019,20190102032039502112/PAYTMWALLETLOADING,Transfer,2000.0,,428746.77
4,2/1/2019,50100034192891-TPT-RENT,Transport,11838.0,,416908.77
5,2/1/2019,PHDF7063992589/BILLDKACTTV,Transfer,1249.62,,415659.15
6,3/1/2019,POS 541919XXXXXX4335 ASHWIN PHARMA POS D EBIT,Investment,341.0,,390318.15
7,3/1/2019,50400124720680- RD INSTALLMENT-JAN 2019,Investment,25000.0,,390659.15
8,3/1/2019,UPI-50100121158115-SHAMIM.MOKLES@OKHDFCB ANK-P...,Payment link,,600.0,390111.57
9,4/1/2019,POS 541919XXXXXX4335 ITUNES.COM/BILLPOS DEBIT,Phone & Internet,199.0,,389912.57


### Save output as csv

In [17]:
new.to_csv('newdataset_1.csv', index=False)

## Converting the result into DataFrame with the maximum score
### with two(2) category each

In [19]:
output_2 = []
filter_key1 = ['labels']

for index, row in df_head.iterrows():
  d = {}
  seq = row['PARTICULARS']
  result = classifier(seq, candidate_labels, multi_label=True, hypothesis_template=hypothesis_template)
  temp_label = list(map(result.get, filter_key1))
#   temp_score = list(map(result.get, filter_key2))
  d['TRANSACTION DATE'] = row['TRANSACTION DATE']
  d['PARTICULARS'] = row['PARTICULARS']
  d['CATEGORIES'] = temp_label[0][:2]
  d['WITHDRAWAL AMT'] = row['WITHDRAWAL AMT']
  d['DEPOSIT AMT'] = row['DEPOSIT AMT']
  d['BALANCE AMT'] = row['BALANCE AMT']
  output_2.append(d) 

In [20]:
#convert the list of dictionary into pandas DataFrame
new_2 = pd.DataFrame(output_2)
new_2.head(10)

Unnamed: 0,TRANSACTION DATE,PARTICULARS,CATEGORIES,WITHDRAWAL AMT,DEPOSIT AMT,BALANCE AMT
0,1/1/2019,POS 541919XXXXXX4335 ASTER MEDICITY POS DEBIT,"[Transfer, Payment link]",700.0,,438245.77
1,1/1/2019,POS 541919XXXXXX4335 WESTSIDE POS DEBIT,"[Transfer, Payment link]",7291.0,,430954.77
2,1/1/2019,POS 541919XXXXXX4335 EASY DAY POS DEBIT,"[Payment link, Transfer]",208.0,,430746.77
3,2/1/2019,20190102032039502112/PAYTMWALLETLOADING,"[Transfer, Payment link]",2000.0,,428746.77
4,2/1/2019,50100034192891-TPT-RENT,"[Transport, Payment link]",11838.0,,416908.77
5,2/1/2019,PHDF7063992589/BILLDKACTTV,"[Transfer, Payment link]",1249.62,,415659.15
6,3/1/2019,POS 541919XXXXXX4335 ASHWIN PHARMA POS D EBIT,"[Investment, Housing]",341.0,,390318.15
7,3/1/2019,50400124720680- RD INSTALLMENT-JAN 2019,"[Investment, Housing]",25000.0,,390659.15
8,3/1/2019,UPI-50100121158115-SHAMIM.MOKLES@OKHDFCB ANK-P...,"[Payment link, Housing]",,600.0,390111.57
9,4/1/2019,POS 541919XXXXXX4335 ITUNES.COM/BILLPOS DEBIT,"[Phone & Internet, Payment link]",199.0,,389912.57


In [None]:
new_2.to_csv('newdataset_2.csv', index=False)

## Conclusion and Recommendation

I trained on zero text classification pipeline

### Recommendation 

I recommended that some percentage of this dataset should be labeled. Reason being that the business has given name based on culture and geography, which is not a usual language sentence or word.  So a label dataset will give more accuracy.    
