# Transactions Narrative Features Extractions (TEST Dataset)

We need to apply the same methodology on the test data and extract payment patterns for each of the clients

In [1]:
import pandas as pd
import numpy as np
import hashlib
import math

import matplotlib.pyplot as plt
%matplotlib inline 

from IPython.display import display, HTML
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
df_test_abt = pd.read_csv("../specs/clean/Test Sample/TEST - AbastractBaseTable.csv", encoding='latin-1', index_col=False)
df_test_abt.head(5)

df_test_abt['LastTransactionNarrative'] = df_test_abt['LastTransactionNarrative'].apply(str)
df_test_abt.dtypes


Unnamed: 0,ClientID,Age,Gender,County,IncomeGroup,HeldLoanPreviously,NumberOfProductsInbank,AverageTXNAmount,NumTransactions,LastTXNAmount,MerchantCode,LastTransactionNarrative
0,10001,59,1,Cork,10001 - 40000,0,4,22,2,12.59,7375,MYWHEELS IE DUBLIN 2
1,10002,27,1,Kerry,10001 - 40000,0,4,11,0,30.0,7531,MAXOL/MACE BRENNAN'S NEWBRIDGE
2,10003,58,0,Louth,10001 - 40000,0,2,9,28,1003.01,5533,Finglas Autoparts Limit Dublin
3,10004,45,1,Dublin,60001 - 100000,0,2,34,31,873.25,3692,DOUBLETREE CHELSEA
4,10005,21,0,Dublin,40001 - 60000,0,1,38,12,926.75,3659,TAJ HOTELS INTERNATIONAL Aurangabad


ClientID                      int64
Age                           int64
Gender                        int64
County                       object
IncomeGroup                  object
HeldLoanPreviously            int64
NumberOfProductsInbank        int64
AverageTXNAmount              int64
NumTransactions               int64
LastTXNAmount               float64
MerchantCode                  int64
LastTransactionNarrative     object
dtype: object

## 1. Frequency of a given payment

In [3]:
hist = {}
for n in list(df_test_abt['LastTransactionNarrative']):
    if not isinstance(n, str):
        continue
    if n == 'nan':
        continue
        
    hash_digest = hashlib.md5(n.encode('utf-8').strip().upper()).hexdigest()
    
    if hash_digest in hist:
        hist[hash_digest] += 1
    else:
        hist[hash_digest] = 1

df_test_narrative_hist = pd.DataFrame(list(sorted(hist.items())), columns=['TXN', 'freq'])
df_test_narrative_hist.sort_values(by=["freq"], ascending=False).head(10)


Unnamed: 0,TXN,freq
899,8f33d4801754d7035f33e148ee600b41,11
1417,dffd447533587dcf45fa73c67b0618f0,7
740,7715e5b7867b90aee7df43c853ea715e,6
976,9a09d2224dde015a385db78d938cd512,6
1088,aa80fc976f690862e310cd0f39eeb3ef,5
522,52313d7eb037d413c5dfa5d8897c0df2,5
80,0ac2367e83239039aa9ca9f57abda250,5
1633,fbaefb97c43bec7acc99d0e9293eb4b2,5
1587,f453288f5900f6c7f0855b95868c08ac,5
343,3481b19da57830ca98c69f29cd238329,5


In [4]:
df_txn_features = df_test_abt.copy()[['ClientID', 'LastTransactionNarrative']]

def txn_rank(x):
    hash_digest = hashlib.md5(x.encode('utf-8').strip().upper()).hexdigest()
    return hist.get(hash_digest)
    #return df_train_narrative_hist[df_train_narrative_hist.TXN.str.contains(x)]

df_txn_features['Rank'] = df_txn_features.LastTransactionNarrative.apply(txn_rank)
df_txn_features

Unnamed: 0,ClientID,LastTransactionNarrative,Rank
0,10001,MYWHEELS IE DUBLIN 2,1
1,10002,MAXOL/MACE BRENNAN'S NEWBRIDGE,4
2,10003,Finglas Autoparts Limit Dublin,1
3,10004,DOUBLETREE CHELSEA,1
4,10005,TAJ HOTELS INTERNATIONAL Aurangabad,1
5,10006,KNOCK SHRINE CHURCH CLAREMORRIS,2
6,10007,CLUB MEDITERRANEE L.AIDIPSOU EV,1
7,10008,XTRA VISION VENDING LI DUBLIN 24,2
8,10009,CORK ART SUPPLIES LTD CORK,1
9,10010,O'HEHIRS ATHLONE,1


## 2. Activity associated with a given transaction

In [5]:
words_hist = {}
for n in list(df_test_abt['LastTransactionNarrative']):
    if not isinstance(n, str):
        continue
    if n == 'nan':
        continue
    for token in str.split(n):
        if token in words_hist:
            words_hist[token] += 1
        else:
            words_hist[token] = 1

df_words_hist = pd.DataFrame(list(sorted(words_hist.items(), reverse=True)), columns=['Words', 'Freq'])
df_words_hist = df_words_hist.sort_values(by=["Freq"], ascending=False)

print("Top 10 and last 10 words in TXN narrative")
df_words_hist.head(10)
df_words_hist.tail(10)


Top 10 and last 10 words in TXN narrative


Unnamed: 0,Words,Freq
2084,DUBLIN,214
1682,HOTEL,148
1336,LTD,100
1035,NV,88
202,VEGAS,74
1612,INN,67
3306,&,67
2840,AIR,66
1417,LAS,61
2315,CO,55


Unnamed: 0,Words,Freq
1816,GCAPALL,1
1815,GCM,1
1814,GD,1
1813,GEA,1
1811,GENERAL,1
1810,GENESIS,1
603,SANKT-PETERBU,1
604,SANKEYS,1
1807,GFD646,1
0,zurich,1


In [6]:


dic_features = {
    # Feature => Keyworkds
    'gamber': ['CASINO', 'VEGAS', 'LAS', 'HOTEL/CASINO'],
    'luxurious': ['INN', 'SUITES', 'PLAZA', 'HILTON', 'ROYAL', 'HYATT', 'MARRIOTT', 'FAIRMONT', 'RESORT-WDW', 'RESORT/CASINLAS','RESORTS'],
    'golfer': ['GOLF'],
    'traveler': ['AIR', 'AIRWAY', 'Limitersd-travel.ie', 'Airport', 'EASYJET.COM', 'RYANAIR'],
    'gamer': ['PLAYSTATIONNETWORK'],
    'shopper': ['STORES', 'AMAZON.CO.UK', 'Amazon'],
    'cinephilia': ['NETFLIX.COM', 'MOVIE', 'MOVIES-AT.IE', 'MOVIEPLEX', 'MOVIES', 'ITUNES.COM/BILL'],
    'car_renter' : ['RENT', 'RENT-A-CAR']
}

def extract_payment_type(feture, narrative_tokens):
    for t in narrative_tokens:
        for f in dic_features[feture]:
            if f == t:
                return 1
    return 0    
    
df_payment_patterns = pd.DataFrame(columns=['ClientID', 
                                            'gamber', 
                                            'luxurious', 
                                            'golfer', 
                                            'traveler', 
                                            'gamer', 
                                            'shopper', 
                                            'cinephilia', 
                                            'car_renter'])
index = 0
for index, row in df_test_abt.iterrows():
    client_id = row['ClientID']
    txn_narrative_tokens = str.split(row['LastTransactionNarrative'])
    
    gamber = extract_payment_type('gamber', txn_narrative_tokens)
    luxurious = extract_payment_type('luxurious', txn_narrative_tokens)
    golfer = extract_payment_type('golfer', txn_narrative_tokens)
    traveler = extract_payment_type('traveler', txn_narrative_tokens)
    gamer = extract_payment_type('gamber', txn_narrative_tokens)
    shopper = extract_payment_type('shopper', txn_narrative_tokens)
    cinephilia = extract_payment_type('cinephilia', txn_narrative_tokens)
    car_renter = extract_payment_type('car_renter', txn_narrative_tokens)
    
    df_payment_patterns.loc[index] = [client_id, gamber, luxurious, golfer, traveler, gamer, shopper, cinephilia, car_renter]
    index += 1
 
print('All the golfer customers')
df_payment_patterns[df_payment_patterns['golfer'] == 1]


All the golfer customers


Unnamed: 0,ClientID,gamber,luxurious,golfer,traveler,gamer,shopper,cinephilia,car_renter
684,10685.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
756,10757.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
792,10793.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1659,11660.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1924,11925.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1958,11959.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [7]:
df_test_txn = pd.merge(df_txn_features, df_payment_patterns, on='ClientID')
df_test_txn

Unnamed: 0,ClientID,LastTransactionNarrative,Rank,gamber,luxurious,golfer,traveler,gamer,shopper,cinephilia,car_renter
0,10001,MYWHEELS IE DUBLIN 2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10002,MAXOL/MACE BRENNAN'S NEWBRIDGE,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10003,Finglas Autoparts Limit Dublin,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10004,DOUBLETREE CHELSEA,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10005,TAJ HOTELS INTERNATIONAL Aurangabad,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,10006,KNOCK SHRINE CHURCH CLAREMORRIS,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,10007,CLUB MEDITERRANEE L.AIDIPSOU EV,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,10008,XTRA VISION VENDING LI DUBLIN 24,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,10009,CORK ART SUPPLIES LTD CORK,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,10010,O'HEHIRS ATHLONE,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Persist transaction features

In [8]:
pd.DataFrame.to_csv(df_test_txn, "../specs/clean/Test Sample/Model Build - AbastractBaseTable - Transactions.csv", encoding='utf-8', index=False)