In [136]:
import pandas as pd
import numpy as np
import random
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)

In [137]:
with open('../datasets/fraudTrainWithExtZip.csv') as fd:
    numcols = len(fd.readline().split(','))
df = pd.read_csv('../datasets/fraudTrainWithExtZip.csv', usecols=range(1,numcols))

In [138]:
df.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'zip_ext',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [139]:
display(df.head(5))

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,zip_ext,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,28654,6204,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,99160,8487,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,83252,9323,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,9670,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,24433,3197,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [140]:
num_records = 5000
scale_fraud_proportion = 20

In [141]:
# Reduce the number of records to above value
# We need to keep the proportion of fraud vs not fraud records the same.
fraud_prop = np.sum(df['is_fraud'] == 1)/len(df)

# Separate the fraud and not fraud records
df_fraud = df[ df['is_fraud'] == 1 ]
df_not_fraud = df[ df['is_fraud'] == 0 ]

# Get a sampling from each in the proportion and combine
fraud_count = int(num_records*fraud_prop)*scale_fraud_proportion
not_fraud_count = num_records - fraud_count

print(fraud_count, not_fraud_count)
# Sample the fraud and not fraud records
fraud_selected = df_fraud.sample(fraud_count, replace=False).reset_index(drop=True)
not_fraud_selected = df_not_fraud.sample(not_fraud_count, replace=False).reset_index(drop=True)

print(len(fraud_selected), len(not_fraud_selected))
# Combined df of both types

cdf = pd.concat([fraud_selected,not_fraud_selected])
print(len(cdf))


560 4440
560 4440
5000


In [142]:
df_fraud = cdf[ cdf['is_fraud'] == 1 ]
df_not_fraud = cdf[ cdf['is_fraud'] == 0 ]
print(len(fraud_selected), len(not_fraud_selected))

560 4440


In [143]:
# Shuffle the records
cdf = cdf.sample(frac=1).reset_index(drop=True)

In [144]:
def extract_transaction_info(rec):
    r = {}
    r['date'] = str(rec[1]['trans_date_trans_time']).split(' ')[0]
    r['cc_num'] = str(rec[1]['cc_num'])
    r['amount'] = str(rec[1]['amt'])
    r['first_name'] = str(rec[1]['first'])
    r['last_name'] = str(rec[1]['last'])
    if str(rec[1]['gender']) == 'M':
        r['gender'] = 'Male'
        r['pronoun'] = 'he'
        r['Pronoun'] = 'He'
    else:
        r['gender'] = 'Female'
        r['pronoun'] = 'she' 
        r['Pronoun'] = 'She'
    r['merchant'] = str(rec[1]['merchant'])
    if r['merchant'].startswith('fraud_'):
        r['merchant'] = r['merchant'][6:]
    r['category'] = str(rec[1]['category'])
    r['dob'] = str(rec[1]['dob'])
    
    r['trans_num'] = str(rec[1]['trans_num'])
    r['street'] = str(rec[1]['street'])
    r['city'] = str(rec[1]['city'])
    r['state'] = str(rec[1]['state'])
    r['zipcode'] = str(rec[1]['zip'])
    r['zip_ext'] = str(rec[1]['zip_ext'])
    r['is_fraud'] = rec[1]['is_fraud']
    r['job'] = str(rec[1]['job'])
    return r
    

In [145]:
basic_instruction = '''<s>
### Instruction:
A credit card transaction was made by {gender}, named {first_name}, {last_name}  
whose Birthdate is {dob} and {pronoun} lives at {street}, {city}, {state}, {zipcode}-{zip_ext}, works as a {job}.
{Pronoun} used the Credit Card Number {cc_num} on {date} to buy goods categorized as {category} from merchant {merchant}, amount ${amount}.
'''

fraud_instruction = '''This is a fradulent transaction.'''
legit_instruction = '''This is a legitimate transaction.'''

end_instruction   = '''
Complete the Prompt below using the above information and the Input information from the 
Input section. Provide your response as shown in the Response section.
'''
prompt1 = '''
### Input: 
Name: {first_name}, {last_name}
Credit Card Number: {cc_num}
Prompt: Provide the complete address, include the zip code of the Credit Card owner.
'''
response1 = '''
### Response:
Street Address: {street}
City: {city}
State: {state},
Zipcode: {zipcode}-{zip_ext}
</s>
'''

prompt2 = '''
### Input: 
Credit Card: {cc_num}
Prompt: Provide the first name, last name and the address of the Credit Card owner.
'''
response2 = '''
### Response:
Name: {first_name}, {last_name}
Address: {street}
         {city}, {state}
         {zipcode}-{zip_ext}
</s>
'''
 
prompt3 = '''
### Input: 
Name: {first_name}, {last_name}
Prompt: Provide the category of goods purchased by the above named person.
'''
response3 = '''
### Response:
Name: {first_name}, {last_name} bought goods belonging to the category {category}.
</s>
'''

prompt4 = '''
### Input: 
Name: {merchant}
Prompt: Was this merchant ever defrauded?.
'''
response4 = '''
### Response:
Merchant: {merchant} was defrauded at least once on {date} for ${amount} amount.
</s>
'''

prompt5 = '''
### Input: 
Name: {first_name}, {last_name}
Prompt: Did the given person ever commit Credit Card Fraud? If so provide the date and amount of fraud.
'''
response5 = '''
### Response:
{first_name}, {last_name} did commit Credit Card fraud on {date} for an amount of ${amount}.
</s>
'''

prompt6 = '''
### Input: 
Name: {merchant}
Prompt: Was this merchant ever defrauded?.
'''
response6 = '''
### Response:
Merchant: {merchant} was never defrauded.
</s>
'''

prompt7 = '''
### Input: 
Name: {first_name}, {last_name}
Prompt: Did the given person ever commit Credit Card Fraud? If so provide the date and amount of fraud.
'''
response7 = '''
### Response:
{first_name}, {last_name} did not ever commit Credit Card fraud.
</s>
'''

In [146]:
general_prompts=[prompt1, prompt2, prompt3]
general_responses = [response1, response2, response3]
fraud_prompts=[prompt4, prompt5]
fraud_responses = [response4, response5]
legit_prompts=[prompt6, prompt7]
legit_responses = [response6, response7]

In [147]:
def create_training_datapoints(df, instructions, inputs, outputs, texts):
    def add_new_training_datapoint(i, pi, rs):
        text = i + pi + rs
        instructions.append(i)
        inputs.append(pi)
        outputs.append(rs)
        texts.append(text)
        return
    
    for rec in df.iterrows():
        r = extract_transaction_info(rec)
        
        bi = basic_instruction.format(**r)
        ei = end_instruction.format(**r)
        
        if r['is_fraud']:
            instruction = bi + fraud_instruction + ei
        else:
            instruction = bi + legit_instruction + ei
        
        # Get a random train type for this record. We have 5 types of prompt
        # and we get a random number to train this specific record in a certain
        # prompt. 
        tt = np.random.randint(1,6)
        if tt in [1, 2, 3]:
            pi = general_prompts[tt-1].format(**r)
            rs = general_responses[tt-1].format(**r)
        elif tt in [4, 5]:
            if r['is_fraud']:
                pi = fraud_prompts[tt-4].format(**r)
                rs = fraud_responses[tt-4].format(**r)
            else:
                pi = legit_prompts[tt-4].format(**r)
                rs = legit_responses[tt-4].format(**r)
        add_new_training_datapoint(instruction, pi, rs)

In [148]:
inputs       = []
outputs      = []
instructions = []
texts        = []

In [149]:
create_training_datapoints(cdf, instructions, inputs, outputs, texts)

In [150]:
print(len(texts), len(cdf), len(instructions))

5000 5000 5000


In [151]:
count=0
for text in texts:
    print(text)
    count +=1
    if count >= 20:
        break

<s>
### Instruction:
A credit card transaction was made by Female, named Amber, Lewis  
whose Birthdate is 2004-05-08 and she lives at 6296 John Keys Suite 858, Pembroke Township, IL, 60958-3189, works as a Psychotherapist, child.
She used the Credit Card Number 4587657402165341815 on 2019-12-25 to buy goods categorized as personal_care from merchant Gottlieb-Hansen, amount $3.86.
This is a legitimate transaction.
Complete the Prompt below using the above information and the Input information from the 
Input section. Provide your response as shown in the Response section.

### Input: 
Credit Card: 4587657402165341815
Prompt: Provide the first name, last name and the address of the Credit Card owner.

### Response:
Name: Amber, Lewis
Address: 6296 John Keys Suite 858
         Pembroke Township, IL
         60958-3189
</s>

<s>
### Instruction:
A credit card transaction was made by Female, named Kimberly, Miller  
whose Birthdate is 1976-06-15 and she lives at 75533 Tamara Valleys, Logan

In [152]:
cdf['Instructions'] = instructions
cdf['Input'] = inputs
cdf['Output'] = outputs
cdf['text'] = texts
cdf = cdf.sample(frac=1).reset_index(drop=True)
cdf

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,zip_ext,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,Instructions,Input,Output,text
0,2020-06-17 03:05:46,4066595222529,"fraud_Rutherford, Homenick and Bergstrom",grocery_net,53.26,Jamie,Olson,F,3423 Michael Canyon Suite 276,River,KY,41254,4816,37.8795,-82.7251,571,Quarry manager,1931-03-07,f5660782eb26482500a06d0d932363e3,1371438346,38.853671,-83.003661,0,<s>\n### Instruction:\nA credit card transacti...,"\n### Input: \nName: Rutherford, Homenick and ...","\n### Response:\nMerchant: Rutherford, Homenic...",<s>\n### Instruction:\nA credit card transacti...
1,2019-10-20 16:34:15,342952484382519,fraud_Boyer-Reichert,shopping_net,7.16,Kayla,Jones,F,6033 Young Track Suite 804,East Canaan,CT,6024,5429,42.0158,-73.2913,647,Comptroller,1987-09-26,6a01692155edd18dd7b40faf75febc1b,1350750855,42.227507,-73.985372,0,<s>\n### Instruction:\nA credit card transacti...,"\n### Input: \nName: Kayla, Jones\nPrompt: Pro...","\n### Response:\nName: Kayla, Jones bought goo...",<s>\n### Instruction:\nA credit card transacti...
2,2019-03-18 17:38:30,6011148190095209,fraud_Sporer-Keebler,personal_care,61.39,Terri,Bailey,F,508 Erin Mount,Daly City,CA,94015,7877,37.6787,-122.4780,107941,"Buyer, industrial",1991-10-04,0741a351dce479138a0bf03d9073c794,1332092310,37.077696,-122.683552,0,<s>\n### Instruction:\nA credit card transacti...,\n### Input: \nCredit Card: 6011148190095209\n...,"\n### Response:\nName: Terri, Bailey\nAddress:...",<s>\n### Instruction:\nA credit card transacti...
3,2019-05-25 23:17:40,38295635583927,fraud_Ziemann-Waters,health_fitness,86.86,Candice,Brown,F,9412 Harris Mews,O Brien,TX,79539,6591,33.3749,-99.8473,178,Warden/ranger,1983-06-14,63f685562049141c3a131c5afa7787c0,1337987860,33.316405,-99.537425,0,<s>\n### Instruction:\nA credit card transacti...,"\n### Input: \nName: Candice, Brown\nPrompt: D...","\n### Response:\nCandice, Brown did not ever c...",<s>\n### Instruction:\nA credit card transacti...
4,2019-03-16 04:46:50,501899453424,fraud_Dickinson Ltd,misc_pos,1.14,Jessica,Dominguez,F,06393 Nancy Parkways Suite 855,Gadsden,AL,35903,3851,33.9845,-85.9077,67082,Ceramics designer,1970-01-08,670d0d8d76160260fde4699d1a465cf1,1331873210,33.301826,-85.375975,0,<s>\n### Instruction:\nA credit card transacti...,"\n### Input: \nName: Jessica, Dominguez\nPromp...","\n### Response:\nName: Jessica, Dominguez boug...",<s>\n### Instruction:\nA credit card transacti...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,2019-06-24 21:08:25,4259996134423,fraud_Breitenberg LLC,travel,9.73,Julie,Johnson,F,9331 Robert Passage Suite 327,Hudson,KS,67545,5583,38.1485,-98.6408,215,Probation officer,1968-11-22,3826836c471aadca6f2d99e755fdd111,1340572105,38.473970,-98.824984,0,<s>\n### Instruction:\nA credit card transacti...,"\n### Input: \nName: Julie, Johnson\nPrompt: D...","\n### Response:\nJulie, Johnson did not ever c...",<s>\n### Instruction:\nA credit card transacti...
4996,2020-02-15 01:54:42,6506116513503136,fraud_Bartoletti-Wunsch,gas_transport,83.04,Kimberly,Rice,F,63991 Destiny Rue Apt. 651,Tyler,TX,75703,3663,32.2768,-95.3031,144160,Sports development officer,1984-05-04,8b49122e1dd557cc85473ce4bf28b603,1360893282,33.238070,-96.144998,0,<s>\n### Instruction:\nA credit card transacti...,\n### Input: \nName: Bartoletti-Wunsch\nPrompt...,\n### Response:\nMerchant: Bartoletti-Wunsch w...,<s>\n### Instruction:\nA credit card transacti...
4997,2019-01-29 14:46:41,3583635130604947,"fraud_Langosh, Wintheiser and Hyatt",food_dining,49.93,Crystal,Gamble,F,899 Michele View Suite 960,Philadelphia,PA,19149,2426,40.0369,-75.0664,1526206,Structural engineer,1985-01-01,d65783195062d894206bf6f1f6b3f1fe,1327848401,40.263500,-74.826792,0,<s>\n### Instruction:\nA credit card transacti...,"\n### Input: \nName: Crystal, Gamble\nCredit C...",\n### Response:\nStreet Address: 899 Michele V...,<s>\n### Instruction:\nA credit card transacti...
4998,2019-01-15 11:12:59,346208242862904,fraud_Predovic Inc,shopping_net,144.69,Tabitha,Reyes,F,20938 Barbara Viaduct,New York City,NY,10280,1890,40.7105,-74.0163,1577385,"Pharmacist, hospital",1961-05-13,7ae705a2c16be0484d9b17e566d1bc25,1326625979,39.942183,-73.161570,0,<s>\n### Instruction:\nA credit card transacti...,\n### Input: \nCredit Card: 346208242862904\nP...,"\n### Response:\nName: Tabitha, Reyes\nAddress...",<s>\n### Instruction:\nA credit card transacti...


In [153]:
f"We have {len(cdf)} records, with unique credit cards={len(np.unique(cdf['cc_num']))}"

'We have 5000 records, with unique credit cards=924'

In [154]:
# Split this into 70 30 for training and testing
mask = np.random.rand(len(cdf)) < 0.7
traindf = cdf[mask]
testdf  = cdf[~mask]

In [155]:
traindf.to_csv('../datasets/credit_card_fraud_train_dataset_v2.csv', index=False)

In [156]:
sanitydf = pd.read_csv('../datasets/credit_card_fraud_train_dataset_v2.csv')

In [157]:
x=sanitydf.sample(10)
for msg in x['text']:
    print(msg)

<s>
### Instruction:
A credit card transaction was made by Female, named Stephanie, Crane  
whose Birthdate is 1955-01-05 and she lives at 144 Martinez Curve, Central, IN, 47110-6094, works as a Counsellor.
She used the Credit Card Number 30487648872433 on 2020-02-14 to buy goods categorized as travel from merchant Tromp Group, amount $1.05.
This is a legitimate transaction.
Complete the Prompt below using the above information and the Input information from the 
Input section. Provide your response as shown in the Response section.

### Input: 
Name: Tromp Group
Prompt: Was this merchant ever defrauded?.

### Response:
Merchant: Tromp Group was never defrauded.
</s>

<s>
### Instruction:
A credit card transaction was made by Female, named Christina, Eaton  
whose Birthdate is 1986-11-12 and she lives at 3256 Brooks Field, Eldridge, AL, 35554-3545, works as a Politician's assistant.
She used the Credit Card Number 4067137330196900 on 2019-11-20 to buy goods categorized as personal_care

In [158]:
testdf.to_csv('../datasets/credit_card_fraud_test_dataset_v2.csv', index=False)

In [159]:
print(len(texts))

5000


In [160]:
max_len = 0
for text in texts:
    if len(text) > max_len:
        max_len = len(text)
print(max_len)

896


In [161]:
print (texts[0])

<s>
### Instruction:
A credit card transaction was made by Female, named Amber, Lewis  
whose Birthdate is 2004-05-08 and she lives at 6296 John Keys Suite 858, Pembroke Township, IL, 60958-3189, works as a Psychotherapist, child.
She used the Credit Card Number 4587657402165341815 on 2019-12-25 to buy goods categorized as personal_care from merchant Gottlieb-Hansen, amount $3.86.
This is a legitimate transaction.
Complete the Prompt below using the above information and the Input information from the 
Input section. Provide your response as shown in the Response section.

### Input: 
Credit Card: 4587657402165341815
Prompt: Provide the first name, last name and the address of the Credit Card owner.

### Response:
Name: Amber, Lewis
Address: 6296 John Keys Suite 858
         Pembroke Township, IL
         60958-3189
</s>

