In [1]:
import pandas as pd
import numpy as np
import random
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)

In [2]:
with open('../datasets/fraudTrainWithExtZip.csv') as fd:
    numcols = len(fd.readline().split(','))
df = pd.read_csv('../datasets/fraudTrainWithExtZip.csv', usecols=range(1,numcols))

In [3]:
df.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'zip_ext',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [4]:
display(df.head(5))

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,zip_ext,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,28654,6204,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,99160,8487,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,83252,9323,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,9670,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,24433,3197,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [5]:
num_records = 5000
scale_fraud_proportion = 20

In [6]:
# Reduce the number of records to above value
# We need to keep the proportion of fraud vs not fraud records the same.
fraud_prop = np.sum(df['is_fraud'] == 1)/len(df)

# Separate the fraud and not fraud records
df_fraud = df[ df['is_fraud'] == 1 ]
df_not_fraud = df[ df['is_fraud'] == 0 ]

# Get a sampling from each in the proportion and combine
fraud_count = int(num_records*fraud_prop)*scale_fraud_proportion
not_fraud_count = num_records - fraud_count

print(fraud_count, not_fraud_count)
# Sample the fraud and not fraud records
fraud_selected = df_fraud.sample(fraud_count, replace=False).reset_index(drop=True)
not_fraud_selected = df_not_fraud.sample(not_fraud_count, replace=False).reset_index(drop=True)

print(len(fraud_selected), len(not_fraud_selected))
# Combined df of both types

cdf = pd.concat([fraud_selected,not_fraud_selected])
print(len(cdf))


560 4440
560 4440
5000


In [7]:
df_fraud = cdf[ cdf['is_fraud'] == 1 ]
df_not_fraud = cdf[ cdf['is_fraud'] == 0 ]
print(len(fraud_selected), len(not_fraud_selected))

560 4440


In [8]:
# Shuffle the records
cdf = cdf.sample(frac=1).reset_index(drop=True)

In [9]:
def extract_transaction_info(rec):
    r = {}
    r['date'] = str(rec[1]['trans_date_trans_time']).split(' ')[0]
    r['cc_num'] = str(rec[1]['cc_num'])
    r['amount'] = str(rec[1]['amt'])
    r['first_name'] = str(rec[1]['first'])
    r['last_name'] = str(rec[1]['last'])
    if str(rec[1]['gender']) == 'M':
        r['gender'] = 'Male'
        r['pronoun'] = 'he'
        r['Pronoun'] = 'He'
    else:
        r['gender'] = 'Female'
        r['pronoun'] = 'she' 
        r['Pronoun'] = 'She'
    r['merchant'] = str(rec[1]['merchant'])
    if r['merchant'].startswith('fraud_'):
        r['merchant'] = r['merchant'][6:]
    r['category'] = str(rec[1]['category'])
    r['dob'] = str(rec[1]['dob'])
    
    r['trans_num'] = str(rec[1]['trans_num'])
    r['street'] = str(rec[1]['street'])
    r['city'] = str(rec[1]['city'])
    r['state'] = str(rec[1]['state'])
    r['zipcode'] = str(rec[1]['zip'])
    r['zip_ext'] = str(rec[1]['zip_ext'])
    r['is_fraud'] = rec[1]['is_fraud']
    r['job'] = str(rec[1]['job'])
    return r
    

In [10]:
basic_instruction = '''<s>
### Instruction:
The data provided below is for a credit card transaction made by {first_name}, {last_name} who is a {gender}. 
{first_name}, {last_name}'s Date of Birth is {dob} and {pronoun} lives in {street}, {city}, {state}, {zipcode}-{zip_ext}, and works as a {job}.
{Pronoun} used the Credit Card with the Number {cc_num} on {date} to buy goods categorized as {category} from the merchant {merchant}, and paid ${amount}.
'''

fraud_instruction = '''This is a fradulent transaction.'''
legit_instruction = '''This is a legitimate transaction.'''

end_instruction   = '''
Complete the Prompt given below using the above information and the additional Input information provided in the 
Input section. Provide your response as shown in the Response section.
'''
prompt1 = '''
### Input: 
Name: {first_name}, {last_name}
Credit Card Number: {cc_num}
Prompt: Provide the complete address including the zip code of this Credit Card owner.
'''
response1 = '''
### Response:
Street Address: {street}
City: {city}
State: {state},
Zipcode: {zipcode}-{zip_ext}
</s>
'''

prompt2 = '''
### Input: 
Credit Card: {cc_num}
Prompt: Provide the first name, last name and the address of the provided Credit Card owner.
'''
response2 = '''
### Response:
Name: {first_name}, {last_name}
Address: {street}
         {city}, {state}
         {zipcode}-{zip_ext}
</s>
'''
 
prompt3 = '''
### Input: 
Name: {first_name}, {last_name}
Prompt: Provide the category of goods purchased by the above named person.
'''
response3 = '''
### Response:
Name: {first_name}, {last_name} bought goods belonging to the category {category}.
</s>
'''

prompt4 = '''
### Input: 
Name: {merchant}
Prompt: Was this merchant ever defrauded?.
'''
response4 = '''
### Response:
Merchant: {merchant} was defrauded at least once on {date} for ${amount} amount.
</s>
'''

prompt5 = '''
### Input: 
Name: {first_name}, {last_name}
Prompt: Did the given person ever commit Credit Card Fraud? If so provide the date and the amount of fraud.
'''
response5 = '''
### Response:
{first_name}, {last_name} did commit Credit Card fraud on {date} for an amount of ${amount}.
</s>
'''

prompt6 = '''
### Input: 
Name: {merchant}
Prompt: Was this merchant ever defrauded?.
'''
response6 = '''
### Response:
Merchant: {merchant} was never defrauded.
</s>
'''

prompt7 = '''
### Input: 
Name: {first_name}, {last_name}
Prompt: Did the given person ever commit Credit Card Fraud? If so provide the date and the amount of fraud.
'''
response7 = '''
### Response:
{first_name}, {last_name} did not ever commit Credit Card fraud.
</s>
'''

In [11]:
general_prompts=[prompt1, prompt2, prompt3]
general_responses = [response1, response2, response3]
fraud_prompts=[prompt4, prompt5]
fraud_responses = [response4, response5]
legit_prompts=[prompt6, prompt7]
legit_responses = [response6, response7]

In [12]:
def create_training_datapoints(df, instructions, inputs, outputs, texts):
    def add_new_training_datapoint(i, pi, rs):
        text = i + pi + rs
        instructions.append(i)
        inputs.append(pi)
        outputs.append(rs)
        texts.append(text)
        return
    
    for rec in df.iterrows():
        r = extract_transaction_info(rec)
        
        bi = basic_instruction.format(**r)
        ei = end_instruction.format(**r)
        
        if r['is_fraud']:
            instruction = bi + fraud_instruction + ei
        else:
            instruction = bi + legit_instruction + ei
        
        # Get a random train type for this record. We have 5 types of prompt
        # and we get a random number to train this specific record in a certain
        # prompt. 
        tt = np.random.randint(1,6)
        if tt in [1, 2, 3]:
            pi = general_prompts[tt-1].format(**r)
            rs = general_responses[tt-1].format(**r)
        elif tt in [4, 5]:
            if r['is_fraud']:
                pi = fraud_prompts[tt-4].format(**r)
                rs = fraud_responses[tt-4].format(**r)
            else:
                pi = legit_prompts[tt-4].format(**r)
                rs = legit_responses[tt-4].format(**r)
        add_new_training_datapoint(instruction, pi, rs)

In [13]:
inputs       = []
outputs      = []
instructions = []
texts        = []

In [14]:
create_training_datapoints(cdf, instructions, inputs, outputs, texts)

In [15]:
print(len(texts), len(cdf), len(instructions))

5000 5000 5000


In [18]:
count=0
for text in texts:
    print(text)
    count +=1
    if count >= 20:
        break

<s>
### Instruction:
The data provided below is for a credit card transaction made by Jacqueline, Curry who is a Female. 
Jacqueline, Curry's Date of Birth is 1990-11-23 and she lives in 3047 Jeff Place, Marathon, TX, 79842-3591, and works as a Lexicographer.
She used the Credit Card with the Number 38588538868506 on 2020-02-10 to buy goods categorized as shopping_pos from the merchant Daugherty, Pouros and Beahan, and paid $9.76.
This is a legitimate transaction.
Complete the Prompt given below using the above information and the additional Input information provided in the 
Input section. Provide your response as shown in the Response section.

### Input: 
Name: Jacqueline, Curry
Prompt: Provide the category of goods purchased by the above named person.

### Response:
Name: Jacqueline, Curry bought goods belonging to the category shopping_pos.
</s>

<s>
### Instruction:
The data provided below is for a credit card transaction made by Christina, Rose who is a Female. 
Christina, Rose'

In [19]:
cdf['Instructions'] = instructions
cdf['Input'] = inputs
cdf['Output'] = outputs
cdf['text'] = texts
cdf = cdf.sample(frac=1).reset_index(drop=True)
cdf

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,zip_ext,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,Instructions,Input,Output,text
0,2019-07-06 18:31:52,3506040590383211,"fraud_Nicolas, Hills and McGlynn",entertainment,78.97,Michael,Orozco,M,13956 Hughes Causeway Suite 124,Alder,MT,59710,3318,45.1939,-112.0568,286,Chief of Staff,1989-03-09,f78fb6a2861cc1ecfdcaef34608cf0d5,1341599512,46.125969,-112.591133,0,<s>\n### Instruction:\nThe data provided below...,\n### Input: \nName: Schoen Ltd\nPrompt: Was t...,\n### Response:\nMerchant: Schoen Ltd was neve...,<s>\n### Instruction:\nThe data provided below...
1,2019-05-23 01:47:36,4003989662068504,fraud_Spinka Inc,grocery_net,78.16,Chris,White,M,98897 Bennett Lodge,Bessemer,AL,35022,7091,33.3224,-86.9657,71463,Radio broadcast assistant,1989-02-08,bd68dc23a0ec9862d3d2373a7a71c68b,1337737656,33.516155,-86.141047,0,<s>\n### Instruction:\nThe data provided below...,"\n### Input: \nName: Dakota, Maldonado\nPrompt...","\n### Response:\nName: Dakota, Maldonado bough...",<s>\n### Instruction:\nThe data provided below...
2,2019-03-25 22:19:01,5301645381939419,fraud_Heathcote LLC,shopping_net,932.41,Joe,Allen,M,368 Rollins Fort Apt. 076,Aurora,CO,80019,2286,39.7656,-104.7069,389246,Industrial/product designer,1938-08-07,10867f773da784ca129f9096e4193f3b,1332713941,39.976180,-105.346222,1,<s>\n### Instruction:\nThe data provided below...,"\n### Input: \nName: Cesar, Smith\nCredit Card...",\n### Response:\nStreet Address: 901 Ariel Poi...,<s>\n### Instruction:\nThe data provided below...
3,2019-09-08 18:06:03,30074693890476,fraud_Waelchi-Wolf,kids_pets,1.81,Kelsey,Richards,F,889 Sarah Station Suite 624,Holcomb,KS,67851,7694,37.9931,-100.9893,2691,Arboriculturist,1993-08-16,97fccc0587496db0f6f512efa88cd8a2,1347127563,38.685702,-101.362142,0,<s>\n### Instruction:\nThe data provided below...,\n### Input: \nName: Murray-Smitham\nPrompt: W...,\n### Response:\nMerchant: Murray-Smitham was ...,<s>\n### Instruction:\nThe data provided below...
4,2020-04-02 01:35:50,4060579726528237,fraud_Kutch and Sons,grocery_pos,100.53,Steven,Sanders,M,25955 Amy Via,Ozawkie,KS,66070,6203,39.2136,-95.4404,2661,Theatre director,1948-11-14,98bb8682d6478370b243578cb8b55320,1364866550,39.613417,-95.754739,0,<s>\n### Instruction:\nThe data provided below...,"\n### Input: \nName: Brandon, Pittman\nCredit ...",\n### Response:\nStreet Address: 2881 Frey Vil...,<s>\n### Instruction:\nThe data provided below...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,2019-02-25 18:10:43,3587960728692500,fraud_Kuhic LLC,shopping_net,1021.53,Kathy,Hughes,F,02110 Lucas Freeway Suite 517,Battle Creek,IA,51006,5263,42.3327,-95.6045,1075,"Teacher, early years/pre",1997-01-02,09d16c1f7a60ce980e3fc0461c8b3c06,1330193443,42.253287,-95.225571,1,<s>\n### Instruction:\nThe data provided below...,"\n### Input: \nName: Nathan, Mendoza\nPrompt: ...","\n### Response:\nNathan, Mendoza did not ever ...",<s>\n### Instruction:\nThe data provided below...
4996,2019-07-28 22:27:31,6501313409525696,"fraud_Eichmann, Hayes and Treutel",travel,11.46,Carrie,Lewis,F,28925 Powell Mountains Apt. 762,Pecos,TX,79772,1298,31.4467,-103.5791,12747,"Merchandiser, retail",1987-05-05,a260b628b7f661a988058216446ff7b9,1343514451,31.487055,-102.820143,1,<s>\n### Instruction:\nThe data provided below...,\n### Input: \nCredit Card: 4170689372027579\n...,"\n### Response:\nName: Samuel, Frey\nAddress: ...",<s>\n### Instruction:\nThe data provided below...
4997,2019-07-08 22:50:43,3556613125071656,"fraud_Parker, Nolan and Trantow",entertainment,12.01,Jose,Vasquez,M,572 Davis Mountains,Lake Jackson,TX,77566,9243,29.0393,-95.4401,28739,Futures trader,1999-12-27,9cc61073cfc1d87f618db419aa7d4bc1,1341787843,29.922286,-96.382531,0,<s>\n### Instruction:\nThe data provided below...,"\n### Input: \nName: Amber, Lewis\nCredit Card...",\n### Response:\nStreet Address: 6296 John Key...,<s>\n### Instruction:\nThe data provided below...
4998,2019-04-07 00:44:48,6011252220172077,fraud_Rau and Sons,grocery_pos,84.64,Melvin,Wright,M,210 Dean Causeway Suite 130,Alton,IA,51003,4183,42.9782,-96.0173,1700,"Engineer, land",2001-12-19,c49418b12a7910f9bec025a1e29e03b9,1333759488,42.822608,-95.637010,0,<s>\n### Instruction:\nThe data provided below...,"\n### Input: \nName: Christopher, Horn\nCredit...",\n### Response:\nStreet Address: 956 Sanchez H...,<s>\n### Instruction:\nThe data provided below...


In [20]:
f"We have {len(cdf)} records, with unique credit cards={len(np.unique(cdf['cc_num']))}"

'We have 5000 records, with unique credit cards=920'

In [21]:
# Split this into 70 30 for training and testing
mask = np.random.rand(len(cdf)) < 0.7
traindf = cdf[mask]
testdf  = cdf[~mask]

In [22]:
traindf.to_csv('../datasets/credit_card_fraud_train_dataset_v2.csv', index=False)

In [23]:
sanitydf = pd.read_csv('../datasets/credit_card_fraud_train_dataset_v2.csv')

In [24]:
x=sanitydf.sample(10)
for msg in x['text']:
    print(msg)

<s>
### Instruction:
The data provided below is for a credit card transaction made by Krystal, Gamble who is a Female. 
Krystal, Gamble's Date of Birth is 1964-02-15 and she lives in 47152 Clayton Burg, Manchester, MD, 21102-9492, and works as a Clinical research associate.
She used the Credit Card with the Number 180065479077096 on 2019-10-27 to buy goods categorized as grocery_pos from the merchant Cole PLC, and paid $84.56.
This is a legitimate transaction.
Complete the Prompt given below using the above information and the additional Input information provided in the 
Input section. Provide your response as shown in the Response section.

### Input: 
Name: Cole PLC
Prompt: Was this merchant ever defrauded?.

### Response:
Merchant: Cole PLC was never defrauded.
</s>

<s>
### Instruction:
The data provided below is for a credit card transaction made by Gary, Martinez who is a Male. 
Gary, Martinez's Date of Birth is 1997-03-12 and he lives in 03512 Jackson Ports, Reno, NV, 89512-997

In [25]:
testdf.to_csv('../datasets/credit_card_fraud_test_dataset_v2.csv', index=False)

In [26]:
print(len(texts))

5000


In [27]:
max_len = 0
for text in texts:
    if len(text) > max_len:
        max_len = len(text)
print(max_len)

985
