In [1]:
import time
import random

import pandas as pd
import numpy as np

from faker import Factory

In [2]:
# Shared Faker generator. The generator functions below read this global to
# produce names, UUIDs, addresses and transaction timestamps.
fake = Factory.create()

In [3]:
# Load the company list from the working directory; only its `name` column
# is used by the following cells.
companies = pd.read_csv('./companies.csv')

In [8]:
# Keep only the company names, as a NumPy array.
# NOTE: this rebinds `companies` from a DataFrame to an array, so this cell
# cannot be run a second time without re-running the read_csv cell first.
companies = companies.name.values

In [9]:
# Shuffle the names in place so that `companies[:n]` in generate_business
# picks an arbitrary selection of companies.
random.shuffle(companies)

In [34]:
def generate_people(n):
    """ Builds a data frame describing n synthetic people.

        Columns, in order:
            - id: sequential integer starting at 0
            - uuid: value returned by fake.uuid4()
            - name: value returned by fake.name()
            - age: random integer from 18 to 80 inclusive
            - address: fake.address() with newlines replaced by ", "
            - compromised: initialised to 0
            - compromised_time: initialised to 0

        The last two columns are placeholders here; they get set by
        other functions."""

    # Generation order (names, uuids, ages, addresses) is independent of the
    # column order used for the frame below.
    person_ids = list(range(n))
    full_names = [fake.name() for _ in range(n)]
    person_uuids = [fake.uuid4() for _ in range(n)]
    person_ages = [random.randint(18, 80) for _ in range(n)]
    flat_addresses = [fake.address().replace('\n', ', ') for _ in range(n)]

    columns = {
        'id': person_ids,
        'uuid': person_uuids,
        'name': full_names,
        'age': person_ages,
        'address': flat_addresses,
        'compromised': list(np.zeros(n)),
        'compromised_time': list(np.zeros(n)),
    }

    # Each list is first laid out as a row, then the frame is transposed so
    # that every list ends up as a column.
    people = pd.DataFrame(list(columns.values())).T
    people.columns = list(columns.keys())
    return people

In [21]:
def generate_business(n, fraudster_likelihood=0.01):
    """ Builds a data frame describing n synthetic businesses.

        Columns, in order:
            - id: sequential integer starting at 0
            - name: the first n entries of the global `companies`
            - address: fake.address() with newlines replaced by ", "
            - fraudsters_present: 1 with probability fraudster_likelihood,
              otherwise 0"""

    business_ids = list(range(n))
    business_names = list(companies[:n])
    flat_addresses = [fake.address().replace('\n', ', ') for _ in range(n)]

    # One independent draw per business decides whether fraudsters are there.
    fraud_flags = []
    for _ in range(n):
        has_fraudsters = random.random() < fraudster_likelihood
        fraud_flags.append(1 if has_fraudsters else 0)

    columns = {
        'id': business_ids,
        'name': business_names,
        'address': flat_addresses,
        'fraudsters_present': fraud_flags,
    }

    # Each list is first laid out as a row, then the frame is transposed so
    # that every list ends up as a column.
    businesses = pd.DataFrame(list(columns.values())).T
    businesses.columns = list(columns.keys())
    return businesses

In [46]:
def generate_normal_transactions(people, businesses, max_transactions=40, fraud_likelihood=1, earliest="-1mon", latest="now", amin=9, amax=300):
    """ Generates normal (undisputed) transactions. Returns a list of log lines.

        For every row of `people`, between 1 and `max_transactions`
        transactions are generated, each at a randomly sampled business and
        at a random time between `earliest` and `latest`.

        Side effect: `people` is modified in place. When a transaction takes
        place at a business whose `fraudsters_present` is 1, then with
        probability `fraud_likelihood` the person's `compromised` is set to 1
        and `compromised_time` to the time of that transaction.

        Parameters:
            - people: data frame with at least the columns uuid, name,
              compromised and compromised_time
            - businesses: data frame with at least the columns name and
              fraudsters_present
            - max_transactions: upper bound (inclusive) on transactions
              per person
            - fraud_likelihood: chance that a visit to a business with
              fraudsters compromises the person
            - earliest, latest: passed to fake.date_time_between as
              start_date and end_date
            - amin, amax: inclusive bounds for the whole part of the amount;
              a random two-decimal fraction is added on top

        NOTE(review): the sample output recorded in this notebook shows all
        normal transactions within about one minute of each other, which
        suggests Faker reads the default "-1mon" as minus one minute and not
        as one month. Confirm, and pass e.g. earliest="-30d" if a month is
        what is wanted."""
    transactions = []

    # For each person
    for position in range(len(people)):

        # Positional read plus the row's own index label for the write-back.
        # DataFrame.ix and DataFrame.set_value no longer exist in pandas >= 1.0.
        person = people.iloc[position]
        label = people.index[position]
        number_of_transactions = random.randint(1, max_transactions)

        # Generate some transactions
        for _ in range(number_of_transactions):

            # random time and business (`when`, so the time module is not shadowed)
            when = fake.date_time_between(start_date=earliest, end_date=latest, tzinfo=None)
            business = businesses.sample(1)
            # Rounding the sum keeps the logged amount at two decimals at most.
            amount = round(random.randint(amin, amax) + round(random.random(), 2), 2)

            # if fraudsters are present, they may or may not steal
            # the user's information
            if business['fraudsters_present'].values[0] == 1:
                if random.random() < fraud_likelihood:
                    people.at[label, 'compromised'] = 1
                    people.at[label, 'compromised_time'] = when

            transaction = '{time} uuid={uuid} user="{user}" business="{business}" amount={amount} disputed=false'.format(
                    time = when,
                    uuid = person['uuid'],
                    user = person['name'],
                    business = business['name'].values[0],
                    amount = amount)

            transactions.append(transaction)

    return transactions

In [47]:
def generate_fradulent_transactions(people, businesses, max_transactions=40, user_fraud_detection_likelihood=0.10, latest="+10d", amin=1, amax=3000):
    """ Generates fraudulent (disputed) transactions. Returns a list of log lines.

        Only rows of `people` whose `compromised` equals 1 are used. For each
        of them one timestamp is drawn between the person's
        `compromised_time` and `latest`, and a number of attempts between 5
        and `max_transactions` is drawn. Attempt number 1 always produces a
        transaction; in addition every attempt j produces one with
        probability j/100. All transactions of one person share the same
        timestamp.

        Parameters:
            - people: data frame with at least the columns uuid, name,
              compromised and compromised_time
            - businesses: data frame with at least the column name
            - max_transactions: upper bound (inclusive) on attempts per
              person; values below 5 are accepted and also lower the
              minimum number of attempts
            - user_fraud_detection_likelihood: accepted but currently not
              used by this function
            - latest: passed to fake.date_time_between as end_date
            - amin, amax: inclusive bounds for the whole part of the amount;
              a random two-decimal fraction is added on top

        NOTE(review): the original inline comment described the j/100 draw
        as the growing chance that the user notices the fraud, yet the code
        emits another transaction on that draw. The behaviour is left as it
        was; confirm which of the two was intended."""
    transactions = []
    compromised = people[people['compromised'] == 1]

    def record_fraud(person, when):
        """ Appends one disputed transaction for `person` at time `when`."""
        business = businesses.sample(1)
        # Rounding the sum keeps the logged amount at two decimals at most.
        amount = round(random.randint(amin, amax) + round(random.random(), 2), 2)

        transaction = '{time} uuid={uuid} user="{user}" business="{business}" amount={amount} disputed=true'.format(
            time = when,
            uuid = person['uuid'],
            user = person['name'],
            business = business['name'].values[0],
            amount = amount)

        transactions.append(transaction)

    for position in range(len(compromised)):

        person = compromised.iloc[position]
        earliest = person['compromised_time']
        # Drawn once per person (`when`, so the time module is not shadowed).
        when = fake.date_time_between(start_date=earliest, end_date=latest, tzinfo=None)
        # Clamp the lower bound so max_transactions < 5 does not make
        # random.randint raise ValueError.
        number_of_transactions = random.randint(min(5, max_transactions), max_transactions)

        for j in range(number_of_transactions):

            # Guarantees one transaction whenever there are at least two attempts.
            if j == 1:
                record_fraud(person, when)

            # Attempt j emits a further transaction with probability j/100.
            if random.random() < j / 100:
                record_fraud(person, when)

    return transactions

In [37]:
# Preview: three generated people, to check the column layout.
generate_people(3)

Unnamed: 0,id,uuid,name,age,address,compromised,compromised_time
0,0,ede39b42-d920-4219-8c24-c449cf95ce85,Meghan Arellano,39,"962 Danielle Shoal, North Emilyberg, CO 00663",0,0
1,1,97674712-3704-4801-992a-36210a2d27e3,Jade Davis,46,"58847 Anne Springs Apt. 480, Coreystad, KS 842...",0,0
2,2,f2a78982-d10e-4ce0-b0ee-84bd721f4512,Jason Randolph,74,"984 Vang Spring Suite 995, Ellishaven, OR 4087...",0,0


In [38]:
# Preview: three generated businesses, to check the column layout.
generate_business(3)

Unnamed: 0,id,name,address,fraudsters_present
0,0,AGCO Corporation,"3288 Hickman Mall Suite 872, Garzafurt, MT 210...",0
1,1,StoneMor Partners L.P.,"15617 Kenneth Fork, West Diana, KS 11678-8902",0
2,2,"Global Brass and Copper Holdings, Inc.","46556 Richard Dam, Leeside, MO 16521-9998",0


In [48]:
# Data set for the run: 100 people and 1000 businesses.
p = generate_people(100)
b = generate_business(1000)

In [49]:
# Number of businesses flagged with fraudsters_present == 1.
b.fraudsters_present.sum()

11

In [63]:
# Normal transactions. This call also updates `p` in place, marking people
# who transacted at a business with fraudsters as compromised.
t = generate_normal_transactions(p, b)

In [64]:
# Fraudulent transactions for the people marked as compromised by the
# previous cell, so that cell has to run first.
f = generate_fradulent_transactions(p, b)

In [65]:
# Disputed ratio: share of fraudulent transactions among all generated
# transactions (normal plus fraudulent).
len(f) / (len(t) + len(f))

0.045342533267619514

In [66]:
# Preview: the first five normal and the first five fraudulent log lines.
t[:5], f[:5]

(['2016-08-05 14:36:38 uuid=29c0a14c-e7f9-4be3-9ca0-0906f20c1a5e user="Donna Benitez" business="General Growth Properties, Inc." amount=213.48 disputed=false',
  '2016-08-05 14:37:03 uuid=29c0a14c-e7f9-4be3-9ca0-0906f20c1a5e user="Donna Benitez" business="Energy Transfer Equity, L.P." amount=218.77 disputed=false',
  '2016-08-05 14:36:49 uuid=515b1965-c6c8-44bf-947f-37a395a9a6a3 user="Sharon Bradley" business="Cobalt International Energy, Inc." amount=29.01 disputed=false',
  '2016-08-05 14:36:27 uuid=22f5fbf7-1fba-4f9c-a6b7-ca913944c2b6 user="Patricia Joseph" business="Orion Engineered Carbons S.A" amount=292.71 disputed=false',
  '2016-08-05 14:36:38 uuid=22f5fbf7-1fba-4f9c-a6b7-ca913944c2b6 user="Patricia Joseph" business="HudBay Minerals Inc" amount=120.27 disputed=false'],
 ['2016-08-10 18:59:32 uuid=a576d115-620b-41f1-9890-58d1551bfd2d user="Holly Montoya" business="New Relic, Inc." amount=149.43 disputed=true',
  '2016-08-07 10:02:37 uuid=3f27b95e-8161-4ebc-bce4-a47175a98124 use

In [67]:
# Total number of log lines expected once both lists are combined.
len(t) + len(f)

2029

In [68]:
# Append the fraudulent lines to `t` in place and show the combined length.
# NOTE: not idempotent. Running this cell again appends `f` a second time.
t.extend(f)
len(t)

2029

In [73]:
# Write every transaction in `t` to output.log, one per line.
# The handle is deliberately not named `f`: `f` already holds the list of
# fraudulent transactions, and rebinding it here would leave a closed file in
# its place for any earlier cell that is re-run afterwards.
with open('output.log', 'w', encoding='utf-8') as log_file:
    log_file.writelines(line + '\n' for line in t)

In [74]:
# Shell escape: list the working directory to confirm output.log exists.
! ls

Untitled.ipynb     companylist.csv    output.log
[37mbin[m[m                [37minclude[m[m            pip-selfcheck.json
companies.csv      [37mlib[m[m                [37mshare[m[m


In [75]:
# Shell escape: show the first lines of the written log.
! head output.log

2016-08-05 14:36:38 uuid=29c0a14c-e7f9-4be3-9ca0-0906f20c1a5e user="Donna Benitez" business="General Growth Properties, Inc." amount=213.48 disputed=false
2016-08-05 14:37:03 uuid=29c0a14c-e7f9-4be3-9ca0-0906f20c1a5e user="Donna Benitez" business="Energy Transfer Equity, L.P." amount=218.77 disputed=false
2016-08-05 14:36:49 uuid=515b1965-c6c8-44bf-947f-37a395a9a6a3 user="Sharon Bradley" business="Cobalt International Energy, Inc." amount=29.01 disputed=false
2016-08-05 14:36:27 uuid=22f5fbf7-1fba-4f9c-a6b7-ca913944c2b6 user="Patricia Joseph" business="Orion Engineered Carbons S.A" amount=292.71 disputed=false
2016-08-05 14:36:38 uuid=22f5fbf7-1fba-4f9c-a6b7-ca913944c2b6 user="Patricia Joseph" business="HudBay Minerals Inc" amount=120.27 disputed=false
2016-08-05 14:36:17 uuid=22f5fbf7-1fba-4f9c-a6b7-ca913944c2b6 user="Patricia Joseph" business="DHT Holdings, Inc." amount=183.34 disputed=false
2016-08-05 14:37:07 uuid=a576d115-620b-41f1-9890-58d1551bfd2d user="Holly Montoya" busi