In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import math as math
import random
from faker.providers.person.en import Provider

In [2]:
# Input: real match data, read below as a tab-separated table indexed by onboardID
file = "realdata.tsv"
# Optional second input
poaFile = "poamatches.tsv"
# Output: where the generated table is written (tab-separated)
out_file = "generated_data.tsv"

In [3]:
# Lookup table: company name -> company category.
# Insertion order matters: it fixes the order of the per-company columns
# derived from this dict's keys.
company_type_dict = {
    "acxiom": "broker",
    "ancestry": "consumer",
    "beenverified": "broker",
    "clubhouse": "social",
    "infinitemediaconcepts": "broker",
    "intel": "broker",
    "mcdonalds": "consumer",
    "mediaocean": "broker",
    "mylife": "broker",
    "neustar": "broker",
    "oracle": "broker",
    "quora": "social",
    "target": "consumer",
    "tesla": "consumer",
    "thomsonreuters": "broker",
    "tmobile": "telco",
    "twitter": "social",
    "uber": "consumer",
    "walgreens": "consumer",
    "walmart": "consumer",
    "xfinity": "telco",
}

In [4]:
# Every known company, in the order the lookup table declares them
companies = [*company_type_dict]
# Bookkeeping columns followed by one column per company
cols = ["onboardID", "company1", "company2", *companies]
# Column name -> column values; filled in by the cells below
data = dict()

In [5]:
numPeople = 200 # number of rows to generate

# Seed both random number generators this notebook draws from (`random` and
# `np.random`) so that Restart & Run All regenerates the same data set.
# Set SEED to None to get a fresh, non-reproducible data set on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [6]:
# Draw one first name per person from Faker's English name list.
# Sampling is with replacement, so the same name can appear more than once.
names = np.random.choice(Provider.first_names, size=numPeople)
data["name"] = names

In [7]:
# Generate onboardIDs: sample without replacement so every ID is unique
id_candidates = range(10, numPeople * 10)
oids = random.sample(id_candidates, k=numPeople)
data["onboardID"] = oids

In [8]:
# Start from an all-zero match matrix: one column per company, one entry per person
zero_column = [0] * numPeople
for company in companies:
    data[company] = list(zero_column)  # each company gets its own list

In [9]:
# Turn the column dict into a table keyed by onboardID, then preview the first rows
data = pd.DataFrame.from_dict(data).set_index("onboardID")
data.head()

Unnamed: 0_level_0,acxiom,ancestry,beenverified,clubhouse,infinitemediaconcepts,intel,mcdonalds,mediaocean,mylife,name,...,quora,target,tesla,thomsonreuters,tmobile,twitter,uber,walgreens,walmart,xfinity
onboardID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1615,0,0,0,0,0,0,0,0,0,Wanda,...,0,0,0,0,0,0,0,0,0,0
337,0,0,0,0,0,0,0,0,0,Shakira,...,0,0,0,0,0,0,0,0,0,0
1887,0,0,0,0,0,0,0,0,0,Ardelia,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,Devyn,...,0,0,0,0,0,0,0,0,0,0
965,0,0,0,0,0,0,0,0,0,Javen,...,0,0,0,0,0,0,0,0,0,0


## Generate account pairings
This is the tricky part.
Some companies are rare (e.g., few people have an account with Tesla).
That rarity matters for testing, so the generated data needs to roughly reproduce the real account distribution.


In [10]:
# Step 1: load the real match data so we can see which companies are rare
people_sample = pd.read_csv(file, sep="\t", index_col="onboardID")
people_sample.sample(n=3)

Unnamed: 0_level_0,acxiom,ancestry,beenverified,clubhouse,infinitemediaconcepts,intel,mcdonalds,mediaocean,mylife,name,...,quora,target,tesla,thomsonreuters,tmobile,twitter,uber,walgreens,walmart,xfinity
onboardID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
424,0,0,0,1,0,0,0,0,0,Jeannie,...,0,0,0,1,0,1,1,1,0,0
146,0,0,0,0,0,0,0,0,0,Bobbie,...,1,0,1,0,1,1,1,1,0,0
683,0,0,0,0,0,0,0,0,0,Bridgette,...,0,1,0,0,0,1,0,1,1,1


In [14]:
# Step 2: per-company column sum divided by the number of rows, rarest first.
# Assuming the company columns are 0/1 flags, this is the share of people
# holding an account at each company.
account_totals = people_sample[companies].sum().sort_values()
rare_companies = account_totals / len(people_sample)
rare_companies

beenverified             0.01
neustar                  0.01
mediaocean               0.01
mylife                   0.01
infinitemediaconcepts    0.02
tesla                    0.04
acxiom                   0.05
thomsonreuters           0.06
intel                    0.08
xfinity                  0.12
clubhouse                0.12
oracle                   0.13
ancestry                 0.14
mcdonalds                0.15
quora                    0.22
walmart                  0.28
target                   0.29
walgreens                0.32
tmobile                  0.38
uber                     0.38
twitter                  0.48
dtype: float64

In [18]:
# Build the pool of companies to draw accounts from. Each company appears
# floor(share * numPeople + 1) times, i.e. roughly in proportion to its share
# and at least once, so rare companies are never dropped entirely.
copies_per_company = ((rare_companies * numPeople + 1) // 1).to_dict()
single_pool = [
    company
    for company, copies in copies_per_company.items()
    for _ in range(int(copies))
]
random.shuffle(single_pool)
company_pool = single_pool * 2  # two back-to-back copies of the shuffled pool

In [19]:
# Randomly assign a handful of accounts to people
random.shuffle(company_pool)

# Snapshot of the full pool, used to top it back up if it runs dry. Without
# this, `company_pool.pop()` raises IndexError as soon as the total number of
# draws exceeds the pool size (e.g. larger numPeople or rarer companies).
pool_refill = list(company_pool)

for person in oids:
    n_accounts = random.randint(1, 6)  # number of draws for this person (1 to 6 inclusive)
    for _ in range(n_accounts):
        if not company_pool:
            # Pool exhausted: refill from the snapshot and re-shuffle
            company_pool.extend(pool_refill)
            random.shuffle(company_pool)
        company = company_pool.pop()
        # Drawing the same company twice just sets the same cell again, so a
        # person can end up with fewer than n_accounts distinct accounts.
        data.at[person, company] = 1  # assign the new account
    random.shuffle(company_pool)

In [20]:
# Sanity check: show the population size, then the generated number of
# accounts per company (rarest first)
display("number of people", numPeople)
data.loc[:, companies].sum().sort_values()

'number of people'

200

beenverified               5
mylife                     5
neustar                    7
mediaocean                 7
infinitemediaconcepts     10
tesla                     17
acxiom                    24
thomsonreuters            29
intel                     33
xfinity                   41
clubhouse                 47
oracle                    47
ancestry                  58
mcdonalds                 59
quora                     68
target                    90
walmart                   96
walgreens                105
uber                     111
tmobile                  112
twitter                  124
dtype: int64

In [21]:
# Preview the first rows now that accounts have been assigned
data.head(n=5)

Unnamed: 0_level_0,acxiom,ancestry,beenverified,clubhouse,infinitemediaconcepts,intel,mcdonalds,mediaocean,mylife,name,...,quora,target,tesla,thomsonreuters,tmobile,twitter,uber,walgreens,walmart,xfinity
onboardID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1615,0,0,0,0,0,0,1,0,0,Wanda,...,0,0,0,0,1,1,1,1,0,0
337,0,0,0,0,0,0,1,0,0,Shakira,...,0,1,0,1,0,0,0,0,0,0
1887,0,0,0,0,0,0,0,0,0,Ardelia,...,0,0,1,1,1,1,1,1,1,1
12,0,0,0,0,0,0,0,0,0,Devyn,...,0,0,1,0,1,0,1,0,1,0
965,0,0,0,0,0,0,1,0,0,Javen,...,1,1,0,0,0,0,1,0,0,0


## Put the matches in a handy format for exporting

In [None]:
# Write the generated table as tab-separated values; the onboardID index is
# written out as the first column.
data.to_csv(out_file, sep="\t", index=True)