# Imports

In [1]:
import math
import numpy as np
import pandas as pd

# Configuration

In [2]:
### PARAMETERS ###
nbr_records = 10000   # total number of synthetic records to generate
split_perc = 0.8      # fraction of the records labelled 'valid'
random_seed = 42      # seed for numpy's global RNG; set to None for a fresh dataset on every run

## Pre-requisites ##
# Seed once, before any random draw, so Restart & Run All reproduces the same dataset.
np.random.seed(random_seed)

nbr_valid_records = math.floor(nbr_records * split_perc)
nbr_invalid_records = nbr_records - nbr_valid_records

print( 'Nbr of valid records: ', nbr_valid_records)
print( 'Nbr of invalid records: ', nbr_invalid_records)

# Build the status column directly: the first nbr_valid_records rows are 'valid',
# the remaining rows are 'invalid'. This replaces the former chained assignment
# (df['app_status'][a:b] = ...), which needed the SettingWithCopy warning to be
# silenced globally and does not modify df when pandas Copy-on-Write is active.
df = pd.DataFrame({
    'app_id': np.arange(0, nbr_records),
    'app_status': np.repeat(['valid', 'invalid'], [nbr_valid_records, nbr_invalid_records]),
})


Nbr of valid records:  8000
Nbr of invalid records:  2000


# Record Structure Definition
## 1.1. IpRange: Distance Server - Client


In [3]:
# Adds the 'ipAddress' column to df: one value per record, valid rows first,
# then invalid rows (same ordering as 'app_status').

## Valid ##
# Poisson(4) counts, rescaled so the observed spread (max - min) maps onto 40000.
ipAddressValid = np.random.poisson(4, nbr_valid_records)
# `or 1` keeps the coefficient finite when every draw is identical (tiny samples).
ipAddress_span = (ipAddressValid.max() - ipAddressValid.min()) or 1
ipAddress_coeff = 40000 / ipAddress_span
ipAddressValid = pd.Series(np.floor(ipAddressValid * ipAddress_coeff))
#ipAddressValid.plot.hist()
#print( ipAddressValid.describe() )

## Invalid ##
# Poisson(3) counts, rescaled onto a spread of 20000 and shifted up by 10000.
ipAddressInvalid = np.random.poisson(3, nbr_invalid_records)
ipAddress_span = (ipAddressInvalid.max() - ipAddressInvalid.min()) or 1
ipAddress_coeff = 20000 / ipAddress_span
ipAddressInvalid = pd.Series(10000 + np.floor(ipAddressInvalid * ipAddress_coeff))
#ipAddressInvalid.plot.hist()
#print( ipAddressInvalid.describe() )

# Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# ignore_index=True renumbers 0..n-1 so df.assign aligns the values row by row.
ipAddress_column = pd.concat([ipAddressValid, ipAddressInvalid], ignore_index=True)
df = df.assign(ipAddress=ipAddress_column)

del ipAddressValid, ipAddressInvalid, ipAddress_span, ipAddress_coeff, ipAddress_column

#ax = df['ipAddress'].plot.hist()
#del ax

## 1.2. IpHop

In [5]:
# Adds the 'ipHop' column to df: valid rows first, then invalid rows.

## Valid ##
# Poisson(1) counts, rescaled so the observed spread (max - min) maps onto 3.
ipHopValid = np.random.poisson(1, nbr_valid_records)
# `or 1` keeps the coefficient finite when every draw is identical (tiny samples).
ipHop_span = (ipHopValid.max() - ipHopValid.min()) or 1
ipHop_coeff = 3 / ipHop_span
ipHopValid = pd.Series(np.floor(ipHopValid * ipHop_coeff))
#ipHopValid.plot.hist()
#print( ipHopValid.describe() )

## Invalid ##
# Poisson(3) counts, rescaled onto a spread of 4 and shifted up by 1.
ipHopInvalid = np.random.poisson(3, nbr_invalid_records)
ipHop_span = (ipHopInvalid.max() - ipHopInvalid.min()) or 1
ipHop_coeff = 4 / ipHop_span
ipHopInvalid = pd.Series(np.floor(ipHopInvalid * ipHop_coeff) + 1)
#ipHopInvalid.plot.hist()
#print( ipHopInvalid.describe() )

# Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
ipHop_column = pd.concat([ipHopValid, ipHopInvalid], ignore_index=True)
df = df.assign(ipHop=ipHop_column)

del ipHopValid, ipHopInvalid, ipHop_span, ipHop_coeff, ipHop_column

#ax = df['ipHop'].plot.hist()
#del ax

## 1.3. Time Submit Sec

In [6]:
# Adds the 'timeSubmitSec' column to df: valid rows first, then invalid rows.

## Valid ##
# Floored normal draws centred on 20 with standard deviation 2.
timeSubmitSecValid = pd.Series(
    np.floor( np.random.normal(loc=20, scale=2, size=nbr_valid_records) )
)
#timeSubmitSecValid.plot.hist()
#print( timeSubmitSecValid.describe() )

## Invalid ##
# Floored normal draws centred on 4 with standard deviation 2.
# NOTE(review): draws more than two standard deviations below the mean come out
# negative; confirm whether negative values are acceptable for this column.
timeSubmitSecInvalid = pd.Series(
    np.floor( np.random.normal(loc=4, scale=2, size=nbr_invalid_records) )
)
#timeSubmitSecInvalid.plot.hist()
#print( timeSubmitSecInvalid.describe() )

# Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
timeSubmitSec_column = pd.concat([timeSubmitSecValid, timeSubmitSecInvalid], ignore_index=True)
df = df.assign(timeSubmitSec=timeSubmitSec_column)

del timeSubmitSecValid, timeSubmitSecInvalid, timeSubmitSec_column

#ax = df['timeSubmitSec'].plot.hist()
#del ax

## 1.4. Browser

In [7]:
# Adds the 'browser' column to df: valid rows first, then invalid rows.
# NOTE(review): the column holds numeric codes; `browsers` is defined here but
# is not used to translate the codes into names in this cell.
browsers = ["edge","chrome","safari","firefox","opera","vivaldi"]

## Valid ##
# Floored standard-gamma draws (shape 2).
# NOTE(review): gamma draws have no upper bound, so codes can exceed
# len(browsers) - 1; confirm whether they should be capped.
browserValid = pd.Series(np.floor( np.random.standard_gamma(2, nbr_valid_records) ))
#browserValid.plot.hist()
#print( browserValid.describe() )

## Invalid ##
# Power-distribution draws (exponent parameter 3) scaled by 6, then floored.
browserInvalid = pd.Series(np.floor( 6 * np.random.power(3, size=nbr_invalid_records) ))
#browserInvalid.plot.hist()
#print( browserInvalid.describe() )

# Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
browser_column = pd.concat([browserValid, browserInvalid], ignore_index=True)
df = df.assign(browser=browser_column)

del browserValid, browserInvalid, browser_column

#ax = df['browser'].plot.hist()
#del ax

## 1.5. Packages

In [8]:
# Adds the 'package' column to df: valid rows first, then invalid rows.
# NOTE(review): the column holds numeric codes; `packages` is defined here but
# is not used to translate the codes into names in this cell.
packages = ["pkg_alpha","pkg_beta","pkg_delta","pkg_epsilon","pkg_eta","pkg_gamma","pkg_iota","pkg_kappa","pkg_lambda","pkg_omega","pkg_omicron","pkg_sigma","pkg_theta","pkg_upsilon","pkg_zeta"]

## Valid ##
# Floored standard-gamma draws (shape 2) multiplied by 3, i.e. multiples of 3.
# NOTE(review): gamma draws have no upper bound, so codes can exceed
# len(packages) - 1; confirm whether they should be capped.
packageValid = pd.Series(np.floor( np.random.standard_gamma(2, nbr_valid_records) ) * 3)
#packageValid.plot.hist()
#print( packageValid.describe() )

## Invalid ##
# Power-distribution draws (exponent parameter 3) scaled by 15, then floored.
packageInvalid = pd.Series(np.floor( 15 * np.random.power(3, size=nbr_invalid_records) ))
#packageInvalid.plot.hist()
#print( packageInvalid.describe() )

# Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
package_column = pd.concat([packageValid, packageInvalid], ignore_index=True)
df = df.assign(package=package_column)

del packageValid, packageInvalid, package_column

#ax = df['package'].plot.hist()
#del ax

## 1.6. In list

In [10]:
# Adds the 'inList' column to df: valid rows first, then invalid rows.

## Valid ##
# Floored standard-gamma draws (shape 0.5).
inListValid = pd.Series(np.floor( np.random.standard_gamma(0.5, size=nbr_valid_records) ))
#inListValid.plot.hist()
#print( inListValid.describe() )

## Invalid ##
# Power-distribution draws (exponent parameter 3) scaled by 4, then floored.
inListInvalid = pd.Series(np.floor( 4 * np.random.power(3, size=nbr_invalid_records) ))
#inListInvalid.plot.hist()
#print( inListInvalid.describe() )

# Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
inList_column = pd.concat([inListValid, inListInvalid], ignore_index=True)
df = df.assign(inList=inList_column)

del inListValid, inListInvalid, inList_column

#ax = df['inList'].plot.hist()
#del ax

## 1.7. Is Customer

In [11]:
# Adds the binary 'inCust' column to df: valid rows first, then invalid rows.

## Valid ##
# Uniform integers in [0, 20) reduced modulo 4; the flag is 1 only when the
# remainder is 0, otherwise 0.
inCustValid = np.random.randint(0, 20, size=nbr_valid_records) % 4
inCustValid = pd.Series(np.where(inCustValid > 0, 0, 1))
#inCustValid.plot.hist()
#print( inCustValid.describe() )

## Invalid ##
# Same draw, opposite mapping: the flag is 1 when the remainder is non-zero.
inCustInvalid = np.random.randint(0, 20, size=nbr_invalid_records) % 4
inCustInvalid = pd.Series(np.where(inCustInvalid > 0, 1, 0))
#inCustInvalid.plot.hist()
#print( inCustInvalid.describe() )

# Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
inCust_column = pd.concat([inCustValid, inCustInvalid], ignore_index=True)
df = df.assign(inCust=inCust_column)
del inCustValid, inCustInvalid, inCust_column

#ax = df['inCust'].plot.hist()
#del ax

## 1.8. Day of the week

In [12]:
# Adds the 'weekDay' column to df: valid rows first, then invalid rows.
# NOTE(review): the column holds numeric codes; `weekDays` is defined here but
# is not used to translate the codes into names in this cell.
weekDays = ["monday","tuesday","wednesday","thursday","friday","saturday","sunday"]

## Valid ##
# Power-distribution draws (exponent parameter 4) scaled by 7, then floored.
weekDayValid = pd.Series(np.floor( np.random.power(4, size=nbr_valid_records) * 7 ))
#weekDayValid.plot.hist()
#print( weekDayValid.describe() )

## Invalid ##
# Power-distribution draws (exponent parameter 2) scaled by 7, then floored;
# codes below 4 are then shifted up by 3.
weekDayInvalid = np.floor( np.random.power(2, size=nbr_invalid_records) * 7 )
weekDayInvalid = pd.Series(np.where( weekDayInvalid < 4, weekDayInvalid + 3, weekDayInvalid))
#weekDayInvalid.plot.hist()
#print( weekDayInvalid.describe() )

# Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
weekDay_column = pd.concat([weekDayValid, weekDayInvalid], ignore_index=True)
df = df.assign(weekDay=weekDay_column)
del weekDayValid, weekDayInvalid, weekDay_column

#ax = df['weekDay'].plot.hist()
#del ax

## 1.9. Hour of the day

In [13]:
# Adds the 'appHour' column to df: valid rows first, then invalid rows.
#
# Each record receives exactly ONE hour, picked at random (50/50) from a
# morning draw and an evening draw. The previous version concatenated the full
# morning and evening series, producing 2 * nbr_records values; df.assign then
# kept only the first nbr_records of them by index alignment, so every row
# (including the invalid ones) was filled from the valid distribution.

## Valid - Morning ##
appHourAM = np.floor( np.random.standard_normal(nbr_valid_records) * 2) + 11
#pd.Series(appHourAM).plot.hist()

## Valid - PM ##
appHourPM = np.floor( np.random.standard_normal(nbr_valid_records) * 2) + 22
#pd.Series(appHourPM).plot.hist()

# Per-record coin flip between the two peaks, then wrap onto the 0..23 clock.
# Modulo 24 maps 24 -> 0 and 25 -> 1 (the former `x - 23` mapped 24 -> 1).
appHourValid = np.where(np.random.rand(nbr_valid_records) < 0.5, appHourAM, appHourPM)
appHourValid = pd.Series(np.mod(appHourValid, 24))
#appHourValid.plot.hist()
#print( appHourValid.describe() )

################################################

## Invalid - Morning ##
appHourAM = np.floor( np.random.standard_normal(nbr_invalid_records) * 2) + 9
#pd.Series(appHourAM).plot.hist()

## Invalid - PM ##
appHourPM = np.floor( np.random.standard_normal(nbr_invalid_records) * 2) + 24
#pd.Series(appHourPM).plot.hist()

appHourInvalid = np.where(np.random.rand(nbr_invalid_records) < 0.5, appHourAM, appHourPM)
appHourInvalid = pd.Series(np.mod(appHourInvalid, 24))
#appHourInvalid.plot.hist()
#print( appHourInvalid.describe() )

# Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
appHour_column = pd.concat([appHourValid, appHourInvalid], ignore_index=True)
# Guard against the length mismatch that previously went unnoticed.
assert len(appHour_column) == len(df), "appHour must contain exactly one value per record"
df = df.assign(appHour=appHour_column)

del appHourAM, appHourPM, appHourValid, appHourInvalid, appHour_column

#ax = df['appHour'].plot.hist()
#del ax

## Score

In [14]:
# Every record starts with a score of 0.
score_defaults = {'score': 0}
df = df.assign(**score_defaults)

# Export to CSV

In [15]:
# Write the generated dataset to disk without the DataFrame index column.
df.to_csv(path_or_buf='datagen.csv', index=False)