# Synthetic data generator
## generation (04/10/2020), Version: 1


In [22]:
import pandas as pd
import numpy.random as rand
import numpy as np

## Generate Users
### Load usernames and passwords from csv files

In [5]:
# Username/count table: the file's first line is consumed as the header
# (header=0) and the two columns are renamed to Value/Count.
usernames_csv = pd.read_csv(
    r"D:\temp\Akamai\datasets\usernames\extracted archive.zip\users.csv",
    header=0,
    names=['Value', 'Count'],
    dtype={'Value': str, 'Count': 'int32'},
    nrows=200000,  # only the first 200000 data rows are loaded
)
# Password/count table: the first line is skipped and the columns are named
# Value/Count (no explicit dtypes are requested here).
passwords_csv = pd.read_csv(
    r"D:\temp\Akamai\datasets\passwords\haveibeenpwned.csv",
    names=['Value', 'Count'],
    skiprows=1,
)
# User-agent/count table, loaded the same way as the usernames but in full.
useragents_csv = pd.read_csv(
    r"D:\temp\Akamai\datasets\SCRIPTS\user-agents-strings-with-counts.csv",
    header=0,
    names=['Value', 'Count'],
    dtype={'Value': str, 'Count': 'int32'},
)

### Sampling helper functions

In [6]:
def uniform_sample(data, amount):
    """Draw `amount` entries of ``data['Value']`` uniformly, with replacement."""
    candidates = data['Value']
    return rand.choice(
        candidates,
        size=amount,
        replace=True,
    )

def sample(data, distribution, amount):
    """Draw `amount` entries of ``data['Value']`` with replacement.

    `distribution` holds the selection probability of each entry and is
    passed straight to ``rand.choice`` as ``p``.
    """
    candidates = data['Value']
    return rand.choice(
        candidates,
        size=amount,
        replace=True,
        p=distribution,
    )

def unique_sample_replace(data, distribution, amount):
    """Draw `amount` distinct entries of ``data['Value']``.

    Despite the name, the draw is made WITHOUT replacement
    (``replace=False``), so no row is selected twice. `distribution`
    holds the selection probability of each entry.
    """
    candidates = data['Value']
    return rand.choice(
        candidates,
        size=amount,
        replace=False,
        p=distribution,
    )

def normal_distribution(size):
    """Return `size` non-negative weights summing to 1, largest first.

    The weights are absolute values of draws from a normal distribution
    with mean 1 and standard deviation 1, normalised by their total.
    """
    generator = rand.default_rng()
    magnitudes = np.abs(generator.normal(1, 1, size))
    weights = magnitudes / np.sum(magnitudes)
    # Sorting the negated values ascending and negating again yields a
    # descending order.
    return -np.sort(-weights)

### Main function to generate users

In [13]:
def generate_users(usernames, passwords, amount):
    """Build a DataFrame of `amount` users with 'username' and 'password' columns.

    Usernames are drawn without repetition and passwords with repetition;
    both draws are weighted by the relative 'Count' of each entry.
    """
    username_weights = usernames['Count'] / np.sum(usernames['Count'])
    picked_usernames = unique_sample_replace(usernames, username_weights, amount)

    password_weights = passwords['Count'] / np.sum(passwords['Count'])
    picked_passwords = sample(passwords, password_weights, amount)

    return pd.DataFrame({
        'username': picked_usernames,
        'password': picked_passwords,
    })

### Start generating users.

In [14]:
# Number of synthetic users to create.
amount_of_users = 20 * 1000

users = generate_users(
    usernames=usernames_csv,
    passwords=passwords_csv,
    amount=amount_of_users,
)

## Generate authentication request records
### Functions for generating similar strings, to simulate users' typos when entering passwords

In [15]:
def get_replaced_letter_in_string(string, index, new_letter):
    """Return a copy of `string` whose character at `index` is `new_letter`."""
    chars = [*string]
    chars[index] = new_letter
    return "".join(chars)

def get_additional_letter_in_string(string, index, new_letter):
    """Return a copy of `string` with `new_letter` inserted before position `index`."""
    chars = [*string]
    # Assigning to an empty slice is the slice form of list.insert().
    chars[index:index] = [new_letter]
    return "".join(chars)

def get_removed_letter_in_string(string, index):
    """Return a copy of `string` without the character at `index`."""
    chars = [*string]
    chars.pop(index)
    return "".join(chars)

def new_letter():
    """Return one random ASCII letter or digit, chosen uniformly."""
    alphabet = [*'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789']
    return rand.choice(alphabet)

def get_similar_string_of(string, other_passwords):
    """Return a string resembling `string`, simulating a mistyped password.

    One of four changes is chosen at random:

    * "replace" (p=0.4)  - one character is replaced by a different letter,
    * "add"     (p=0.25) - one random letter is inserted (any position,
      including the end),
    * "rem"     (p=0.25) - one character is removed,
    * "other"   (p=0.1)  - an unrelated entry of `other_passwords` is
      returned instead (it may, by chance, equal `string`).

    An empty `string` has nothing to replace or remove, so in those cases
    a letter is inserted instead of raising.
    """
    change_type = rand.choice(["replace", "add", "rem", "other"], p=[0.4, 0.25, 0.25, 0.1])

    if change_type == "other":
        return rand.choice(other_passwords)

    if not string:
        # rand.randint(0, 0) would raise; insertion is the only edit that
        # makes sense for an empty string.
        change_type = "add"

    if change_type == "replace":
        index = rand.randint(0, len(string))
        letter = new_letter()
        # Re-draw until the letter differs, otherwise the "typo" would be
        # identical to the original string.
        while letter == string[index]:
            letter = new_letter()
        return get_replaced_letter_in_string(string, index, letter)

    if change_type == "add":
        # randint's upper bound is exclusive: len(string) + 1 also allows
        # appending after the last character.
        return get_additional_letter_in_string(string, rand.randint(0, len(string) + 1), new_letter())

    # change_type == "rem"
    return get_removed_letter_in_string(string, rand.randint(0, len(string)))

### Main functions for generating authentication requests

In [16]:
def shuffle_dataframe(df):
    """Return the rows of `df` in random order with a fresh 0..n-1 index."""
    shuffled = df.sample(frac=1)
    return shuffled.reset_index(drop=True)

def generate_authentication_requests_for_user(user, passwords, useragents, precentage_of_fail, request_times, onfail_function):
    """Create one authentication request per entry of `request_times` for `user`.

    The user is first given 1-4 distinct user agents (1 + Binomial(3, 0.1)),
    drawn according to the relative 'Count' of each agent. Every request
    then uses one of those agents at random and fails with probability
    `precentage_of_fail`. A successful request carries the user's real
    password; a failed one carries
    ``onfail_function(user['password'], passwords['Value'])``.

    Returns a list of ``[time, username, password, useragent, outcome]`` rows.
    """
    useragent_weights = useragents['Count']/np.sum(useragents['Count'])
    useragent_count = np.random.binomial(3, 0.1, 1)[0]+1
    chosen_useragents = unique_sample_replace(useragents, useragent_weights, useragent_count)

    outcome_weights = [1-precentage_of_fail, precentage_of_fail]
    rows = []
    for request_time in request_times:
        sent_useragent = rand.choice(chosen_useragents)
        outcome = rand.choice(['Success', 'Fail'], p=outcome_weights)
        sent_password = (
            user['password']
            if outcome == 'Success'
            else onfail_function(user['password'], passwords['Value'])
        )
        rows.append([request_time, user['username'], sent_password, sent_useragent, outcome])

    return rows

def generate_authentication_requests(users, passwords, useragents, precentage_of_fail, amount, max_time, onfail_function):
    """Generate `amount` authentication requests spread over `users`.

    The users are shuffled and visited one by one; each visit contributes
    ``1 + Binomial(amount / len(users) - 1, 0.6)`` requests (clipped to
    what is still missing) with integer times drawn uniformly from
    ``[0, max_time)``. The whole pass is repeated until enough requests
    exist. A progress line is printed for every visited user.

    Parameters
    ----------
    users : DataFrame with 'username' and 'password' columns.
    passwords : DataFrame whose 'Value' column is handed to `onfail_function`.
    useragents : DataFrame with 'Value' and 'Count' columns.
    precentage_of_fail : probability that a single request fails.
    amount : total number of requests to create (a float is accepted).
    max_time : exclusive upper bound of the generated request times.
    onfail_function : callable ``(real_password, password_values)`` that
        returns the password sent by a failing request.

    Returns
    -------
    DataFrame with the columns Time, Username, Password, Useragent, Outcome.

    Raises
    ------
    ValueError
        If requests are asked for but `users` is empty (the generation
        loop could otherwise never finish).
    """
    rng = rand.default_rng()
    amount_of_users = len(users.index)
    if amount_of_users == 0 and amount > 0:
        raise ValueError('cannot generate authentication requests without users')

    # Loop-invariant binomial "n". It is clamped at 0 because a negative n
    # (amount smaller than the number of users) makes rng.binomial raise.
    max_extra_requests = max(amount/amount_of_users-1, 0) if amount_of_users else 0

    requests = []
    while len(requests) < amount:
        users = shuffle_dataframe(users)

        for _, user in users.iterrows():
            print('Amount of requests created:', len(requests), len(requests)/amount)
            if amount <= len(requests):
                break
            amount_of_user_requests = rng.binomial(max_extra_requests, 0.6)+1
            # At least one request is still missing at this point; never
            # clip to 0 (a fractional `amount` would otherwise stall forever).
            remaining = max(int(amount-len(requests)), 1)
            if amount_of_user_requests > remaining:
                amount_of_user_requests = remaining
            request_times = rand.randint(0, max_time, amount_of_user_requests)
            user_requests = generate_authentication_requests_for_user(user, passwords, useragents, precentage_of_fail, request_times, onfail_function)
            # extend() grows the list in place; `requests + user_requests`
            # copied every accumulated row on each iteration.
            requests.extend(user_requests)

    return pd.DataFrame(requests, columns=['Time', 'Username', 'Password', 'Useragent', 'Outcome'])


### Start generating requests

In [17]:
# Parameters of the benign traffic.
precentage_of_fail = 0.01
amount_of_requests = 2 * (10**6)
max_time = 500000

# A failed benign login sends a typo-like variant of the real password.
generated_requests = generate_authentication_requests(
    users,
    passwords_csv,
    useragents_csv,
    precentage_of_fail,
    amount_of_requests,
    max_time,
    get_similar_string_of,
)
# Order chronologically; reset_index() keeps the pre-sort position in an
# extra 'index' column.
generated_requests = generated_requests.sort_values('Time').reset_index()

len(generated_requests.index)

86609
Amount of requests created: 1973278 0.986639
Amount of requests created: 1973333 0.9866665
Amount of requests created: 1973394 0.986697
Amount of requests created: 1973454 0.986727
Amount of requests created: 1973510 0.986755
Amount of requests created: 1973567 0.9867835
Amount of requests created: 1973628 0.986814
Amount of requests created: 1973686 0.986843
Amount of requests created: 1973754 0.986877
Amount of requests created: 1973814 0.986907
Amount of requests created: 1973871 0.9869355
Amount of requests created: 1973928 0.986964
Amount of requests created: 1973989 0.9869945
Amount of requests created: 1974048 0.987024
Amount of requests created: 1974117 0.9870585
Amount of requests created: 1974182 0.987091
Amount of requests created: 1974233 0.9871165
Amount of requests created: 1974294 0.987147
Amount of requests created: 1974344 0.987172
Amount of requests created: 1974404 0.987202
Amount of requests created: 1974469 0.9872345
Amount of requests created: 1974532 0.9872

2000000

### Generating malicious requests

In [18]:
def get_mal_pass_distribution(size):
    """Return a uniform probability vector of length `size` (each entry 1/size)."""
    return np.ones(size) / size


# Parameters of the malicious traffic.
precentage_of_fail_mal = 0.98
amount_of_requests_mal = 4 * (10**5)  # a request count: kept an integer (was the float 0.4 * 10**6)
max_time_mal = 350000
password_list_mal = passwords_csv[passwords_csv.index.isin(np.arange(100))] # top 100 passwords (presumably the file is sorted by count - TODO confirm)
# The distribution is sized from the list actually selected, so the two
# cannot disagree if the password file holds fewer than 100 rows.
password_distribution_mal = get_mal_pass_distribution(len(password_list_mal))
fraction_of_users_known_to_mal = 0.2
users_mal = users.sample(frac=fraction_of_users_known_to_mal)
# Only the first 75 rows of the user-agent table are used for the attacker.
useragents_mal = useragents_csv[useragents_csv.index.isin(np.arange(75))]

# A failed malicious login ignores the real password and sends one of the
# listed passwords chosen uniformly.
generated_requests_mal = generate_authentication_requests(
    users_mal,
    password_list_mal,
    useragents_mal,
    precentage_of_fail_mal,
    amount_of_requests_mal,
    max_time_mal,
    lambda password, other_pass:  rand.choice(other_pass, p=password_distribution_mal),
).sort_values('Time').reset_index()

len(generated_requests_mal.index)

uests created: 372637 0.9315925
Amount of requests created: 372698 0.931745
Amount of requests created: 372757 0.9318925
Amount of requests created: 372820 0.93205
Amount of requests created: 372883 0.9322075
Amount of requests created: 372945 0.9323625
Amount of requests created: 373002 0.932505
Amount of requests created: 373068 0.93267
Amount of requests created: 373121 0.9328025
Amount of requests created: 373183 0.9329575
Amount of requests created: 373240 0.9331
Amount of requests created: 373300 0.93325
Amount of requests created: 373356 0.93339
Amount of requests created: 373420 0.93355
Amount of requests created: 373491 0.9337275
Amount of requests created: 373551 0.9338775
Amount of requests created: 373598 0.933995
Amount of requests created: 373657 0.9341425
Amount of requests created: 373709 0.9342725
Amount of requests created: 373778 0.934445
Amount of requests created: 373840 0.9346
Amount of requests created: 373898 0.934745
Amount of requests created: 373954 0.934885


400000

### Save to CSV

In [19]:
# Only these columns are written; the 'index' column added by
# reset_index() is left out.
columns_to_export = ['Time', 'Username', 'Password', 'Useragent', 'Outcome']
generated_requests.to_csv(
    'generated2.csv',
    columns=columns_to_export,
    index=False,
)
generated_requests_mal.to_csv(
    'generated_malicious2.csv',
    columns=columns_to_export,
    index=False,
)

# Normalize to 1NF (split the user agent into separate columns)

In [11]:
from user_agents import parse

In [9]:
def split_ua(ua):
    """Parse a user-agent string into a flat list of seven fields.

    The order is: browser family, browser version, device family, device
    brand, device model, OS family, OS version.
    """
    parsed = parse(ua)
    browser, device, system = parsed.browser, parsed.device, parsed.os
    return [
        browser.family,
        browser.version_string,
        device.family,
        device.brand,
        device.model,
        system.family,
        system.version_string,
    ]

In [116]:
# Reload the benign requests written to 'generated2.csv' earlier in the notebook.
datas = pd.read_csv('generated2.csv')
# Bare expression: displays the frame as the notebook cell output.
datas

Unnamed: 0,Time,Username,Password,Useragent,Outcome
0,0,cole_ya,pokemon,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,Success
1,0,skibo_,02012006,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6...,Success
2,1,AmazonianNewbie,rushhour,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6...,Success
3,2,TheDanimal8888,azerty,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3...,Success
4,2,BleachedSkeleton,nathan8,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3...,Success
...,...,...,...,...,...
1999995,499999,sadlyEgyptian,agnieszka,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,Success
1999996,499999,loved1youarewith,westside,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,Success
1999997,499999,JustADood,24111982,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,Success
1999998,499999,cumspaces,110966,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,Success


In [7]:
# Parsing a user agent is expensive and the column holds few distinct
# strings, so each distinct value is parsed once and the result is reused
# for every row.
parsed_by_useragent = {ua: split_ua(ua) for ua in datas['Useragent'].unique()}
splitted_ua = pd.DataFrame(
    [parsed_by_useragent[ua] for ua in datas['Useragent']],
    columns=['Browser Family', 'Browser Version', 'Device Family', 'Device Brand', 'Device Model','OS Family', 'OS Version'],
)

NameError: name 'pd' is not defined

In [118]:
# Put the parsed user-agent columns next to the requests, then drop the
# raw 'Useragent' string they were derived from.
data_w_splitted_ua = pd.concat([datas, splitted_ua], axis='columns')
data_w_splitted_ua = data_w_splitted_ua.drop('Useragent', axis='columns')

In [120]:
# Export the request columns followed by the seven parsed user-agent columns.
columns_to_export = [
    'Time', 'Username', 'Password', 'Outcome',
    'Browser Family', 'Browser Version',
    'Device Family', 'Device Brand', 'Device Model',
    'OS Family', 'OS Version',
]
data_w_splitted_ua.to_csv(
    'generated2_ua_splitted.csv',
    columns=columns_to_export,
    index=False,
)