# Semantic Heterogeneous Database Simulations

Let's start by generating random records and semantic operations, which will be used to run the performance tests.

In [10]:
import time
import random
import pandas as pd
import numpy as np
from database_generator import DatabaseGenerator
from datetime import datetime

## Load Phase

Inserting all generated records into the semantic heterogeneous database. Please note that the PyMongo library may slow down insertions. However, because the simulator uses it internally, it is fair to also use it in our baseline test, so these delays should cancel out.

Let's decide all variables for this test

In [2]:
# Shape of the generated data set; the record, field and domain sizes are
# passed to DatabaseGenerator.generate() by the scenario functions.
number_of_records = 10 ** 5  # 100,000 records
number_of_versions = 5
number_of_fields = 11
number_of_values_in_domain = 20

# How many times each timed scenario is repeated.
number_of_tests = 5
# Presumably the confidence level for statistics computed over the timings;
# it is not used in the cells visible here -- TODO confirm.
confidence_interval = 0.95

### First scenario
Inserting all records and adding the semantic operations afterwards. 

In [3]:
def first_scenario():
    """Time a load that inserts every record first and runs the operations afterwards.

    A fresh ``DatabaseGenerator`` produces ``number_of_records`` records with a
    single version. The timed section covers the bulk insert, the
    ``generate_version()`` calls and the execution of every entry in
    ``d.operations``. Generating the initial records and destroying the
    database are not timed.

    Returns:
        float: Elapsed seconds of the timed section.
    """
    d = DatabaseGenerator()
    try:
        d.generate(
            number_of_records=number_of_records,
            number_of_versions=1,
            number_of_fields=number_of_fields,
            number_of_values_in_domain=number_of_values_in_domain,
        )
        records = pd.DataFrame(d.records)

        # perf_counter() is monotonic, so the interval cannot be distorted by
        # system clock adjustments the way time.time() can.
        start = time.perf_counter()
        d.collection.insert_many_by_dataframe(records, 'valid_from_date')

        # generate() was asked for one version; presumably each call below adds
        # one more, bringing the total to the configured number_of_versions
        # (this used to be a hard-coded 4, which equals 5 - 1).
        for _ in range(number_of_versions - 1):
            d.generate_version()

        for operation in d.operations:
            d.collection.execute_operation(operation[0], operation[1], operation[2])

        # The elapsed time is computed before the finally clause runs, so the
        # cleanup stays outside the measurement.
        return time.perf_counter() - start
    finally:
        # Always remove the test database, even when a step above fails.
        d.destroy()

In [4]:
# Elapsed seconds of each first-scenario run. Kept as an explicit loop so the
# timings already collected survive if a later run raises.
first_scenario_times = []

for _ in range(number_of_tests):
    first_scenario_times.append(first_scenario())

### Second Scenario
Loading records only after inserting semantic operations in the collection

In [5]:
def second_scenario():
    """Time a load where the operations are executed before the bulk insert.

    Only the first 10 records are inserted up front; then the
    ``generate_version()`` calls and every entry in ``d.operations`` are
    executed, and finally all remaining records are inserted. Generating the
    initial records and destroying the database are not timed.

    Returns:
        float: Elapsed seconds of the timed section.
    """
    initial_rows = 10

    d = DatabaseGenerator()
    try:
        d.generate(
            number_of_records=number_of_records,
            number_of_versions=1,
            number_of_fields=number_of_fields,
            number_of_values_in_domain=number_of_values_in_domain,
        )
        records = pd.DataFrame(d.records)

        # perf_counter() is monotonic, so the interval cannot be distorted by
        # system clock adjustments the way time.time() can.
        start = time.perf_counter()
        # Initial insert of a small seed of records.
        d.collection.insert_many_by_dataframe(records.head(initial_rows), 'valid_from_date')

        # generate() was asked for one version; presumably each call below adds
        # one more, bringing the total to the configured number_of_versions
        # (this used to be a hard-coded 4, which equals 5 - 1).
        for _ in range(number_of_versions - 1):
            d.generate_version()

        for operation in d.operations:
            d.collection.execute_operation(operation[0], operation[1], operation[2])

        # Insert every row after the seed. The previous records.head(-10)
        # selected all rows except the LAST ten, which re-inserted the seed
        # rows and never loaded the final ten records.
        d.collection.insert_many_by_dataframe(records.iloc[initial_rows:], 'valid_from_date')

        # The elapsed time is computed before the finally clause runs, so the
        # cleanup stays outside the measurement.
        return time.perf_counter() - start
    finally:
        # Always remove the test database, even when a step above fails.
        d.destroy()

In [6]:
# Elapsed seconds of each second-scenario run. Kept as an explicit loop so the
# timings already collected survive if a later run raises.
second_scenario_times = []

for _ in range(number_of_tests):
    second_scenario_times.append(second_scenario())

('grouping', datetime.datetime(2053, 1, 13, 0, 0), {'fieldName': 'okrpm', 'oldValues': [3381864, 7431303], 'newValue': 9019951})
('grouping', datetime.datetime(2082, 8, 24, 0, 0), {'fieldName': 'jkovi', 'oldValues': [2446727, 3037643], 'newValue': 7654390})
('translation', datetime.datetime(2028, 10, 27, 0, 0), {'fieldName': 'tonfs', 'oldValue': '2027-07-22', 'newValue': '2054-08-14'})
('grouping', datetime.datetime(2096, 10, 6, 0, 0), {'fieldName': 'zrday', 'oldValues': ['wgkjjtrcnjsheueysanealcuznvxquihsu', 'xdtvnzrixkcbubihuzsrvdwidgdfswnfwr'], 'newValue': 'gqbdvpmegxsxkbcivibeskqanoxgsqzfdh'})
('translation', datetime.datetime(2094, 11, 12, 0, 0), {'fieldName': 'yuxox', 'oldValue': 'ihkkvvpafnobvcqvpkrybcffuookpjouxe', 'newValue': 'rzjfafaobcvcznluhcabjmyshvflaskyxw'})
('grouping', datetime.datetime(2034, 9, 5, 0, 0), {'fieldName': 'zpzbk', 'oldValues': ['2044-02-14', '2035-10-17'], 'newValue': '2094-03-06'})
('grouping', datetime.datetime(2082, 12, 23, 0, 0), {'fieldName': 'zpzbk'

### Baseline Scenario

In a common database scenario, the records would simply be inserted as they were generated (in raw format). The user would have to deal with heterogeneity afterwards, in the querying phase. Therefore, for the loading phase, we only generate raw records and bulk insert them into the database.

In [7]:
def third_scenario():
    """Time the baseline load: a plain bulk insert of the raw records.

    No versions are generated and no operations are executed; only the
    ``insert_many_by_dataframe`` call is timed. Generating the records and
    destroying the database are not timed.

    Returns:
        float: Elapsed seconds of the bulk insert.
    """
    d = DatabaseGenerator()
    try:
        d.generate(
            number_of_records=number_of_records,
            number_of_versions=1,
            number_of_fields=number_of_fields,
            number_of_values_in_domain=number_of_values_in_domain,
        )
        records = pd.DataFrame(d.records)

        # perf_counter() is monotonic, so the interval cannot be distorted by
        # system clock adjustments the way time.time() can.
        start = time.perf_counter()
        d.collection.insert_many_by_dataframe(records, 'valid_from_date')

        # The elapsed time is computed before the finally clause runs, so the
        # cleanup stays outside the measurement.
        return time.perf_counter() - start
    finally:
        # Always remove the test database, even when a step above fails.
        d.destroy()

In [8]:
# Elapsed seconds of each baseline run. Kept as an explicit loop so the
# timings already collected survive if a later run raises.
third_scenario_times = []

for _ in range(number_of_tests):
    third_scenario_times.append(third_scenario())

## Querying tests

For the query tests, it does not matter which database (from the first loading scenario or from the second) is used. Both of them possess the same number of records, fields and domain values. Let's now analyse statistics in six different scenarios, just as in the YCSB benchmark.

### Scenario 1 - Writes Heavy

The first update scenario uses a workload of 50% reads and 50% writes.

Note there is a difference between the test using the first generation method and the second one, due to lazy insertion.

In [9]:
def generate_test_database():
    """Build and load a test database for the querying benchmarks.

    Follows the same steps as the first loading scenario, without timing:
    generate single-version records, bulk insert them, call
    ``generate_version()`` for the remaining versions and execute every entry
    in ``d.operations``.

    Returns:
        DatabaseGenerator: The populated generator. The caller is responsible
        for calling ``destroy()`` on it when done.
    """
    d = DatabaseGenerator()
    d.generate(
        number_of_records=number_of_records,
        number_of_versions=1,
        number_of_fields=number_of_fields,
        number_of_values_in_domain=number_of_values_in_domain,
    )
    records = pd.DataFrame(d.records)

    d.collection.insert_many_by_dataframe(records, 'valid_from_date')

    # generate() was asked for one version; presumably each call below adds one
    # more, bringing the total to the configured number_of_versions (this used
    # to be a hard-coded 4, which equals 5 - 1).
    for _ in range(number_of_versions - 1):
        d.generate_version()

    for operation in d.operations:
        d.collection.execute_operation(operation[0], operation[1], operation[2])

    return d

In [None]:
def write_heavy(percent_of_update):
    """Skeleton of the mixed read/write workload benchmark (not implemented yet).

    Builds a test database, draws 10000 random boolean operation flags, walks
    them inside a timed section and then destroys the database. Both branches
    of the loop are still ``pass`` placeholders, and the measured interval is
    neither returned nor printed in the code shown here.

    Args:
        percent_of_update: Threshold compared against ``random.random()`` when
            drawing each flag; presumably a fraction in [0, 1] -- TODO confirm.
    """
    # Generate the database just as before.
    d = generate_test_database()   

    # TODO: think of a better way to determine this order.
    # NOTE(review): each flag is True with probability 1 - percent_of_update
    # (for a threshold in [0, 1]). If True is meant to select an update, as the
    # inline comment says, the comparison looks inverted -- confirm the intent.
    sequence = [random.random() > percent_of_update for i in range(10000)]  # if True, update; if False, read

    start = time.time()
    for operation in sequence:
        # NOTE(review): the placeholder labels below (insert / update) do not
        # match the "update / read" meaning stated where the flags are drawn.
        if operation:
            pass  # placeholder: insert
        else:
            pass  # placeholder: update

    end = time.time()
    
    # Remove the test database created above.
    d.destroy()
    