# Semantic Heterogeneous Database Simulations

Let's start by generating random records and semantic operations, which will be used to run the performance tests.

In [1]:
import time
import json
import random
import math
import pandas as pd
import numpy as np
from pymongo import MongoClient
from database_generator import DatabaseGenerator
from datetime import datetime
pd.options.mode.chained_assignment = None  # default='warn'

## Load Phase

Inserting all generated records into the semantic heterogeneous database. Please note that the PyMongo library may slow down insertions. However, because the simulator uses it internally, it is fair to also use it in our baseline test, so these delays should cancel out. 

Let's decide all variables for this test

In [2]:
# Data-set shape. number_of_records, number_of_fields and
# number_of_values_in_domain are forwarded to DatabaseGenerator.generate()
# by every scenario in this notebook.
number_of_records = 100000
# Presumably the total number of schema versions each data set should end up
# with — TODO confirm against DatabaseGenerator.
number_of_versions = 5
number_of_fields = 11
number_of_values_in_domain=20

# How many times each timed experiment is repeated.
number_of_tests = 5
# NOTE(review): presumably the confidence level for the statistics computed
# from the repeated runs — confirm where it is consumed.
confidence_interval = 0.95

# Host name handed to MongoClient for the baseline (plain MongoDB) runs.
host = 'localhost'

# Accumulates one row per timed run; the cells below add the columns
# 'stage', 'experiment' and 'time'.
performance_results = pd.DataFrame()

### First scenario
Inserting all records and adding the semantic operations afterwards. 

In [3]:
def first_scenario():
    """Time the "insert-first" load: bulk insert, then apply the operations.

    A fresh data set is generated with a single version. The timed section
    then bulk-inserts every record, generates the remaining versions and
    executes every semantic operation on the collection.

    Returns:
        float: elapsed seconds for the timed section. Data generation and
        teardown are not included.
    """
    d = DatabaseGenerator()
    try:
        d.generate(number_of_records=number_of_records,
                   number_of_versions=1,
                   number_of_fields=number_of_fields,
                   number_of_values_in_domain=number_of_values_in_domain)
        records = pd.DataFrame(d.records)

        # perf_counter is monotonic, so the measurement cannot be skewed by
        # system clock adjustments.
        start = time.perf_counter()
        d.collection.insert_many_by_dataframe(records, 'valid_from_date')

        # generate() already produced version 1; derive the remaining count
        # from the notebook-level setting instead of repeating a literal.
        for _ in range(number_of_versions - 1):
            d.generate_version()

        for operation_type, version_date, arguments in d.operations:
            d.collection.execute_operation(operation_type, version_date, arguments)

        elapsed = time.perf_counter() - start
    finally:
        # Always tear the generator down so a failed run does not leave its
        # data behind for the next repetition.
        d.destroy()
    return elapsed

# Repeat the insert-first load and record the duration of every run.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
for _ in range(number_of_tests):
    time_taken = first_scenario()
    new_row = pd.DataFrame([
        {'stage': 'LoadingPhase', 'experiment': 'insert-first', 'time': time_taken},
    ])
    performance_results = pd.concat([performance_results, new_row], ignore_index=True)

### Second Scenario
Loading records only after inserting semantic operations in the collection

In [4]:
def second_scenario():
    """Time the "operations-first" load: apply operations, then bulk insert.

    A fresh data set is generated with a single version. The timed section
    inserts a small initial batch, generates the remaining versions,
    executes every semantic operation and finally inserts all records that
    were not part of the initial batch.

    Returns:
        float: elapsed seconds for the timed section. Data generation and
        teardown are not included.
    """
    initial_batch_size = 10

    d = DatabaseGenerator()
    try:
        d.generate(number_of_records=number_of_records,
                   number_of_versions=1,
                   number_of_fields=number_of_fields,
                   number_of_values_in_domain=number_of_values_in_domain)
        records = pd.DataFrame(d.records)

        start = time.perf_counter()
        # Initial insert of the first few records.
        d.collection.insert_many_by_dataframe(
            records.iloc[:initial_batch_size], 'valid_from_date')

        # generate() already produced version 1.
        for _ in range(number_of_versions - 1):
            d.generate_version()

        for operation_type, version_date, arguments in d.operations:
            d.collection.execute_operation(operation_type, version_date, arguments)

        # Insert everything AFTER the initial batch. The previous
        # `records.head(-10)` selected all rows except the LAST ten, which
        # inserted the initial batch twice and never loaded the final rows.
        remaining = records.iloc[initial_batch_size:]
        if not remaining.empty:
            d.collection.insert_many_by_dataframe(remaining, 'valid_from_date')

        elapsed = time.perf_counter() - start
    finally:
        # Always tear the generator down, even when the run fails.
        d.destroy()
    return elapsed

In [5]:
# Repeat the operations-first load and record the duration of every run.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
for _ in range(number_of_tests):
    time_taken = second_scenario()
    new_row = pd.DataFrame([
        {'stage': 'LoadingPhase', 'experiment': 'operations-first', 'time': time_taken},
    ])
    performance_results = pd.concat([performance_results, new_row], ignore_index=True)

KeyboardInterrupt: 

### Baseline Scenario

In a common database scenario, the records would simply be inserted as they were generated (in raw format). The user would have to deal with heterogeneity afterwards, in the querying phase. Therefore, for the loading phase, we only generate raw records and bulk insert them into the database. 

In [5]:
def third_scenario():
    """Time the baseline load: raw records bulk-inserted with PyMongo.

    A fresh data set is generated with a single version and its raw records
    are written with one `insert_many` call. No semantic operations are
    applied.

    Returns:
        float: elapsed seconds for creating the client and performing the
        bulk insert. Data generation and teardown are not included.
    """
    d = DatabaseGenerator()
    try:
        d.generate(number_of_records=number_of_records,
                   number_of_versions=1,
                   number_of_fields=number_of_fields,
                   number_of_values_in_domain=number_of_values_in_domain)

        # The raw record list is inserted directly, so no DataFrame copy of
        # the records is built here.
        start = time.perf_counter()
        client = MongoClient(host)
        try:
            client[d.database_name][d.collection_name].insert_many(d.records)
            elapsed = time.perf_counter() - start
        finally:
            # Release the connection pool opened for this run.
            client.close()
    finally:
        # Always tear the generator down, even when the insert fails.
        d.destroy()
    return elapsed

# Repeat the baseline load and record the duration of every run.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
for _ in range(number_of_tests):
    time_taken = third_scenario()
    new_row = pd.DataFrame([
        {'stage': 'LoadingPhase', 'experiment': 'baseline', 'time': time_taken},
    ])
    performance_results = pd.concat([performance_results, new_row], ignore_index=True)

## Querying tests

For the query tests, it does not matter which database (from the first loading phase or from the second) is used. Both of them possess the same number of records, fields and domain values. Let's now analyse statistics in six different scenarios, just as in the YCSB benchmark.

In [6]:
def generate_test_database_preinsert():
    """Build a test database by inserting all records before the operations.

    Follows the same steps as the insert-first loading scenario, without
    timing: generate a single-version data set, bulk-insert it, generate
    the remaining versions and execute every semantic operation.

    Returns:
        DatabaseGenerator: the populated generator. The caller is
        responsible for calling `destroy()` on it.
    """
    d = DatabaseGenerator()
    d.generate(number_of_records=number_of_records,
               number_of_versions=1,
               number_of_fields=number_of_fields,
               number_of_values_in_domain=number_of_values_in_domain)
    records = pd.DataFrame(d.records)

    d.collection.insert_many_by_dataframe(records, 'valid_from_date')

    # generate() already produced version 1; derive the remaining count from
    # the notebook-level setting instead of repeating a literal.
    for _ in range(number_of_versions - 1):
        d.generate_version()

    for operation_type, version_date, arguments in d.operations:
        d.collection.execute_operation(operation_type, version_date, arguments)

    return d

In order to correctly compare performance between the developed system and the baseline (an ordinary document-oriented database), this test performs a query rewriting before starting the time counter. It is important to notice, however, that this test compares time performance only, regardless of the usability gains achieved by this system. This query rewriting would otherwise have to be performed manually by the user, which would also cost time.

In [7]:
def rewrite_query_baseline(query, database_generator):
    """Walk the semantic operations relevant to *query* (rewriting unfinished).

    This is still a stub: it validates the operations and finds the
    translations that touch a field used by the query, but it does not yet
    build or return a rewritten query.

    Args:
        query: mapping whose keys are the queried field names.
        database_generator: either an object exposing an `operations`
            attribute, or directly an iterable of
            `(operation_type, version_date, arguments)` tuples.

    Returns:
        None. No rewritten query is produced yet.

    Raises:
        ValueError: if an operation type is neither 'grouping' nor
            'translation'.
        KeyError: if a translation lacks 'from', 'to' or 'field'.
    """
    # Accept the generator object as well as a bare iterable of operations.
    operations = getattr(database_generator, 'operations', database_generator)

    for operation_type, _version_date, arguments in operations:
        if operation_type == 'grouping':
            # Grouping operations are not handled yet.
            continue
        if operation_type != 'translation':
            # ValueError instead of BaseException, so that ordinary
            # `except Exception` handlers can see the failure.
            raise ValueError('OperationType unknown')

        # Read all three keys up front so a malformed translation fails
        # with KeyError even when its field is not part of the query.
        _old_value = arguments['from']
        _new_value = arguments['to']
        field = arguments['field']

        if field not in query:
            continue

        # TODO: combine the old and new values of `field` into the
        # rewritten query and return it once the loop finishes.

In [8]:
def generate_test_database_postinsert():
    """Build a test database by applying the operations before most inserts.

    Follows the same steps as the operations-first loading scenario,
    without timing: insert a small initial batch, generate the remaining
    versions, execute every semantic operation and then insert all records
    that were not part of the initial batch.

    Returns:
        DatabaseGenerator: the populated generator. The caller is
        responsible for calling `destroy()` on it.
    """
    initial_batch_size = 10

    d = DatabaseGenerator()
    d.generate(number_of_records=number_of_records,
               number_of_versions=1,
               number_of_fields=number_of_fields,
               number_of_values_in_domain=number_of_values_in_domain)
    records = pd.DataFrame(d.records)

    # Initial insert of the first few records.
    d.collection.insert_many_by_dataframe(
        records.iloc[:initial_batch_size], 'valid_from_date')

    # generate() already produced version 1.
    for _ in range(number_of_versions - 1):
        d.generate_version()

    for operation_type, version_date, arguments in d.operations:
        d.collection.execute_operation(operation_type, version_date, arguments)

    # Insert everything AFTER the initial batch. The previous
    # `records.head(-10)` selected all rows except the LAST ten, which
    # inserted the initial batch twice and never loaded the final rows.
    remaining = records.iloc[initial_batch_size:]
    if not remaining.empty:
        d.collection.insert_many_by_dataframe(remaining, 'valid_from_date')

    return d

In [9]:
def update_and_read_test(percent_of_update):
    """Time a mixed insert/query workload on the prototype and on plain MongoDB.

    A shuffled sequence of 100 operations is built, of which
    `floor(100 * percent_of_update)` are inserts of freshly generated
    records and the rest are single-field equality queries. The same
    sequence, records and queries are replayed first against the
    pre-inserted prototype collection and then against a plain PyMongo
    collection loaded with the original records.

    Args:
        percent_of_update: fraction (0 to 1) of the operations that are
            inserts.

    Returns:
        dict: `{'preinserted': seconds, 'baseline': seconds}`.
    """
    total_operations = 100

    d = generate_test_database_preinsert()
    original_records = d.records.copy()

    updates = math.floor(total_operations * percent_of_update)
    reads = total_operations - updates

    # True marks an insert, False marks a query.
    sequence = [True] * updates + [False] * reads
    random.shuffle(sequence)

    records = [d.generate_record() for _ in range(updates)]
    records_2 = records.copy()

    queries = []
    for _ in range(reads):
        field = random.choice(d.fields)[0]
        value = random.choice(d.field_domain[field])
        queries.append({field: value})
    queries_2 = queries.copy()

    # Prototype run
    try:
        start = time.perf_counter()
        for is_insert in sequence:
            if is_insert:
                # In our use cases deletions and updates do not make much
                # sense, so the write operation is always an insert.
                record = records.pop()
                d.collection.insert_one(json.dumps(record, default=str),
                                        record['valid_from_date'])
            else:
                # The presence or absence of indexes is not considered.
                d.collection.find_many(queries.pop())
        preinsered_time = time.perf_counter() - start
    finally:
        # Always tear the generator down, even when the workload fails.
        d.destroy()
    print("Operations time: " + str(preinsered_time))

    # Baseline run (plain MongoDB)
    client = MongoClient(host)
    try:
        base_collection = client[d.database_name][d.collection_name]
        base_collection.insert_many(original_records)

        start = time.perf_counter()
        for is_insert in sequence:
            if is_insert:
                base_collection.insert_one(records_2.pop())
            else:
                # find() only returns a lazy cursor; consuming it is what
                # actually sends the query to the server, so without list()
                # the baseline reads would cost almost nothing.
                # NOTE(review): the original author remarked that reusing
                # the same query here is not entirely right and that a new
                # (rewritten) query should be generated for the baseline.
                list(base_collection.find(queries_2.pop()))
        baseline_time = time.perf_counter() - start
    finally:
        client.drop_database(d.database_name)
        client.close()

    return {'preinserted': preinsered_time, 'baseline': baseline_time}
    

### Scenario 0 - Full Insertion - Lazy insertion

In [10]:
# Insert-only workload (percent_of_update = 1). Each run adds one baseline
# row and one prototype row.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
for _ in range(number_of_tests):
    time_taken = update_and_read_test(1)
    new_rows = pd.DataFrame([
        {'stage': 'Full Insertion', 'experiment': 'baseline', 'time': time_taken['baseline']},
        {'stage': 'Full Insertion', 'experiment': 'insert-first', 'time': time_taken['preinserted']},
    ])
    performance_results = pd.concat([performance_results, new_rows], ignore_index=True)

KeyboardInterrupt: 

In [11]:
# Display the results collected so far.
performance_results

Unnamed: 0,stage,experiment,time
0,Full Insertion,baseline,0.049164
1,Full Insertion,insert-first,1.414969
2,Full Insertion,baseline,0.046623
3,Full Insertion,insert-first,1.588824
4,Full Insertion,baseline,0.045061
5,Full Insertion,insert-first,1.540153
6,Full Insertion,baseline,0.047287
7,Full Insertion,insert-first,1.487179
8,Full Insertion,baseline,0.047101
9,Full Insertion,insert-first,1.699324


### Scenario 0 - Full Insertion - Not lazy insertion

In [None]:
# Insert-only workload, intended for the non-lazy variant.
# NOTE(review): update_and_read_test is called with a second positional
# argument, but the definition visible in this notebook accepts only
# `percent_of_update`, so this call raises TypeError as written. The flag
# presumably selects non-lazy record evolution — confirm and implement it.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
for _ in range(number_of_tests):
    time_taken = update_and_read_test(1,False)
    new_rows = pd.DataFrame([
        {'stage': 'Full Insertion Nonlazy', 'experiment': 'baseline', 'time': time_taken['baseline']},
        {'stage': 'Full Insertion Nonlazy', 'experiment': 'insert-first', 'time': time_taken['preinserted']},
    ])
    performance_results = pd.concat([performance_results, new_rows], ignore_index=True)

### Scenario 1 - 50/50

The first update scenario uses a workload of 50% reads and 50% writes.

Note there is a difference between the test using the first generation method and the second one, due to lazy records evolution adopted in the prototype.

In [None]:
# Balanced workload: half inserts, half queries. Each run adds one baseline
# row and one prototype row.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
for _ in range(number_of_tests):
    time_taken = update_and_read_test(0.5)
    new_rows = pd.DataFrame([
        {'stage': '50/50', 'experiment': 'baseline', 'time': time_taken['baseline']},
        {'stage': '50/50', 'experiment': 'insert-first', 'time': time_taken['preinserted']},
    ])
    performance_results = pd.concat([performance_results, new_rows], ignore_index=True)

Full Insertion (no query)

In [None]:
# Insert-only workload (percent_of_update = 1, no queries).
# The rows were previously labelled '50/50', a copy-paste slip that mixed
# insert-only timings into the balanced-workload results; they now carry
# the 'Full Insertion' stage label.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
for _ in range(number_of_tests):
    time_taken = update_and_read_test(1)
    new_rows = pd.DataFrame([
        {'stage': 'Full Insertion', 'experiment': 'baseline', 'time': time_taken['baseline']},
        {'stage': 'Full Insertion', 'experiment': 'insert-first', 'time': time_taken['preinserted']},
    ])
    performance_results = pd.concat([performance_results, new_rows], ignore_index=True)

### Scenario 2 - Read Heavy

In this scenario, 95% of operations executed are read operations, while 5% are write operations (insertion).

In [None]:
# Read-heavy workload: 5% inserts, 95% queries. Each run adds one baseline
# row and one prototype row.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
for _ in range(number_of_tests):
    time_taken = update_and_read_test(0.05)
    new_rows = pd.DataFrame([
        {'stage': 'ReadHeavy', 'experiment': 'baseline', 'time': time_taken['baseline']},
        {'stage': 'ReadHeavy', 'experiment': 'insert-first', 'time': time_taken['preinserted']},
    ])
    performance_results = pd.concat([performance_results, new_rows], ignore_index=True)

### Scenario 3 - WriteHeavy

In this scenario, 95% of operations executed are write operations, while 5% are read operations.

In [None]:
# Write-heavy workload: 95% inserts, 5% queries. Each run adds one baseline
# row and one prototype row.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
for _ in range(number_of_tests):
    time_taken = update_and_read_test(0.95)
    new_rows = pd.DataFrame([
        {'stage': 'WriteHeavy', 'experiment': 'baseline', 'time': time_taken['baseline']},
        {'stage': 'WriteHeavy', 'experiment': 'insert-first', 'time': time_taken['preinserted']},
    ])
    performance_results = pd.concat([performance_results, new_rows], ignore_index=True)

In [None]:
# Display every timing collected by the cells above.
performance_results