<div style="margin-top:1px; margin-bottom:1px; padding:1px; background-color:#0f6319; font-family:Calibri, sans-serif; font-size:18px; font-weight:bold; color:#ffffff;">
    &nbsp;&sect;&nbsp;&nbsp;Initialize</div>

In [1]:
# Import libraries
import os
import time
import threading
import pandas
import ads
from tqdm import tqdm
from dotenv import load_dotenv
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
from IPython.display import display, display_html, clear_output
# Global settings
pandas.options.display.max_columns = None
pandas.options.display.max_rows = 100

In [2]:
# Helpers
def run_timer_s(method):
    """Decorate ``method`` so each call prints one static timing line.

    The line has the form ``Running <name>(<label>) ... Done (<seconds> s)``.
    ``<label>`` is the first positional argument of the call, or the ``label``
    keyword argument when the call has no positional arguments.

    Args:
        method: Callable to wrap.

    Returns:
        A wrapper with the same signature that returns whatever ``method`` returns.
        If ``method`` raises, the exception propagates and ``Done`` is not printed.
    """
    import functools  # local import keeps this helper self-contained

    @functools.wraps(method)  # preserve __name__/__doc__ of the wrapped function
    def wrapper(*args, **kw):
        # Do not assume a positional label: args[0] raised IndexError on keyword-only calls.
        label = args[0] if args else kw.get('label', '')
        print(f'Running {method.__name__}({label}) ... '.ljust(66), end='', flush=True)
        # perf_counter is monotonic, so the duration cannot be skewed by clock changes.
        timer_start = time.perf_counter()
        result = method(*args, **kw)
        timer_end = time.perf_counter()
        print(f'Done ({(timer_end - timer_start):.1f} s)')
        return result
    return wrapper
def run_timer_d(method):
    """Decorate ``method`` so each call shows a live, self-updating elapsed-time line.

    While ``method`` runs, a background thread clears the cell output and reprints
    ``Running <name>(<label>) ... <elapsed> s`` about every 0.1 s. On success the
    line becomes ``... Done (<elapsed> s)``. If the call raises (including
    ``KeyboardInterrupt`` when the cell is interrupted) the line ends with
    ``Failed (<elapsed> s)``, the ticker thread is stopped, and the exception
    propagates.

    ``<label>`` is the first positional argument of the call, or the ``label``
    keyword argument when the call has no positional arguments.

    Note: every refresh calls ``clear_output``, so other output already printed
    in the same cell is wiped as well.

    Args:
        method: Callable to wrap.

    Returns:
        A wrapper with the same signature that returns whatever ``method`` returns.
    """
    import functools  # local import keeps this helper self-contained

    class MyTimer:
        """Background ticker that reprints the elapsed time until stopped."""

        def __init__(self, label):
            self.label = label
            self.start_time = None
            self.elapsed_time = None
            self.is_running = False
            self.thread = None
            # Event lets stop() wake the ticker immediately instead of waiting out a sleep.
            self._stop_event = threading.Event()

        def _line(self, suffix):
            # Single place that builds the fixed-width status line.
            return f'Running {self.label} ... '.ljust(66) + suffix

        def start(self):
            if self.is_running:
                return
            self.start_time = time.time()
            self.is_running = True
            self._stop_event.clear()
            print(self._line('0.0 s'))

            def update_timer():
                # wait() doubles as the 0.1 s refresh delay and returns True once stop() fires.
                while not self._stop_event.wait(0.1):
                    elapsed_time = time.time() - self.start_time
                    clear_output(wait=True)
                    print(self._line(f'{elapsed_time:.1f} s'))

            # Daemon thread: a ticker must never keep the interpreter alive on its own.
            self.thread = threading.Thread(target=update_timer, daemon=True)
            self.thread.start()

        def stop(self, status='Done'):
            if not self.is_running:
                return
            self.elapsed_time = time.time() - self.start_time
            self.is_running = False
            self._stop_event.set()
            self.thread.join()
            clear_output(wait=True)
            print(self._line(f'{status} ({self.elapsed_time:.1f} s)'))

    @functools.wraps(method)  # preserve __name__/__doc__ of the wrapped function
    def wrapper(*args, **kw):
        # Do not assume a positional label: args[0] raised IndexError on keyword-only calls.
        label = args[0] if args else kw.get('label', '')
        timer = MyTimer(label=f'{method.__name__}({label})')
        timer.start()
        try:
            result = method(*args, **kw)
        except BaseException:
            # Without this the ticker kept running forever after an error or an
            # interrupted cell, printing into whatever cell ran next.
            timer.stop(status='Failed')
            raise
        timer.stop()
        return result
    return wrapper
@run_timer_d
def check_compute_performance(label, scale):
    """Pure-Python CPU benchmark: sum ``i * j`` over all pairs in ``range(scale)`` squared.

    The accumulated sum is discarded and the function returns ``None``; only
    the elapsed time reported by the ``run_timer_d`` decorator is of interest.
    ``label`` is not used in the body (the decorator displays it).
    """
    total = 0
    for outer in range(scale):
        for inner in range(scale):
            total += outer * inner

In [3]:
# Check resources: time a 10000 x 10000 pure-Python loop.
# The first argument is only a display label for the timer line
# (presumably the compute shape of the host -- TODO confirm).
check_compute_performance('VM.Standard2.16', 10000)

Running check_compute_performance(VM.Standard2.16) ...            Done (19.0 s)


<div style="margin-top:1px; margin-bottom:1px; padding:1px; background-color:#0f6319; font-family:Calibri, sans-serif; font-size:18px; font-weight:bold; color:#ffffff;">
    &nbsp;&sect;&nbsp;&nbsp;Get data (from database)</div>

In [4]:
@run_timer_s
def get_table_from_db(label, data, table, connection):
    """Read every row of ``table`` into a DataFrame whose index starts at 1.

    ``label`` is only displayed by the ``run_timer_s`` decorator. The incoming
    ``data`` argument is ignored; a freshly read DataFrame is returned.
    ``connection`` is passed to ADS as ``connection_parameters``.
    """
    frame = pandas.DataFrame.ads.read_sql(f'SELECT * FROM {table}', connection_parameters=connection)
    # Replace the default 0-based index with 1..n.
    frame.index = range(1, len(frame) + 1)
    return frame
def get_tables_from_db_s(label, data, tables, connection):
    """Load each table in ``tables`` via ``get_table_from_db`` and store it in ``data``.

    ``data`` is mutated in place (one entry per table name) and also returned.
    Each table name doubles as the label shown by the per-table timer;
    ``label`` itself is not used.
    """
    for name in tables:
        loaded = get_table_from_db(name, data, name, connection)
        data[name] = loaded
    return data
@run_timer_d
def get_tables_from_db_d(label, data, tables, connection):
    """Load each table in ``tables`` into ``data`` under one live timer for the whole batch.

    ``data`` is mutated in place (one 1-indexed DataFrame per table name) and
    also returned. ``label`` is only displayed by the ``run_timer_d`` decorator.
    """
    for name in tables:
        frame = pandas.DataFrame.ads.read_sql(f'SELECT * FROM {name}', connection_parameters=connection)
        # Replace the default 0-based index with 1..n.
        frame.index = range(1, frame.shape[0] + 1)
        data[name] = frame
    return data

In [5]:
# Load '.env' file (database credentials are read from it into the process environment)
status = load_dotenv('env/adb.env')
# One print call; the message is selected by the load status.
print('Environment variables loaded successfully.' if status else 'Environment variables failed to load.')

Environment variables loaded successfully.


In [6]:
# Initialize
tables = ['TF_COMPANIES','TF_TRANSACTIONS']
data_0 = {}
# Connection settings come from the environment; a missing variable yields None here.
connection = {
    'user_name': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'service_name': os.getenv('DB_SERVICE_NAME'),
    'wallet_location': f'{os.getcwd()}/env/Wallet_ADWWEST.zip' }
# Get data (the loader fills data_0 in place and returns it, so `data` is the same dict as data_0)
data = get_tables_from_db_d('All', data_0, tables, connection)

Running get_tables_from_db_d(All) ...                             Done (2.7 s)


In [7]:
# Helpers
def display_double(df_1, df_2, head):
    """Render the first ``head`` rows of two DataFrames side by side as inline HTML.

    ``df_1`` must contain a ``COMPANYID`` column and ``df_2`` a ``TRANSACTIONID``
    column; those columns and all header cells are left-aligned.

    NOTE: later cells in this notebook redefine ``display_double`` with
    different column subsets, so whichever definition ran last is the active one.
    """
    header_left = [dict(selector='th', props=[('text-align', 'left')])]
    df_1_styler = df_1.head(head).style.set_table_attributes("style='display:inline'")
    df_1_styler = df_1_styler.set_properties(subset=['COMPANYID'], **{'text-align': 'left'}).set_table_styles(header_left)
    df_2_styler = df_2.head(head).style.set_table_attributes("style='display:inline'")
    df_2_styler = df_2_styler.set_properties(subset=['TRANSACTIONID'], **{'text-align': 'left'}).set_table_styles(header_left)
    # The spacer <div> around the second table was never closed; emit well-formed markup.
    display_html(
        f'{df_1_styler._repr_html_()}'
        f'<div style="display:inline; margin:36px;">{df_2_styler._repr_html_()}</div>',
        raw=True)
# Show both raw tables side by side: at most 10 rows, fewer if either table is shorter
display_double(data_0['TF_COMPANIES'], data_0['TF_TRANSACTIONS'], min(data_0['TF_COMPANIES'].shape[0],data_0['TF_TRANSACTIONS'].shape[0],10))

Unnamed: 0,COMPANYID,COMPANYNAME
1,C310023084,Company #0079
2,C310028811,Company #0080
3,C310093693,Company #0081
4,C310176782,Company #0082
5,C310180199,Company #0083
6,C310202267,Company #0084
7,C310282839,Company #0085
8,C310308434,Company #0086
9,C310381605,Company #0087
10,C310460978,Company #0088

Unnamed: 0,TRANSACTIONID,TRANSACTIONDATE,COMPANYFROMID,COMPANYTOID,TRANSACTIONAMOUNT
1,T643859324,10.04.2017,C306634688,C306778540,59833
2,T643859349,18.04.2017,C306558208,C306634688,22335
3,T643859398,01.04.2017,C306634688,C306384932,31497
4,T643859453,20.04.2017,C306592341,C306634688,88868
5,T643859481,12.04.2017,C306487644,C306519118,47103
6,T643859508,23.04.2017,C306558208,C306634688,55494
7,T643859520,17.04.2017,C306634688,C306384932,33881
8,T643859558,17.04.2017,C306519118,C306558208,51635
9,T643859656,12.04.2017,C306519118,C306487644,93340
10,T643859660,18.04.2017,C306592341,C306714099,18456


<div style="margin-top:1px; margin-bottom:1px; padding:1px; background-color:#0f6319; font-family:Calibri, sans-serif; font-size:18px; font-weight:bold; color:#ffffff;">
    &nbsp;&sect;&nbsp;&nbsp;Explore data (pre-)</div>

In [8]:
# Helpers
def get_shape_for_table(data_shape, data, table):
    """Return ``data_shape`` extended by one row describing ``data[table]``.

    Args:
        data_shape: DataFrame of accumulated rows (columns TABLE, ROWS, COLUMNS).
        data: Mapping of table name to DataFrame.
        table: Key of the table to describe.

    Returns:
        A new DataFrame; ``data_shape`` itself is not modified.
    """
    frame = data[table]
    row = pandas.DataFrame([{
        'TABLE': table, 'ROWS': frame.shape[0], 'COLUMNS': frame.shape[1]}])
    if data_shape.empty:
        # Nothing to concatenate yet: return the row alone, keeping the caller's
        # column order (avoids concatenating with an empty frame).
        return row.reindex(columns=data_shape.columns.union(row.columns, sort=False))
    # DataFrame.append was removed in pandas 2.0; concat is the supported replacement.
    return pandas.concat([data_shape, row], ignore_index=True)
def get_shape_for_tables(data, tables):
    """Build a 1-indexed summary (TABLE, ROWS, COLUMNS) with one row per entry of ``tables``.

    Progress is shown with a tqdm bar labelled ``S``.
    """
    progress = tqdm(tables, unit=' it')
    summary = pandas.DataFrame(columns=['TABLE','ROWS','COLUMNS'])
    for name in progress:
        progress.set_description_str('S', refresh=True)
        summary = get_shape_for_table(summary, data, name)
    # Number the rows from 1 instead of 0.
    summary.index = range(1, len(summary) + 1)
    return summary
def get_content_for_table(data_content, data, table):
    """Return ``data_content`` extended by one profile row per column of ``data[table]``.

    Each row holds: TABLE, COLUMN, VALUES (row count), NULLS, NOTNULLS,
    DISTINCT (``nunique``) and TOPVALUES, a string listing up to five most
    frequent values as ``value : count (percent%), `` with nulls shown as ``NULL``.

    Args:
        data_content: DataFrame of accumulated profile rows.
        data: Mapping of table name to DataFrame.
        table: Key of the table to profile.

    Returns:
        A new DataFrame, or ``data_content`` itself when the table has no columns.
    """
    frame = data[table]
    count = frame.shape[0]  # identical for every column, so computed once
    records = []
    for column in frame.columns:
        series = frame[column]
        nulls = series.isna().sum()
        top = series.value_counts(dropna=False).head(5)
        # Every entry, including the last, keeps its trailing ', ' separator.
        values_top = ''.join(
            f"{'NULL' if pandas.isna(val) else str(val)} : {str(cnt)} ({str(round((100*cnt/count), 1))}%), "
            for val, cnt in top.items())
        records.append({
            'TABLE': table, 'COLUMN': column, 'VALUES': count, 'NULLS': nulls,
            'NOTNULLS': count - nulls, 'DISTINCT': series.nunique(), 'TOPVALUES': values_top})
    if not records:
        return data_content
    rows = pandas.DataFrame(records)
    if data_content.empty:
        # Nothing to concatenate yet: return the new rows alone, keeping the
        # caller's column order (avoids concatenating with an empty frame).
        return rows.reindex(columns=data_content.columns.union(rows.columns, sort=False))
    # DataFrame.append was removed in pandas 2.0; one concat per table replaces
    # the former per-column appends.
    return pandas.concat([data_content, rows], ignore_index=True)
def get_content_for_tables(data, tables):
    """Build a 1-indexed per-column profile covering every entry of ``tables``.

    Columns: TABLE, COLUMN, VALUES, NULLS, NOTNULLS, DISTINCT, TOPVALUES.
    Progress is shown with a tqdm bar labelled ``C``.
    """
    progress = tqdm(tables, unit=' it')
    profile = pandas.DataFrame(columns=['TABLE','COLUMN','VALUES','NULLS','NOTNULLS','DISTINCT','TOPVALUES'])
    for name in progress:
        progress.set_description_str('C', refresh=True)
        profile = get_content_for_table(profile, data, name)
    # Number the rows from 1 instead of 0.
    profile.index = range(1, len(profile) + 1)
    return profile

In [9]:
# Explore shape: one row per table (row and column counts) for the data as loaded
data_shape_0 = get_shape_for_tables(data_0, tables)
# Explore content: one row per column (null/distinct counts and top values) for the data as loaded
data_content_0 = get_content_for_tables(data_0, tables)

S: 100%|██████████| 2/2 [00:00<00:00, 266.36 it/s]
C: 100%|██████████| 2/2 [00:00<00:00, 61.73 it/s]


In [10]:
# Helpers
def display_single(df, head):
    """Render the first ``head`` rows of ``df`` as an inline HTML table.

    ``df`` must contain TABLE, COLUMN and TOPVALUES columns; those columns and
    all header cells are left-aligned.
    """
    header_left = [dict(selector='th',props=[('text-align','left')])]
    styler = df.head(head).style
    styler = styler.set_table_attributes("style='display:inline'")
    styler = styler.set_properties(subset=['TABLE','COLUMN','TOPVALUES'], **{'text-align':'left'})
    styler = styler.set_table_styles(header_left)
    display_html(styler._repr_html_(), raw=True)
def display_double(df_1, df_2, head):
    """Render the first ``head`` rows of the shape and content summaries side by side.

    ``df_1`` must contain a TABLE column; ``df_2`` must contain TABLE, COLUMN
    and TOPVALUES columns. Those columns and all header cells are left-aligned.

    NOTE: this redefines the ``display_double`` from an earlier cell, which
    styles different columns; whichever definition ran last is the active one.
    """
    header_left = [dict(selector='th', props=[('text-align', 'left')])]
    df_1_styler = df_1.head(head).style.set_table_attributes("style='display:inline'")
    df_1_styler = df_1_styler.set_properties(subset=['TABLE'], **{'text-align': 'left'}).set_table_styles(header_left)
    df_2_styler = df_2.head(head).style.set_table_attributes("style='display:inline'")
    df_2_styler = df_2_styler.set_properties(subset=['TABLE','COLUMN','TOPVALUES'], **{'text-align': 'left'}).set_table_styles(header_left)
    # The spacer <div> around the second table was never closed; emit well-formed markup.
    display_html(
        f'{df_1_styler._repr_html_()}'
        f'<div style="display:inline; margin:36px;">{df_2_styler._repr_html_()}</div>',
        raw=True)
# Show both summaries in full: head is the larger of the two row counts, so neither is truncated
display_double(data_shape_0, data_content_0, max(data_shape_0.shape[0],data_content_0.shape[0]))

Unnamed: 0,TABLE,ROWS,COLUMNS
1,TF_COMPANIES,200,2
2,TF_TRANSACTIONS,1000,5

Unnamed: 0,TABLE,COLUMN,VALUES,NULLS,NOTNULLS,DISTINCT,TOPVALUES
1,TF_COMPANIES,COMPANYID,200,0,200,200,"C310023084 : 1 (0.5%), C307110381 : 1 (0.5%), C306592341 : 1 (0.5%), C306634688 : 1 (0.5%), C306714099 : 1 (0.5%),"
2,TF_COMPANIES,COMPANYNAME,200,0,200,200,"Company #0079 : 1 (0.5%), Company #0016 : 1 (0.5%), Company #0006 : 1 (0.5%), Company #0007 : 1 (0.5%), Company #0008 : 1 (0.5%),"
3,TF_TRANSACTIONS,TRANSACTIONID,1000,0,1000,1000,"T643859324 : 1 (0.1%), T643885752 : 1 (0.1%), T643885112 : 1 (0.1%), T643885188 : 1 (0.1%), T643885192 : 1 (0.1%),"
4,TF_TRANSACTIONS,TRANSACTIONDATE,1000,0,1000,31,"01.04.2017 : 41 (4.1%), 12.04.2017 : 41 (4.1%), 23.04.2017 : 40 (4.0%), 08.04.2017 : 38 (3.8%), 04.04.2017 : 37 (3.7%),"
5,TF_TRANSACTIONS,COMPANYFROMID,1000,0,1000,200,"C309440200 : 14 (1.4%), C307110381 : 13 (1.3%), C315346233 : 13 (1.3%), C306634688 : 11 (1.1%), C309113624 : 11 (1.1%),"
6,TF_TRANSACTIONS,COMPANYTOID,1000,0,1000,200,"C315734991 : 10 (1.0%), C306429876 : 9 (0.9%), C313862631 : 9 (0.9%), C307548053 : 9 (0.9%), C307894134 : 9 (0.9%),"
7,TF_TRANSACTIONS,TRANSACTIONAMOUNT,1000,0,1000,995,"5475 : 2 (0.2%), 24014 : 2 (0.2%), 91934 : 2 (0.2%), 52640 : 2 (0.2%), 9048 : 2 (0.2%),"


In [13]:
# Helpers
def save_summary_to_excel(exports, path):
    """Write every DataFrame in ``exports`` to its own sheet of the workbook at ``path``.

    ``exports`` maps sheet name to DataFrame; the DataFrame index is not written.
    """
    with pandas.ExcelWriter(path) as workbook:
        for sheet_name in exports:
            exports[sheet_name].to_excel(workbook, sheet_name=sheet_name, index=False)

In [14]:
# Initialize: sheet '!' holds the shape summary, then one sheet of column profiles per table
exports = {
    '!': data_shape_0,
    'TF_COMPANIES': data_content_0.loc[data_content_0['TABLE'] == 'TF_COMPANIES'],
    'TF_TRANSACTIONS': data_content_0.loc[data_content_0['TABLE'] == 'TF_TRANSACTIONS'] }
# Save summary (written into the current working directory)
save_summary_to_excel(exports, f'{os.getcwd()}/data.0.xlsx')

<div style="margin-top:1px; margin-bottom:1px; padding:1px; background-color:#0f6319; font-family:Calibri, sans-serif; font-size:18px; font-weight:bold; color:#ffffff;">
    &nbsp;&sect;&nbsp;&nbsp;Enhance data (with GAN)</div>

In [None]:
# Helpers
@run_timer_d
def get_enhanced_data(label, df, rows):
    """Grow ``df`` to ``rows`` rows by appending CTGAN-synthesized rows.

    A ``CTGANSynthesizer`` (100 epochs, metadata auto-detected from ``df``) is
    fitted on ``df`` and sampled for the missing ``rows - len(df)`` rows. When
    ``df`` already has at least ``rows`` rows, no model is trained and nothing
    is appended.

    In all cases the result is re-indexed 1..n and its ``TRANSACTIONID`` column
    is overwritten for every row (original rows included) with ``'T6438'``
    followed by the zero-padded row number.
    NOTE(review): regenerated IDs are 11 characters, while the sample source
    IDs shown earlier in the notebook are 10 -- confirm this is intended.

    Args:
        label: Only displayed by the ``run_timer_d`` decorator.
        df: Source DataFrame; it is not modified.
        rows: Target total number of rows.

    Returns:
        A new DataFrame with the original rows followed by the synthetic ones.
    """
    extra_rows = rows - df.shape[0]
    frames = [df]
    if extra_rows > 0:
        # Train and sample only when rows are actually missing; the original
        # fitted the model and then requested a non-positive sample count.
        metadata = SingleTableMetadata()
        metadata.detect_from_dataframe(data=df)
        synthesizer = CTGANSynthesizer(metadata, epochs=100)
        synthesizer.fit(df)
        frames.append(synthesizer.sample(num_rows=extra_rows))
    df_enh = pandas.concat(frames)  # always a new frame, so df stays untouched
    df_enh.index = range(1, df_enh.shape[0]+1)
    df_enh['TRANSACTIONID'] = 'T6438' + (df_enh.index).astype(str).str.zfill(6)
    return df_enh

In [None]:
# Initialize
data_1 = {}
# Run
# Companies are carried over unchanged: data_1 and data_0 reference the same DataFrame object.
data_1['TF_COMPANIES'] = data_0['TF_COMPANIES']
# Transactions are grown to 10000 rows in total with synthesized rows.
data_1['TF_TRANSACTIONS'] = get_enhanced_data('TF_TRANSACTIONS', data_0['TF_TRANSACTIONS'], 10000)

<div style="margin-top:1px; margin-bottom:1px; padding:1px; background-color:#0f6319; font-family:Calibri, sans-serif; font-size:18px; font-weight:bold; color:#ffffff;">
    &nbsp;&sect;&nbsp;&nbsp;Explore data (post-)</div>

In [None]:
# Explore shape: one row per table (row and column counts) for the enhanced data
data_shape_1 = get_shape_for_tables(data_1, tables)
# Explore content: one row per column (null/distinct counts and top values) for the enhanced data
data_content_1 = get_content_for_tables(data_1, tables)

In [None]:
# Helpers
def display_single(df, head):
    """Render the first ``head`` rows of ``df`` as an inline HTML table.

    ``df`` must contain TABLE, COLUMN and TOPVALUES columns; those columns and
    all header cells are left-aligned.

    NOTE: same behavior as the ``display_single`` defined in an earlier cell;
    this redefinition is redundant.
    """
    header_left = [dict(selector='th',props=[('text-align','left')])]
    styler = df.head(head).style
    styler = styler.set_table_attributes("style='display:inline'")
    styler = styler.set_properties(subset=['TABLE','COLUMN','TOPVALUES'], **{'text-align':'left'})
    styler = styler.set_table_styles(header_left)
    display_html(styler._repr_html_(), raw=True)
def display_double(df_1, df_2, head):
    """Render the first ``head`` rows of the shape and content summaries side by side.

    ``df_1`` must contain a TABLE column; ``df_2`` must contain TABLE, COLUMN
    and TOPVALUES columns. Those columns and all header cells are left-aligned.

    NOTE: ``display_double`` is already defined in earlier cells of this
    notebook; whichever definition ran last is the active one.
    """
    header_left = [dict(selector='th', props=[('text-align', 'left')])]
    df_1_styler = df_1.head(head).style.set_table_attributes("style='display:inline'")
    df_1_styler = df_1_styler.set_properties(subset=['TABLE'], **{'text-align': 'left'}).set_table_styles(header_left)
    df_2_styler = df_2.head(head).style.set_table_attributes("style='display:inline'")
    df_2_styler = df_2_styler.set_properties(subset=['TABLE','COLUMN','TOPVALUES'], **{'text-align': 'left'}).set_table_styles(header_left)
    # The spacer <div> around the second table was never closed; emit well-formed markup.
    display_html(
        f'{df_1_styler._repr_html_()}'
        f'<div style="display:inline; margin:36px;">{df_2_styler._repr_html_()}</div>',
        raw=True)
# Show both post-enhancement summaries in full: head is the larger of the two row counts
display_double(data_shape_1, data_content_1, max(data_shape_1.shape[0],data_content_1.shape[0]))

In [None]:
# Initialize: sheet '!' holds the shape summary, then one sheet of column profiles per table
exports = {
    '!': data_shape_1,
    'TF_COMPANIES': data_content_1.loc[data_content_1['TABLE'] == 'TF_COMPANIES'],
    'TF_TRANSACTIONS': data_content_1.loc[data_content_1['TABLE'] == 'TF_TRANSACTIONS'] }
# Save summary (written into the current working directory)
save_summary_to_excel(exports, f'{os.getcwd()}/data.1.xlsx')

<div style="margin-top:1px; margin-bottom:1px; padding:1px; background-color:#0f6319; font-family:Calibri, sans-serif; font-size:18px; font-weight:bold; color:#ffffff;">
    &nbsp;&sect;&nbsp;&nbsp;Save data (to database)</div>

In [None]:
@run_timer_s
def save_table_to_db(label, data, table, connection):
    """Write the DataFrame ``data`` to database table ``table`` with ``if_exists="replace"``.

    ``label`` is only displayed by the ``run_timer_s`` decorator. Returns ``None``.
    """
    ads_accessor = data.ads
    ads_accessor.to_sql(table, connection_parameters=connection, if_exists="replace")
def get_tables_to_db_s(label, data, tables, connection):
    """Save each table in ``tables`` to the database via ``save_table_to_db``.

    Despite the ``get_`` prefix, this function writes tables; the name is kept
    so existing callers continue to work.

    Args:
        label: Unused; each table name is used as its own timer label.
        data: Mapping of table name to DataFrame.
        tables: Names of the tables to save.
        connection: Connection parameters forwarded to ``save_table_to_db``.
    """
    for table in tables:
        # Label first, then that table's DataFrame. The original passed the
        # DataFrame as the label and the whole dict as the data, which failed
        # as soon as ``.ads`` was accessed on the dict.
        save_table_to_db(table, data[table], table, connection)
@run_timer_d
def save_tables_to_db_d(label, data, tables, connection):
    """Save each table in ``tables`` to the database under one live timer for the whole batch.

    Args:
        label: Only displayed by the ``run_timer_d`` decorator.
        data: Mapping of table name to DataFrame.
        tables: Names of the tables to save.
        connection: Connection parameters passed to ADS as ``connection_parameters``.
    """
    for table in tables:
        # Replace existing tables, matching save_table_to_db. The tables written
        # here are the ones read earlier, so they already exist in the database.
        data[table].ads.to_sql(table, connection_parameters=connection, if_exists="replace")

Running check_compute_performance(VM.Standard2.16) ...            16.9 s


In [None]:
# Write the enhanced transactions back to TF_TRANSACTIONS; save_table_to_db replaces the existing table
save_table_to_db('TF_TRANSACTIONS', data_1['TF_TRANSACTIONS'], 'TF_TRANSACTIONS', connection)

<div style="margin-top:1px; margin-bottom:1px; padding:1px; background-color:#0f6319; font-family:Calibri, sans-serif; font-size:18px; font-weight:bold; color:#ffffff;">
    &nbsp;&sect;&nbsp;&nbsp;WIP</div>