#### Вспомогательные функции для обработки строк

In [1]:
#helper functions

def get_type_of_string_value(str_value, is_nullable):
    if str_value == '':
        return None
    try:
        int(str_value)
        return 'Nullable(Int)' if is_nullable else 'Int'
    except ValueError:
        pass
    try:
        float(str_value)
        return 'Nullable(Float)' if is_nullable else 'Int'
    except ValueError:
        pass
    return 'Nullable(String)' if is_nullable else 'Int'


def get_value_or_null(value, expected_type):
    if value == '':
        return 'Null'
    elif expected_type in (int, float):
        try:
            expected_type(value)
            return str(value)
        except ValueError:
            return 'Null'
    elif expected_type == str:
        return "'" + value + "'"


def remove_first_escape(str_value):
    if str_value.startswith('\n'):
        return str_value[1:]
    else:
        return str_value

    
def convert_list_to_dict(lst):
    res_dct = {lst[i]: lst[i + 1] for i in range(0, len(lst), 2)}
    return res_dct


#### Ручные корретировки (все строки с колонкой Seller-Type = 'Individual' внесены некорректно)

In [3]:
def apply_manual_adj_with_honda_dataset(row: dict):
    if row.get('Seller_Type', None) == 'Individual':
        row['Mileage'] = int(row['VIN'].split(' ')[0].replace(',', ''))
        row['VIN'] = row['Engine']
        row['Engine'] = row['Transmission']
        row['Transmission'] = row['Fuel_Type']
        row['Stock_#'] = ''


### Декоратор, замеряющий время работы

In [4]:
from functools import wraps
import time


def timeit(func):
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        total_time = end_time - start_time
        print('-' * 25)
        print(f'Function {func.__name__}{args} {kwargs} Took {total_time:.4f} seconds')
        print('-' * 25)
        return result
    return timeit_wrapper


##### *__get_column_types_from_csv__* возвращает словарь {имя колонки: тип колонки, ...}, основанный на csv файле
##### *__get_create_query_based_on_csv_file__* возвращает  sql-запрос в виде строки на создание таблицы
##### *__get_delete_query__* возвращает  sql-запрос в виде строки на удаление таблицы

In [5]:
import clickhouse_connect
import csv


def get_column_types_from_csv(path_to_csv_file: str,
                              order_column_names: list) -> dict:
    dict_of_column_types = dict()
    with open(path_to_csv_file, 'r') as file:
        for row in csv.DictReader(file):
            dict_of_column_types.update({column_name: get_type_of_string_value(value, is_nullable=(column_name not in order_column_names))
                                            for column_name, value in row.items()
                                            if column_name not in dict_of_column_types or value is not None})
            if None not in dict_of_column_types.values():
                break
    return dict_of_column_types


def get_create_query_based_on_csv_file(path_to_csv_file: str,
                                       table_name: str,
                                       order_column_names: list,
                                       table_engine='MergeTree',
                                       partition_column_names=[]) -> str:
    dict_of_column_types = get_column_types_from_csv(path_to_csv_file, order_column_names)
    string_of_column_names_and_types = ", ".join(['`' + column_name + '` ' + column_type
                                                    for column_name, column_type in dict_of_column_types.items()])
    order_by_part = '' if len(order_column_names) == 0 else 'ORDER BY ({})'.format(', '.join(list(map(lambda x: '`' + x + '`', order_column_names))))
    partition_by_part = '' if len(partition_column_names) == 0 else 'PARTITION BY ({})'.format(', '.join(partition_column_names))
    return ('''CREATE TABLE IF NOT EXISTS {} ({}) ENGINE = {} {} {};'''
        .format(table_name,
                string_of_column_names_and_types,
                table_engine,
                partition_by_part,
                order_by_part))


def get_delete_query(table_name: str) -> str:
    return "DROP TABLE IF EXISTS {};".format(table_name)


##### *__get_column_types_from_table__* возвращает словарь {имя колонки: тип колонки, ...}, основанный таблице

In [6]:
type_mapper = {
               'Int32': int,
               'Float32': float,
               'String': str,
               'Nullable(Int32)': int,
               'Nullable(Float32)': float,
               'Nullable(String)': str
              }


def get_column_types_from_table(client: clickhouse_connect.driver.httpclient.HttpClient,
                                table_name: str) -> dict:
    query_result = client.command('DESCRIBE TABLE {};'.format(table_name))
    query_result_adapted = map(remove_first_escape, (filter(lambda x : x != '', query_result)))
    list_of_column_names_and_types = list(map(lambda x: type_mapper.get(x, x), query_result_adapted))
    return convert_list_to_dict(list_of_column_names_and_types)


##### *__do_insert__* осуществляет вставку строк из csv в таблицу

In [15]:
import pandas as pd
from string import Template
from overrides import override
from abc import ABC, abstractmethod, ABCMeta
from typing import Callable


class InserterInterface(metaclass=ABCMeta):
    insert_query_template = Template('INSERT INTO $table_name (*) VALUES ')
    
    @staticmethod
    @abstractmethod
    def insert(client: clickhouse_connect.driver.httpclient.HttpClient,
               path_to_csv_file: str,
               table_name: str,
               batch_size: int,
               column_types: dict,
               apply_manual_adj: Callable):
        pass


class PandasInserter(InserterInterface):
    @override
    def insert(client, path_to_csv_file, table_name, batch_size, column_types, apply_manual_adj=lambda: None):
        insert_query = InserterInterface.insert_query_template.substitute(table_name=table_name)
        df_iterator = pd.read_csv(path_to_csv_file, chunksize=batch_size, na_filter=False)
        for df in df_iterator:
            row_list = []
            for row_index in range(df.shape[0]):
                row = df.iloc[row_index].to_dict()#.items()
                apply_manual_adj(row)
                row_values = ", ".join([get_value_or_null(str(column_value), column_types[column_name]) for column_name, column_value in row.items()])
                row_list.append('(' + row_values + ')')
            client.command(insert_query + ','.join(row_list))


class SimpleInserter(InserterInterface):
    @override
    def insert(client, path_to_csv_file, table_name, batch_size, column_types, apply_manual_adj=lambda: None):
        insert_query = InserterInterface.insert_query_template.substitute(table_name=table_name)
        with open(path_to_csv_file, 'r') as file:
            row_list = []
            for i, row in enumerate(csv.DictReader(file)):
                apply_manual_adj(row) if apply_manual_adj is not None else None
                row_values = ", ".join([get_value_or_null(column_value, column_types[column_name])
                                        for column_name, column_value in row.items()])
                row_list.append('(' + row_values + ')')
                if (len(row_list) == batch_size):
                    client.command(insert_query + ','.join(row_list))
                    row_list = []
            else:
                if (len(row_list) != 0):
                    client.command(insert_query + ','.join(row_list))
    # @staticmethod
    # def ss():
    #     print('123')

    
    
    
    
    
                
# def insert_without_pandas(client: clickhouse_connect.driver.httpclient.HttpClient,
#                           path_to_csv_file: str,
#                           insert_query: str,
#                           batch_size: int,
#                           column_types: dict,
#                           apply_manual_adj=None):
#     with open(path_to_csv_file, 'r') as file:
#         row_list = []
#         for i, row in enumerate(csv.DictReader(file)):
#             apply_manual_adj(row) if apply_manual_adj is not None else None
#             row_values = ", ".join([get_value_or_null(column_value, column_types[column_name])
#                                     for column_name, column_value in row.items()])
#             row_list.append('(' + row_values + ')')
#             if (len(row_list) == batch_size):
#                 client.command(insert_query + ','.join(row_list))
#                 row_list = []
#         else:
#             if (len(row_list) != 0):
#                 client.command(insert_query + ','.join(row_list))


# def insert_with_pandas(client: clickhouse_connect.driver.httpclient.HttpClient,
#                        path_to_csv_file: str,
#                        insert_query: str,
#                        batch_size: int,
#                        column_types: dict,
#                        apply_manual_adj=None):
#     df_iterator = pd.read_csv(path_to_csv_file, chunksize=batch_size, na_filter=False)
#     for df in df_iterator:
#         row_list = []
#         for row_index in range(df.shape[0]):
#             row = df.iloc[row_index].to_dict()#.items()
#             apply_manual_adj(row) if apply_manual_adj is not None else None
#             row_values = ", ".join([get_value_or_null(str(column_value), column_types[column_name]) for column_name, column_value in row.items()])
#             row_list.append('(' + row_values + ')')
#         client.command(insert_query + ','.join(row_list))
        
        
def do_insert(client: clickhouse_connect.driver.httpclient.HttpClient,
              path_to_csv_file: str,
              table_name: str,
              batch_size: int,
              inserter: InserterInterface,
              apply_manual_adj=lambda: None):
    column_types = get_column_types_from_table(client, table_name)
    inserter.insert(client, path_to_csv_file, table_name, batch_size, column_types, apply_manual_adj)

# def ss(inserter: InserterInterface):
#     inserter.ss()
    
# ss(SimpleInserter)

In [16]:
@timeit
def do_insert_timed(client: clickhouse_connect.driver.httpclient.HttpClient,
                    path_to_csv_file: str,
                    table_name: str,
                    batch_size: int,
                    inserter: InserterInterface,
                    apply_manual_adj=lambda: None):
    do_insert(client, path_to_csv_file, table_name, batch_size, inserter, apply_manual_adj)


##### Обернём процесс создания, заполнения и удаления таблицы в функцию для удобства
##### Измеряем время заполнения таблицы при различных параметрах

In [17]:
def do_create_insert_delete_operation(client: clickhouse_connect.driver.httpclient.HttpClient,
                                      path_to_csv_file: str,
                                      table_name: str,
                                      table_engine: str,
                                      order_column_names: list,
                                      batch_size: int,
                                      inserter: InserterInterface,
                                      apply_manual_adj=lambda: None):
    client.command(get_create_query_based_on_csv_file(path_to_csv_file, table_name, order_column_names, table_engine))
    do_insert_timed(client, path_to_csv_file, table_name, batch_size, inserter, apply_manual_adj)
    client.command(get_delete_query(table_name))


#### __Смотрим движок MergeTree без pandas__

In [18]:
client = clickhouse_connect.get_client(host='db.mpkazantsev.ru',
                                       port=8123, 
                                       database='datasets')

parametrs = {
              'client': client,
              'path_to_csv_file': '/home/alexander/honda_sell_data.csv',
              'table_name': 'maindb.ad_honda_sell_data_MergeTree',
              'table_engine': 'MergeTree',
              'order_column_names': ['Year'],
              'inserter': SimpleInserter,
              'apply_manual_adj': apply_manual_adj_with_honda_dataset,
             }

do_create_insert_delete_operation(**parametrs, batch_size=1)

do_create_insert_delete_operation(**parametrs, batch_size=1000)

do_create_insert_delete_operation(**parametrs, batch_size=2500)

do_create_insert_delete_operation(**parametrs, batch_size=5000)


print('Script completed')


-------------------------
Function do_insert_timed(<clickhouse_connect.driver.httpclient.HttpClient object at 0x7f98732be950>, '/home/alexander/honda_sell_data.csv', 'maindb.ad_honda_sell_data_MergeTree', 1, <class '__main__.SimpleInserter'>, <function apply_manual_adj_with_honda_dataset at 0x7f98a9f7bd00>) {} Took 18.6629 seconds
-------------------------
-------------------------
Function do_insert_timed(<clickhouse_connect.driver.httpclient.HttpClient object at 0x7f98732be950>, '/home/alexander/honda_sell_data.csv', 'maindb.ad_honda_sell_data_MergeTree', 1000, <class '__main__.SimpleInserter'>, <function apply_manual_adj_with_honda_dataset at 0x7f98a9f7bd00>) {} Took 0.1669 seconds
-------------------------
-------------------------
Function do_insert_timed(<clickhouse_connect.driver.httpclient.HttpClient object at 0x7f98732be950>, '/home/alexander/honda_sell_data.csv', 'maindb.ad_honda_sell_data_MergeTree', 2500, <class '__main__.SimpleInserter'>, <function apply_manual_adj_with_ho

#### __Смотрим движок TinyLog без pandas__

In [19]:
client = clickhouse_connect.get_client(host='db.mpkazantsev.ru',
                                       port=8123, 
                                       database='datasets')

parametrs = {
              'client': client,
              'path_to_csv_file': '/home/alexander/honda_sell_data.csv',
              'table_name': 'maindb.ad_honda_sell_data',
              'table_engine': 'TinyLog',
              'order_column_names': [],
              'pandas_flag': False 
             }

do_create_insert_delete_operation(**parametrs, batch_size=1)

do_create_insert_delete_operation(**parametrs, batch_size=1000)

do_create_insert_delete_operation(**parametrs, batch_size=2500)

do_create_insert_delete_operation(**parametrs, batch_size=5000)

print('Script completed')


-------------------------
Function do_insert_timed(<clickhouse_connect.driver.httpclient.HttpClient object at 0x7f62dce07670>, '/home/alexander/honda_sell_data.csv', 'maindb.ad_honda_sell_data', 1, False) {} Took 38.1536 seconds
-------------------------
-------------------------
Function do_insert_timed(<clickhouse_connect.driver.httpclient.HttpClient object at 0x7f62dce07670>, '/home/alexander/honda_sell_data.csv', 'maindb.ad_honda_sell_data', 1000, False) {} Took 0.1743 seconds
-------------------------
-------------------------
Function do_insert_timed(<clickhouse_connect.driver.httpclient.HttpClient object at 0x7f62dce07670>, '/home/alexander/honda_sell_data.csv', 'maindb.ad_honda_sell_data', 2500, False) {} Took 0.1538 seconds
-------------------------
-------------------------
Function do_insert_timed(<clickhouse_connect.driver.httpclient.HttpClient object at 0x7f62dce07670>, '/home/alexander/honda_sell_data.csv', 'maindb.ad_honda_sell_data', 5000, False) {} Took 0.1452 seconds


#### __Смотрим движок MergeTree c pandas__

In [13]:
client = clickhouse_connect.get_client(host='db.mpkazantsev.ru',
                                       port=8123, 
                                       database='datasets')

parametrs = {
              'client': client,
              'path_to_csv_file': '/home/alexander/honda_sell_data.csv',
              'table_name': 'maindb.ad_honda_sell_data',
              'table_engine': 'MergeTree',
              'order_column_names': ['Year'],
              'pandas_flag': True 
             }

do_create_insert_delete_operation(**parametrs, batch_size=1)

do_create_insert_delete_operation(**parametrs, batch_size=1000)

do_create_insert_delete_operation(**parametrs, batch_size=2500)

do_create_insert_delete_operation(**parametrs, batch_size=5000)


print('Script completed')


-------------------------
Function do_insert_timed(<clickhouse_connect.driver.httpclient.HttpClient object at 0x7f62dd1de2f0>, '/home/alexander/honda_sell_data.csv', 'maindb.ad_honda_sell_data', 1, True) {} Took 33.8825 seconds
-------------------------
-------------------------
Function do_insert_timed(<clickhouse_connect.driver.httpclient.HttpClient object at 0x7f62dd1de2f0>, '/home/alexander/honda_sell_data.csv', 'maindb.ad_honda_sell_data', 1000, True) {} Took 1.0608 seconds
-------------------------
-------------------------
Function do_insert_timed(<clickhouse_connect.driver.httpclient.HttpClient object at 0x7f62dd1de2f0>, '/home/alexander/honda_sell_data.csv', 'maindb.ad_honda_sell_data', 2500, True) {} Took 1.0388 seconds
-------------------------
-------------------------
Function do_insert_timed(<clickhouse_connect.driver.httpclient.HttpClient object at 0x7f62dd1de2f0>, '/home/alexander/honda_sell_data.csv', 'maindb.ad_honda_sell_data', 5000, True) {} Took 1.0906 seconds
----

#### __Смотрим движок TinyLog c pandas__

In [17]:
client = clickhouse_connect.get_client(host='db.mpkazantsev.ru',
                                       port=8123, 
                                       database='datasets')

parametrs = {
              'client': client,
              'path_to_csv_file': '/home/alexander/honda_sell_data.csv',
              'table_name': 'maindb.ad_honda_sell_data',
              'table_engine': 'TinyLog',
              'order_column_names': [],
              'pandas_flag': True 
             }

do_create_insert_delete_operation(**parametrs, batch_size=1)

do_create_insert_delete_operation(**parametrs, batch_size=1000)

do_create_insert_delete_operation(**parametrs, batch_size=2500)

do_create_insert_delete_operation(**parametrs, batch_size=5000)

print('Script completed')


-------------------------
Function do_insert_timed(<clickhouse_connect.driver.httpclient.HttpClient object at 0x7f8fb62331f0>, '/home/alexander/honda_sell_data.csv', 'maindb.ad_honda_sell_data', 1) {} Took 39.1875 seconds
-------------------------
-------------------------
Function do_insert_timed(<clickhouse_connect.driver.httpclient.HttpClient object at 0x7f8fb62331f0>, '/home/alexander/honda_sell_data.csv', 'maindb.ad_honda_sell_data', 1000) {} Took 0.1868 seconds
-------------------------
-------------------------
Function do_insert_timed(<clickhouse_connect.driver.httpclient.HttpClient object at 0x7f8fb62331f0>, '/home/alexander/honda_sell_data.csv', 'maindb.ad_honda_sell_data', 2500) {} Took 0.1557 seconds
-------------------------
-------------------------
Function do_insert_timed(<clickhouse_connect.driver.httpclient.HttpClient object at 0x7f8fb62331f0>, '/home/alexander/honda_sell_data.csv', 'maindb.ad_honda_sell_data', 5000) {} Took 0.1446 seconds
-------------------------
Sc

In [20]:
client = clickhouse_connect.get_client(host='db.mpkazantsev.ru',
                                       port=8123, 
                                       database='datasets')

parametrs1 = {
              'path_to_csv_file': '/home/alexander/honda_sell_data.csv',
              'table_name': 'maindb.ad_honda_sell_data_TinyLog',
              'table_engine': 'TinyLog',
              'order_column_names': [],
             }

parametrs2 = {
              'client': client,
              'path_to_csv_file': '/home/alexander/honda_sell_data.csv',
              'table_name': 'maindb.ad_honda_sell_data_TinyLog',
              'batch_size': 5000,
              'pandas_flag': False
             }

client.command(get_create_query_based_on_csv_file(**parametrs1))

do_insert_timed(**parametrs2)

print('Script completed')


-------------------------
Function do_insert_timed() {'client': <clickhouse_connect.driver.httpclient.HttpClient object at 0x7f62dce05510>, 'path_to_csv_file': '/home/alexander/honda_sell_data.csv', 'table_name': 'maindb.ad_honda_sell_data_TinyLog', 'batch_size': 5000, 'pandas_flag': False} Took 0.1516 seconds
-------------------------
Script completed


In [27]:
client = clickhouse_connect.get_client(host='db.mpkazantsev.ru',
                                       port=8123, 
                                       database='datasets')

parametrs1 = {
              'path_to_csv_file': '/home/alexander/honda_sell_data.csv',
              'table_name': 'maindb.ad_honda_sell_data_MergeTree',
              'table_engine': 'MergeTree',
              'order_column_names': ['Year'],
              'partition_column_names': ['modulo(`Year`, 5)'],
             }

parametrs2 = {
              'client': client,
              'path_to_csv_file': '/home/alexander/honda_sell_data.csv',
              'table_name': 'maindb.ad_honda_sell_data_MergeTree',
              'batch_size': 1000,
              'pandas_flag': False
             }

client.command(get_create_query_based_on_csv_file(**parametrs1))

do_insert_timed(**parametrs2)

print('Script completed')

-------------------------
Function do_insert_timed() {'client': <clickhouse_connect.driver.httpclient.HttpClient object at 0x7f62dce073d0>, 'path_to_csv_file': '/home/alexander/honda_sell_data.csv', 'table_name': 'maindb.ad_honda_sell_data_MergeTree', 'batch_size': 1000, 'pandas_flag': False} Took 0.1784 seconds
-------------------------
Script completed
