In [12]:
# NOTE(review): `!` runs the command in a separate shell process, so this
# activation presumably does not change the environment of the running kernel.
# TODO confirm the kernel itself was started from env/ before relying on it.
!source env/bin/activate

In [None]:
!pip install clickhouse_connect

In [None]:
!pip install numpy

#### Вспомогательные функции для обработки строк

In [261]:
#helper functions

def get_type_of_string_value(str_value, is_nullable):
    """Infer a ClickHouse column type name from the text of one CSV cell.

    The cell is tried as ``int`` first, then as ``float``; anything else is
    treated as a string.

    Args:
        str_value: Raw cell text. ``None`` or ``''`` means "no value".
        is_nullable: When true, the type name is wrapped in ``Nullable(...)``.

    Returns:
        ``'Int'``, ``'Float'`` or ``'String'`` (wrapped in ``Nullable(...)``
        when ``is_nullable`` is true), or ``None`` when the cell is empty and
        therefore carries no type information.
    """
    if str_value is None or str_value == '':
        return None

    base_type = 'String'
    # Order matters: every int literal also parses as a float.
    for type_name, parser in (('Int', int), ('Float', float)):
        try:
            parser(str_value)
        except ValueError:
            continue
        base_type = type_name
        break

    return 'Nullable({})'.format(base_type) if is_nullable else base_type


def get_value_or_null(value, expected_type):
    """Render one CSV cell as a SQL literal for an INSERT statement.

    Args:
        value: Raw cell text. ``None`` or ``''`` is treated as a missing value.
        expected_type: Python type the target column maps to; one of ``int``,
            ``float`` or ``str``.

    Returns:
        ``'Null'`` for a missing cell or for a numeric cell that does not
        parse as ``expected_type``; the cell text unchanged for a valid
        number; a single-quoted, escaped string literal for ``str``.

    Raises:
        TypeError: If ``expected_type`` is not ``int``, ``float`` or ``str``.
    """
    if value is None or value == '':
        return 'Null'

    if expected_type in (int, float):
        try:
            expected_type(value)
        except ValueError:
            return 'Null'
        return str(value)

    if expected_type == str:
        # Escape backslashes first, then quotes, so the text can neither end
        # the literal early nor have its own backslashes reinterpreted.
        escaped = value.replace('\\', '\\\\').replace("'", "\\'")
        return "'" + escaped + "'"

    # Previously this fell through and returned None, which only surfaced
    # later as an opaque TypeError when the values were joined.
    raise TypeError('Unsupported column type for SQL literal: {!r}'.format(expected_type))


def remove_first_escape(str_value):
    """Return ``str_value`` without its leading newline, if it has one.

    Only a single newline character at position 0 is dropped; any other
    input is returned unchanged.
    """
    has_leading_newline = str_value.startswith('\n')
    return str_value[1:] if has_leading_newline else str_value

    
def convert_list_to_dict(lst):
    """Turn a flat ``[key, value, key, value, ...]`` list into a dict.

    Elements at even positions become keys and the element right after each
    one becomes its value. A later duplicate key overwrites an earlier one.
    """
    pairs = {}
    for key_position in range(0, len(lst), 2):
        pairs[lst[key_position]] = lst[key_position + 1]
    return pairs


##### *__get_create_query_based_on_csv_file__* возвращает SQL-запрос в виде строки на создание таблицы
##### *__get_column_types_from_csv__* возвращает словарь {имя колонки: тип колонки, ...}, основанный на CSV-файле

In [268]:
def get_column_types_from_csv(path_to_csv_file, order_column_names):
    """Infer a ClickHouse type name for every column of a CSV file.

    Rows are read until every column has an inferred type or the file ends.
    An empty cell never replaces a type that an earlier row already
    established. Columns listed in ``order_column_names`` are inferred as
    non-nullable; all other columns are inferred as nullable.

    Args:
        path_to_csv_file: Path to a CSV file whose first row is the header.
        order_column_names: Names of the columns that must not be nullable.

    Returns:
        dict mapping each column name to the type name returned by
        ``get_type_of_string_value``. The value is ``None`` for a column that
        has no non-empty cell in the rows that were read.
    """
    dict_of_column_types = dict()
    # newline='' is how the csv module expects files to be opened, so that
    # line breaks inside quoted fields are parsed correctly.
    with open(path_to_csv_file, 'r', newline='') as file:
        for row in csv.DictReader(file):
            for column_name, value in row.items():
                if column_name is None:
                    # DictReader collects fields beyond the header under the
                    # key None; they belong to no column.
                    continue
                # An empty or missing cell carries no type information.
                inferred_type = (
                    get_type_of_string_value(value, is_nullable=(column_name not in order_column_names))
                    if value else None
                )
                if column_name not in dict_of_column_types or inferred_type is not None:
                    dict_of_column_types[column_name] = inferred_type
            if None not in dict_of_column_types.values():
                break
    return dict_of_column_types


def get_create_query_based_on_csv_file(path_to_csv_file, table_name_to_create, order_column_names):
    """Build a CREATE TABLE statement whose columns mirror a CSV file.

    Column types come from ``get_column_types_from_csv``. A column for which
    no type could be inferred is declared as text.

    Args:
        path_to_csv_file: Path to the CSV file used to infer the columns.
        table_name_to_create: Table name inserted verbatim into the statement.
        order_column_names: Column names used for the ORDER BY clause.

    Returns:
        A ``CREATE TABLE IF NOT EXISTS ... ENGINE = MergeTree ORDER BY (...)``
        statement as a string.
    """
    dict_of_column_types = get_column_types_from_csv(path_to_csv_file, order_column_names)

    column_definitions = []
    for column_name, column_type in dict_of_column_types.items():
        if column_type is None:
            # No non-empty cell was seen for this column; concatenating None
            # used to raise a TypeError here, so fall back to a text column.
            column_type = 'String' if column_name in order_column_names else 'Nullable(String)'
        column_definitions.append('`{}` {}'.format(column_name, column_type))

    order_by_columns = ', '.join('`{}`'.format(column_name) for column_name in order_column_names)
    return 'CREATE TABLE IF NOT EXISTS {} ({}) ENGINE = MergeTree ORDER BY ({});'.format(
        table_name_to_create, ', '.join(column_definitions), order_by_columns)


##### *__get_column_types_from_table__* возвращает словарь {имя колонки: тип колонки, ...}, основанный на таблице
##### *__get_insert_query__* возвращает SQL-запрос в виде строки на вставку строки в таблицу

In [263]:
# Maps ClickHouse column type names to the Python type used to validate CSV
# cells before they are written into an INSERT statement. All integer and
# float widths are covered so that tables with, for example, Int64 or Float64
# columns can be loaded too, not only Int32/Float32 ones.
_BASE_TYPE_MAPPER = {'String': str, 'Float32': float, 'Float64': float}
_BASE_TYPE_MAPPER.update({
    '{}{}'.format(prefix, bits): int
    for prefix in ('Int', 'UInt')
    for bits in (8, 16, 32, 64, 128, 256)
})

# Every base type is also accepted in its Nullable(...) form.
type_mapper = dict(_BASE_TYPE_MAPPER)
type_mapper.update({
    'Nullable({})'.format(type_name): python_type
    for type_name, python_type in _BASE_TYPE_MAPPER.items()
})


def get_column_types_from_table(client, table_name):
    """Read the column names and types of an existing table.

    Args:
        client: clickhouse_connect client used to run the DESCRIBE query.
        table_name: Table name inserted verbatim into the query.

    Returns:
        dict mapping each column name to the Python type found in
        ``type_mapper``. A type name missing from ``type_mapper`` is kept as
        the raw string.
    """
    describe_result = client.query('DESCRIBE TABLE {}'.format(table_name))
    # Only the first two fields of each row (name, type) are used. Reading
    # structured rows keeps the name/type pairing intact when a column also
    # has a default, comment, codec or TTL, and keeps column names out of the
    # type lookup.
    return {row[0]: type_mapper.get(row[1], row[1]) for row in describe_result.result_rows}


def get_insert_query(table_name, row, column_types):
    """Build an INSERT statement for a single CSV row.

    The target columns are listed explicitly, in the same order as the
    values, so the statement does not depend on the CSV header order matching
    the table's column order.

    Args:
        table_name: Table name inserted verbatim into the statement.
        row: Mapping of column name to raw cell text for one CSV row.
        column_types: Mapping of column name to the Python type expected for
            that column.

    Returns:
        An ``INSERT INTO ... (...) VALUES (...);`` statement as a string.

    Raises:
        KeyError: If ``row`` has a column that is missing from ``column_types``.
    """
    column_names = ', '.join('`{}`'.format(column_name) for column_name in row)
    row_values = ', '.join(
        get_value_or_null(column_value, column_types[column_name])
        for column_name, column_value in row.items()
    )
    return 'INSERT INTO {} ({}) VALUES ({});'.format(table_name, column_names, row_values)


##### *__do_insert__* осуществляет вставку строк из csv в таблицу

In [264]:
def do_insert(client, path_to_csv_file, table_name):
    """Insert every row of a CSV file into an existing table.

    Column types are read from the table once; then one INSERT statement is
    sent per CSV row.

    Args:
        client: clickhouse_connect client used to run the statements.
        path_to_csv_file: Path to a CSV file whose first row is the header.
        table_name: Name of the target table.
    """
    column_types = get_column_types_from_table(client, table_name)
    # newline='' is how the csv module expects files to be opened, so that
    # line breaks inside quoted fields are parsed correctly.
    with open(path_to_csv_file, 'r', newline='') as file:
        for row in csv.DictReader(file):
            # NOTE(review): one statement per row means one round trip per
            # row; consider batching if this is used on large files.
            client.command(get_insert_query(table_name, row, column_types))

#### __Скрипт на создание таблицы и её заполнение__

In [270]:
import csv

import clickhouse_connect


# Inputs of the load, named once so the create and insert steps cannot drift apart.
CSV_PATH = '/home/alexander/honda_sell_data.csv'
TABLE_NAME = 'maindb.ad_honda_sell_data'
ORDER_COLUMNS = ['Year']

client = clickhouse_connect.get_client(host='db.mpkazantsev.ru', port=8123, database='datasets')

# Create the table from the CSV header and inferred types, then load the rows.
# NOTE: the insert step is unconditional, so re-running this cell sends every
# row again.
create_table_query = get_create_query_based_on_csv_file(CSV_PATH, TABLE_NAME, ORDER_COLUMNS)
client.command(create_table_query)

do_insert(client, CSV_PATH, TABLE_NAME)

print('Script completed')


Script completed
