In [None]:
!pip install clickhouse_connect

In [None]:
!pip install numpy
!pip install pandas

#### Вспомогательные функции для обработки строк

In [1]:
#helper functions

def get_type_of_string_value(str_value, is_nullable):
    """Infer a ClickHouse column type from one raw CSV cell.

    Args:
        str_value: Cell text as read from the CSV file.
        is_nullable: When true, the inferred type is wrapped in
            ``Nullable(...)``.

    Returns:
        ``'Int'``, ``'Float'`` or ``'String'`` (wrapped in ``Nullable(...)``
        when ``is_nullable`` is true), or ``None`` for an empty string, which
        carries no type information.
    """
    if str_value == '':
        return None
    # Try the narrowest type first; anything that parses as neither an int nor
    # a float is treated as a string.
    base_type = 'String'
    for parser, type_name in ((int, 'Int'), (float, 'Float')):
        try:
            parser(str_value)
        except ValueError:
            continue
        base_type = type_name
        break
    return 'Nullable({})'.format(base_type) if is_nullable else base_type


def get_value_or_null(value, expected_type):
    """Render one CSV cell as a SQL literal for an ``INSERT ... VALUES`` list.

    Args:
        value: Cell text.
        expected_type: Python type the target column maps to
            (``int``, ``float`` or ``str``).

    Returns:
        * ``'Null'`` for an empty cell, or for a cell that does not parse as
          the expected numeric type;
        * the text unchanged for a valid ``int``/``float`` cell;
        * the text wrapped in single quotes, with backslashes and single
          quotes escaped, for ``str``;
        * ``None`` for any other ``expected_type``.
    """
    if value == '':
        return 'Null'
    if expected_type in (int, float):
        try:
            expected_type(value)
        except ValueError:
            return 'Null'
        return str(value)
    if expected_type == str:
        # Escape backslashes before quotes so the backslash added for a quote
        # is not doubled again. Without this a value such as O'Brien would
        # terminate the literal early and corrupt the statement.
        escaped = value.replace('\\', '\\\\').replace("'", "\\'")
        return "'" + escaped + "'"
    # Unsupported target type: no literal can be produced.
    return None


def remove_first_escape(str_value):
    """Return ``str_value`` without its leading newline, if it starts with one.

    Only a single ``'\\n'`` is removed; any other string is returned unchanged.
    """
    has_leading_newline = str_value.startswith('\n')
    return str_value[1:] if has_leading_newline else str_value

    
def convert_list_to_dict(lst):
    """Turn a flat ``[key, value, key, value, ...]`` list into a dict.

    Items at even positions become keys and the item following each becomes
    its value; a repeated key keeps the last value. A list of odd length
    raises ``IndexError`` because the final key has no value.
    """
    pairs = {}
    for key_position in range(0, len(lst), 2):
        pairs[lst[key_position]] = lst[key_position + 1]
    return pairs


##### *__get_create_query_based_on_csv_file__* возвращает SQL-запрос в виде строки на создание таблицы
##### *__get_column_types_from_csv__* возвращает словарь {имя колонки: тип колонки, ...}, основанный на csv файле

In [19]:
import csv


def get_column_types_from_csv(path_to_csv_file, order_column_names):
    """Infer a ClickHouse type for every column of a CSV file.

    Each column takes the type of its first non-empty value, as reported by
    ``get_type_of_string_value``. Reading stops as soon as every column has a
    type.

    Args:
        path_to_csv_file: Path to a CSV file whose first line is the header.
        order_column_names: Columns used in ``ORDER BY``; these get a
            non-nullable type, every other column a ``Nullable(...)`` one.

    Returns:
        Dict ``{column name: type}`` in header order. A column that has no
        non-empty value anywhere in the file maps to ``None``.
    """
    dict_of_column_types = dict()
    # newline='' lets the csv module handle line breaks inside quoted fields.
    with open(path_to_csv_file, 'r', newline='') as file:
        for row in csv.DictReader(file):
            for column_name, value in row.items():
                if column_name is None:
                    # DictReader collects surplus cells of an over-long row
                    # under the None key; they belong to no column.
                    continue
                if dict_of_column_types.get(column_name) is not None:
                    # Already inferred from an earlier row; a later (possibly
                    # empty) value must not overwrite it.
                    continue
                # value is '' for an empty cell and None for a cell missing
                # from a short row; neither says anything about the type.
                dict_of_column_types[column_name] = (
                    get_type_of_string_value(value, is_nullable=(column_name not in order_column_names))
                    if value else None)
            if None not in dict_of_column_types.values():
                break
    return dict_of_column_types


def get_create_query_based_on_csv_file(path_to_csv_file,
                                       table_name_to_create,
                                       order_column_names,
                                       table_engine='MergeTree'):
    """Build a ``CREATE TABLE IF NOT EXISTS`` statement from a CSV file.

    Column names and types come from ``get_column_types_from_csv``; column
    names are wrapped in backticks, the table name and engine are inserted
    as given.

    Args:
        path_to_csv_file: CSV file the column names and types are taken from.
        table_name_to_create: Name of the table to create.
        order_column_names: Columns listed in the ``ORDER BY`` clause.
        table_engine: Table engine name.

    Returns:
        The SQL statement as a string.
    """
    column_types = get_column_types_from_csv(path_to_csv_file, order_column_names)

    column_definitions = []
    for name, clickhouse_type in column_types.items():
        column_definitions.append('`' + name + '` ' + clickhouse_type)

    order_by_columns = ['`' + name + '`' for name in order_column_names]

    query_template = '''CREATE TABLE IF NOT EXISTS {} ({}) ENGINE = {} ORDER BY ({});'''
    return query_template.format(table_name_to_create,
                                 ", ".join(column_definitions),
                                 table_engine,
                                 ', '.join(order_by_columns))


##### *__get_column_types_from_table__* возвращает словарь {имя колонки: тип колонки, ...}, основанный на таблице
##### *__do_insert__* осуществляет вставку строк из csv в таблицу

In [25]:
# Python type used to validate values for each supported ClickHouse column
# type; a Nullable(...) column maps to the same Python type as its base type.
type_mapper = {
    type_template.format(base_type): python_type
    for type_template in ('{}', 'Nullable({})')
    for base_type, python_type in (('Int32', int), ('Float32', float), ('String', str))
}


def get_column_types_from_table(client, table_name):
    """Read the column types of an existing table.

    Runs ``DESCRIBE TABLE`` through ``client.query`` and takes the first two
    fields of every result row (column name and column type). Taking them per
    row keeps names and types aligned even when a column also reports a
    default expression, comment, codec or TTL.

    Args:
        client: clickhouse_connect client.
        table_name: Table to describe; inserted into the statement as given,
            so it must come from a trusted source.

    Returns:
        Dict ``{column name: type}`` where the type is the Python type from
        ``type_mapper`` when the ClickHouse type is listed there, otherwise
        the ClickHouse type name unchanged.
    """
    describe_result = client.query('DESCRIBE TABLE {}'.format(table_name))
    return {row[0]: type_mapper.get(row[1], row[1])
            for row in describe_result.result_rows}


def do_insert(client, path_to_csv_file, table_name, chunk_size=1000):
    """Insert every row of a CSV file into an existing table.

    The file is read in chunks and each chunk is sent as one multi-row
    ``INSERT`` statement. Values are rendered with ``get_value_or_null``
    according to the column types returned by ``get_column_types_from_table``.

    Args:
        client: clickhouse_connect client.
        path_to_csv_file: CSV file whose header matches the table's columns,
            in table column order (the statement uses ``(*)``).
        table_name: Target table; inserted into the statement as given.
        chunk_size: Number of CSV rows read and inserted per statement.

    Raises:
        KeyError: If the CSV has a column that the table does not have.
    """
    # Imported here so the function works on a fresh kernel regardless of
    # which other notebook cells have already run.
    import pandas as pd

    # dtype=str together with na_filter=False keeps every cell exactly as it
    # is written in the file (no '007' -> 7 or '' -> NaN conversions).
    df_iterator = pd.read_csv(path_to_csv_file, chunksize=chunk_size, dtype=str, na_filter=False)
    column_types = get_column_types_from_table(client, table_name)
    for df in df_iterator:
        column_names = list(df.columns)
        rows_sql = []
        for row in df.itertuples(index=False, name=None):
            row_values = ", ".join(get_value_or_null(column_value, column_types[column_name])
                                   for column_name, column_value in zip(column_names, row))
            rows_sql.append('(' + row_values + ')')
        if not rows_sql:
            continue  # nothing to insert for an empty chunk
        insert_query = '''INSERT INTO {} (*) VALUES {};'''.format(table_name, ", ".join(rows_sql))
        client.command(insert_query)


#### __Скрипт на создание таблицы и её заполнение__

In [26]:
import clickhouse_connect


# Source file and target table, defined once so that the CREATE TABLE step and
# the insert step always refer to the same file and table.
CSV_PATH = '/home/alexander/honda_sell_data.csv'
TABLE_NAME = 'maindb.ad_honda_sell_data'

client = clickhouse_connect.get_client(host='db.mpkazantsev.ru',
                                       port=8123,
                                       database='datasets')

# Create the table (if it does not exist yet) with types inferred from the CSV.
client.command(get_create_query_based_on_csv_file(path_to_csv_file=CSV_PATH,
                                                 table_name_to_create=TABLE_NAME,
                                                 order_column_names=['Year']))

# Load the CSV rows into the table.
do_insert(client,
          path_to_csv_file=CSV_PATH,
          table_name=TABLE_NAME)

print('Script completed')


Script completed


In [24]:
import pandas as pd

# Debug peek: print every column of the first two CSV rows together with the
# Python type pandas parsed each value into.
# chunksize=1 makes each chunk a one-row DataFrame; na_filter=False keeps
# empty cells as '' instead of NaN.
df_iterator = pd.read_csv('/home/alexander/honda_sell_data.csv', chunksize=1, na_filter=False)
i = 0  # number of rows printed so far
for df in df_iterator:
    # v is the column name, g is the parsed cell value.
    for v, g in df.iloc[0].to_dict().items():
        print(v, g, type(g))
    # print(df.iloc[0])
    i += 1
    if i == 2:
        # Two rows are enough for inspection.
        break

Year 2023 <class 'int'>
Make Honda <class 'str'>
Model Ridgeline RTL <class 'str'>
Condition New <class 'str'>
Price $46,370 <class 'str'>
Consumer_Rating 4.8 <class 'float'>
Consumer_Review_# 9 <class 'int'>
Exterior_Color Platinum White Pearl <class 'str'>
Interior_Color Beige <class 'str'>
Drivetrain All-wheel Drive <class 'str'>
MPG  <class 'str'>
Fuel_Type Gasoline <class 'str'>
Transmission Automatic <class 'str'>
Engine 3.5L V6 24V GDI SOHC <class 'str'>
VIN 5FPYK3F58PB011817 <class 'str'>
Stock_# 830164 <class 'int'>
Mileage 10 <class 'int'>
Comfort_Rating 5.0 <class 'float'>
Interior_Design_Rating 4.8 <class 'float'>
Performance_Rating 4.8 <class 'float'>
Value_For_Money_Rating 4.2 <class 'float'>
Exterior_Styling_Rating 5.0 <class 'float'>
Reliability_Rating 5.0 <class 'float'>
State CA <class 'str'>
Seller_Type Dealer <class 'str'>
Year 2023 <class 'int'>
Make Honda <class 'str'>
Model CR-V Hybrid Sport <class 'str'>
Condition New <class 'str'>
Price $34,150 <class 'str'>
Co