# DWH Project

Creating a connection to S3, initializing the Spark session, and reading the data.

In [1]:
#Spark connection with S3 options
import os
import socket
from pyspark.sql import SparkSession


# Credentials and location of the S3-compatible object storage.
# The keys are taken from the environment when the variables are set; the
# literals are the notebook's original placeholder values and act only as a
# fallback, so real secrets never need to be saved inside the notebook.
aws_access_key = os.environ.get("S3_ACCESS_KEY", "key")
aws_secret_key = os.environ.get("S3_SECRET_KEY", "secrete_key")
s3_bucket = "kc-hardda-projects"
s3_endpoint_url = "https://storage.yandexcloud.net"

# Spark session parameters: the master address is resolved through cluster DNS,
# the driver host and application name come from the pod's environment.
APACHE_MASTER_IP = socket.gethostbyname("apache-spark-master-0.apache-spark-headless.apache-spark.svc.cluster.local")
APACHE_MASTER_URL = f"spark://{APACHE_MASTER_IP}:7077"
POD_IP = os.environ["MY_POD_IP"]
SPARK_APP_NAME = f"spark-{os.environ['HOSTNAME']}"

# `spark.jars` is a comma-separated list. Build it without embedded newlines or
# spaces so the value does not depend on Spark trimming each entry.
JARS_DIR = "/nfs/env/lib/python3.8/site-packages/pyspark/jars"
JARS = ",".join(
    f"{JARS_DIR}/{jar_name}"
    for jar_name in (
        "clickhouse-native-jdbc-shaded-2.6.5.jar",
        "hadoop-aws-3.3.4.jar",
        "aws-java-sdk-bundle-1.12.433.jar",
    )
)

MEM = "2048m"
CORES = 1

spark = (
    SparkSession.builder
    .appName(SPARK_APP_NAME)
    .master(APACHE_MASTER_URL)
    .config("spark.executor.memory", MEM)
    .config("spark.jars", JARS)
    .config("spark.executor.cores", CORES)
    .config("spark.driver.host", POD_IP)
    .config("spark.hadoop.fs.s3a.access.key", aws_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", aws_secret_key)
    # Same `spark.hadoop.` prefix as every other S3A option, and the endpoint
    # comes from the variable above instead of a second copy of the URL.
    .config("spark.hadoop.fs.s3a.endpoint", s3_endpoint_url)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.path.style.access", True)
    .config("spark.hadoop.fs.s3a.committer.name", "directory")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .getOrCreate()
)


# Reading the three source datasets from the shared area of the bucket.
df_adverts_all = spark.read.parquet(f"s3a://{s3_bucket}/shared/adverts_data.parquet")
df_live_adverts = spark.read.parquet(f"s3a://{s3_bucket}/shared/live_adverts.parquet")
df_user_passports = spark.read.parquet(f"s3a://{s3_bucket}/shared/user_passports.parquet")

# Flat table: every live advert row, enriched with the advert attributes
# (matched on execution_date + advert_id) and with the user passport
# (matched on user_id == global_id). Left joins keep unmatched live adverts.
df_flat = (
    df_live_adverts
    .join(df_adverts_all, on=['execution_date', 'advert_id'], how='left')
    .join(df_user_passports, df_live_adverts['user_id'] == df_user_passports['global_id'], how='left')
)



23/09/25 13:12:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/09/25 13:12:54 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


                                                                                

Checking data.

In [2]:
# Preview five rows of the joined table.
preview_rows = df_flat.limit(5)
preview_rows.show()



+-------------------+---------+------+---------+--------+----+-----+--------+------+----+-----+------------+----+-------+---------+--------------+
|     execution_date|advert_id|region|  user_id|platform|mark|model|   price|  year|fuel|color|transmission|body|country|global_id|user_type_name|
+-------------------+---------+------+---------+--------+----+-----+--------+------+----+-----+------------+----+-------+---------+--------------+
|2021-06-26 00:00:00|137514150|  Омск|124207514| unknown|null| null|114000.0|2014.0|null| null|        null|null|   null|124207514|         profi|
|2021-09-20 00:00:00|137514150|  Омск|124207514| unknown|null| null|114000.0|2014.0|null| null|        null|null|   null|124207514|         profi|
|2021-04-03 00:00:00|145314141|   Уфа|124243239| unknown|null| null|    null|  null|null| null|        null|null|   null|124243239|   simple_user|
|2021-06-26 00:00:00|145314141|   Уфа|124243239| unknown|null| null|    null|  null|null| null|        null|null|   nu

                                                                                

Checking number of rows. 

In [3]:
# Count the rows of the joined table; the value is displayed as the cell output.
df_flat.count()

                                                                                

23/09/25 13:14:17 WARN MemoryStore: Not enough space to cache broadcast_17 in memory! (computed 304.0 MiB so far)
23/09/25 13:14:17 WARN BlockManager: Persisting block broadcast_17 to disk instead.
23/09/25 13:14:21 WARN MemoryStore: Not enough space to cache broadcast_17 in memory! (computed 304.0 MiB so far)


                                                                                

2771661

Dropping duplicate rows (same `execution_date` and `advert_id`), if any exist.

In [4]:
# A row is identified by its execution date together with its advert id.
dedup_keys = ['execution_date', 'advert_id']
df_flat = df_flat.dropDuplicates(dedup_keys)

In [5]:
# Count the rows again after de-duplication; the value is displayed as the cell output.
df_flat.count()

                                                                                

2771661

Checking data types of the columns. 

In [6]:
# Print one (column name, Spark type) pair per line.
for column_name, spark_type in df_flat.dtypes:
    print((column_name, spark_type))

('execution_date', 'timestamp')
('advert_id', 'bigint')
('region', 'string')
('user_id', 'bigint')
('platform', 'string')
('mark', 'string')
('model', 'string')
('price', 'double')
('year', 'double')
('fuel', 'string')
('color', 'string')
('transmission', 'string')
('body', 'string')
('country', 'string')
('global_id', 'bigint')
('user_type_name', 'string')


Writing results to S3.

In [7]:
student_directory = 'anikitin8'

# Collapse the data to a single partition before writing, and replace whatever
# is already stored under the target prefix.
flat_table_uri = f"s3a://kc-hardda-projects/{student_directory}/flat_table/"
single_partition = df_flat.coalesce(1)
single_partition.write.parquet(flat_table_uri, mode='overwrite')

23/09/25 13:14:46 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

Checking that we have saved data to S3 successfully. 

In [8]:
# Read the table back from S3, preview three rows and display the row count.
saved_table_uri = f"s3a://kc-hardda-projects/{student_directory}/flat_table/"
df_flat_test = spark.read.parquet(saved_table_uri)
df_flat_test.show(3)
df_flat_test.count()

                                                                                

+-------------------+---------+------+---------+--------+----+-----+-----+----+----+-----+------------+----+-------+---------+--------------+
|     execution_date|advert_id|region|  user_id|platform|mark|model|price|year|fuel|color|transmission|body|country|global_id|user_type_name|
+-------------------+---------+------+---------+--------+----+-----+-----+----+----+-----+------------+----+-------+---------+--------------+
|2020-11-12 00:00:00|134709471|  Омск|123482031| desktop|null| null| null|null|null| null|        null|null|   null|123482031|   simple_user|
|2020-11-12 00:00:00|146077599|  Сочи|125256070| desktop|null| null| null|null|null| null|        null|null|   null|125256070|   simple_user|
|2020-11-12 00:00:00|146294791|  Омск|126668838| desktop|null| null| null|null|null| null|        null|null|   null|126668838|         profi|
+-------------------+---------+------+---------+--------+----+-----+-----+----+----+-----+------------+----+-------+---------+--------------+
only s

                                                                                

2771661

Let's check the name of the parquet file. 

In [9]:
import boto3
from botocore.exceptions import NoCredentialsError


# Client for the S3-compatible storage, built from the credentials and
# endpoint defined in the first cell.
s3 = boto3.client('s3',
                  aws_access_key_id=aws_access_key,
                  aws_secret_access_key=aws_secret_key,
                  endpoint_url=s3_endpoint_url)

# List only the directory written above. The trailing slash keeps the prefix
# from matching other prefixes that merely start with the same characters.
flat_table_prefix = f"{student_directory}/flat_table/"
response = s3.list_objects_v2(Bucket=s3_bucket, Prefix=flat_table_prefix)

parquet_keys = []
if 'Contents' in response:
    print(f"Objects in bucket '{s3_bucket}':")
    for obj in response['Contents']:
        print(f"- {obj['Key']}")
        # Only the data file is of interest; markers such as _SUCCESS are skipped.
        if obj['Key'].endswith('.parquet'):
            parquet_keys.append(obj['Key'])
else:
    print(f"No objects found in bucket {s3_bucket}")

# `path` is used by later cells (file size check, ClickHouse S3 table), so it
# must be the key of the single parquet file. Select it explicitly instead of
# taking whichever key was listed last, and fail here with a clear message
# rather than later with a NameError or a stale value from an earlier run.
if len(parquet_keys) != 1:
    raise RuntimeError(
        f"Expected exactly one parquet file under '{flat_table_prefix}' "
        f"in bucket '{s3_bucket}', found {len(parquet_keys)}"
    )
path = parquet_keys[0]

Objects in bucket 'kc-hardda-projects':
- anikitin8/flat_table/_SUCCESS
- anikitin8/flat_table/part-00000-2afea7e5-9c7e-4b3d-a11d-c4416a4b8c76-c000.snappy.parquet


In [10]:
# Fetch the object's metadata and report its size, converted from bytes.
response = s3.head_object(Bucket=s3_bucket, Key=path)

size_in_mb = round(response["ContentLength"] / 1024 / 1024, 2)
print(f'File size: {size_in_mb} Mb')

File size: 31.91 Mb


Creating connection to ClickHouse database.

In [11]:
!pip install clickhouse_driver



In [12]:
from clickhouse_driver import Client


# ClickHouse credentials are taken from the environment when the variables are
# set; the literals are the notebook's original placeholder values and act only
# as a fallback, so real secrets never need to be saved inside the notebook.
user_name = os.environ.get("CLICKHOUSE_USER", "user_name")
pwd = os.environ.get("CLICKHOUSE_PASSWORD", "password")

# creating connection ClickHouse
client = Client(host='clickhouse.lab.karpov.courses', port=9000,
                user=user_name, password=pwd, database='hardda_student_data')

# checking connection
result = client.execute("SELECT * FROM hardda.user_dm_events LIMIT 10")

# showing the result (only the first returned row, to keep the output short)
for row in result[0:1]:
    print(row)

(datetime.date(2022, 2, 1), datetime.date(2022, 1, 31), 'android', 'f7411212fd0e2523e126cbfdd3f226c211212', '4beb10e1-aeeb-4c52-acd2-ce1ddbc1fc24b10e1', 22, 11, 3, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0)


Creating ClickHouse table using S3 engine. 

In [13]:
# Statement that removes the S3-backed table if it already exists.
query = "\nDROP TABLE IF EXISTS hardda_student_data.prj_s3_ext_nav2103\n"

In [14]:
# Run the statement held in `query`; the driver's return value is displayed as the cell output.
client.execute(query)

[]

In [15]:
# DDL for a table backed by the parquet file stored in S3.
# The object URL and the credentials are filled in from the variables defined
# earlier in the notebook (endpoint, bucket, access/secret key and the object
# key found in `path`) instead of being repeated as literals inside the SQL.
# NOTE(review): ORDER BY / PRIMARY KEY are kept from the original statement;
# presumably they have no effect for the S3 engine — confirm.
query = '''
CREATE TABLE hardda_student_data.prj_s3_ext_nav2103 (
    `execution_date` Date,
    `advert_id` UInt64,
    `region` LowCardinality(String),
    `user_id` UInt64,
    `platform` LowCardinality(String),
    `mark` Nullable(String),
    `model` Nullable(String),
    `price` Nullable(Float64),
    `year` Nullable(UInt16),
    `fuel` LowCardinality(Nullable(String)),
    `color` LowCardinality(Nullable(String)),
    `transmission` LowCardinality(Nullable(String)), 
    `body` LowCardinality(Nullable(String)), 
    `country` LowCardinality(Nullable(String)),
    `global_id` Nullable(UInt64),
    `user_type_name` LowCardinality(Nullable(String))
)
ENGINE = S3 (
    '{endpoint}/{bucket}/{object_key}',
    '{access_key}', 
    '{secret_key}',
    'Parquet'
    )
PARTITION BY toStartOfMonth(execution_date) 
ORDER BY (execution_date, advert_id)
PRIMARY KEY (execution_date, advert_id)
'''.format(
    endpoint=s3_endpoint_url,
    bucket=s3_bucket,
    object_key=path,
    access_key=aws_access_key,
    secret_key=aws_secret_key,
)

In [16]:
# Send the statement held in `query` to ClickHouse and keep the driver's return value.
result = client.execute(query)

Creating ClickHouse table using MergeTree engine.

In [17]:
# Statement that removes the MergeTree table if it already exists.
query = "\nDROP TABLE IF EXISTS hardda_student_data.prj_main_nav2103\n"

In [18]:
# Run the statement held in `query`; the driver's return value is displayed as the cell output.
client.execute(query)

[]

In [19]:
# DDL for the main MergeTree table. It declares the same columns as the
# S3-backed table, is partitioned by the month of execution_date and is
# ordered (and keyed) by execution_date + advert_id.
query = '''
CREATE TABLE hardda_student_data.prj_main_nav2103 (
    `execution_date` Date,
    `advert_id` UInt64,
    `region` LowCardinality(String),
    `user_id` UInt64,
    `platform` LowCardinality(String),
    `mark` Nullable(String),
    `model` Nullable(String),
    `price` Nullable(Float64),
    `year` Nullable(UInt16),
    `fuel` LowCardinality(Nullable(String)),
    `color` LowCardinality(Nullable(String)),
    `transmission` LowCardinality(Nullable(String)), 
    `body` LowCardinality(Nullable(String)), 
    `country` LowCardinality(Nullable(String)),
    `global_id` Nullable(UInt64),
    `user_type_name` LowCardinality(Nullable(String))
)
ENGINE = MergeTree()
PARTITION BY toStartOfMonth(execution_date) 
ORDER BY (execution_date, advert_id)
PRIMARY KEY (execution_date, advert_id)
SETTINGS index_granularity = 8192
'''

In [20]:
# Send the statement held in `query` to ClickHouse and keep the driver's return value.
result = client.execute(query)

In [21]:
# Copy every row of the S3-backed table into the MergeTree table.
# `SELECT *` relies on both tables declaring their columns in the same order.
query = '''
INSERT INTO hardda_student_data.prj_main_nav2103
SELECT
  *
FROM 
  hardda_student_data.prj_s3_ext_nav2103 
'''

In [22]:
# Send the statement held in `query` to ClickHouse and keep the driver's return value.
result = client.execute(query)

Checking the result.

In [23]:
# Fetch two rows from the main table; the table name carries no database
# qualifier, so it is resolved against the database chosen for the client.
query = "\nselect * from prj_main_nav2103 limit(2)\n"

result = client.execute(query)

# One row per line.
print("\n".join(str(row) for row in result))

(datetime.date(2020, 11, 12), 134709471, 'Омск', 123482031, 'desktop', None, None, None, None, None, None, None, None, None, 123482031, 'simple_user')
(datetime.date(2020, 11, 12), 136066077, 'Омск', 124162590, 'desktop', None, None, None, None, None, None, None, None, None, 124162590, 'simple_user')


Creating materialized view _agg1. 

In [None]:
# TODO: create the materialized view _agg1 (to be continued).