# Spark Practise 

## Import libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
import os
import socket


aws_access_key = "your_access_key"
aws_secret_key = "your_secret_key"

APACHE_MASTER_IP = socket.gethostbyname("apache-spark-master-0.apache-spark-headless.apache-spark.svc.cluster.local")
APACHE_MASTER_URL = f"spark://{APACHE_MASTER_IP}:7077"
POD_IP = os.environ["MY_POD_IP"]
SPARK_APP_NAME = f"spark-{os.environ['HOSTNAME']}"
JARS = """/nfs/env/lib/python3.8/site-packages/pyspark/jars/clickhouse-native-jdbc-shaded-2.6.5.jar, 
/nfs/env/lib/python3.8/site-packages/pyspark/jars/hadoop-aws-3.3.4.jar,
/nfs/env/lib/python3.8/site-packages/pyspark/jars/aws-java-sdk-bundle-1.12.433.jar
"""

MEM = "512m"
CORES = 1
 
spark = SparkSession.\
        builder.\
        appName(SPARK_APP_NAME).\
        master(APACHE_MASTER_URL).\
        config("spark.executor.memory", MEM).\
        config("spark.jars", JARS).\
        config("spark.executor.cores", CORES).\
        config("spark.driver.host", POD_IP).\
        config("spark.hadoop.fs.s3a.access.key", aws_access_key). \
        config("spark.hadoop.fs.s3a.secret.key", aws_secret_key). \
        config("fs.s3a.endpoint", "https://storage.yandexcloud.net").  \
        config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"). \
        config("spark.hadoop.fs.s3a.path.style.access", True). \
        config("spark.hadoop.fs.s3a.committer.name", "directory"). \
        config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"). \
        getOrCreate()



23/09/09 12:00:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## Task 1

Find the top 3 sellers with the highest daily goals, and then to calculate their share of the total sales made by all sellers. The share should be calculated as a percentage.

### Solution

Reading data.

In [3]:
sellers_df = spark.read.parquet('s3a://kc-hardda-projects/shared/sellers.parquet')

23/09/09 12:00:19 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


                                                                                

Checking data. 

In [4]:
sellers_df.show()

[Stage 1:>                                                          (0 + 1) / 1]

+---------+-----------+------------+
|seller_id|seller_name|daily_target|
+---------+-----------+------------+
|        0|   seller_0|        2500|
|        1|   seller_1|       16451|
|        2|   seller_2|        2855|
|        3|   seller_3|       19103|
|        4|   seller_4|        8820|
|        5|   seller_5|       14894|
|        6|   seller_6|        7928|
|        7|   seller_7|       17022|
|        8|   seller_8|       19924|
|        9|   seller_9|        6496|
+---------+-----------+------------+



                                                                                

Calculating the total over daily_target column. 

In [5]:
total_target = sellers_df.agg({'daily_target': 'sum'}).collect()[0][0]
total_target

                                                                                

115993

Calculating a seller share of the total sales made by all sallers and showing the top-3 list. 

In [6]:
result_df = sellers_df.withColumn('daily_target_percentage',
                      sellers_df['daily_target'] / total_target) \
    .orderBy('daily_target_percentage', ascending=False).limit(3)

result_df.show()

[Stage 5:>                                                          (0 + 1) / 1]

+---------+-----------+------------+-----------------------+
|seller_id|seller_name|daily_target|daily_target_percentage|
+---------+-----------+------------+-----------------------+
|        8|   seller_8|       19924|    0.17176898605950358|
|        3|   seller_3|       19103|    0.16469097273111308|
|        7|   seller_7|       17022|     0.1467502349279698|
+---------+-----------+------------+-----------------------+



                                                                                

Saving results to a parquet file. 

In [7]:
#pip install pyarrow

In [8]:
fin_df = result_df.toPandas()
fin_df.to_parquet('task_1_result.parquet')

                                                                                

## Task 2

Find top 5 days with the largest amount of sales. 

### Solution

Reading data. 

In [9]:
sales_df = spark.read.parquet('s3a://kc-hardda-projects/shared/sales.parquet')

Checking data. 

In [10]:
sales_df.show()

[Stage 8:>                                                          (0 + 1) / 1]

+--------+----------+---------+----------+---------------+--------------------+
|order_id|product_id|seller_id|      date|num_pieces_sold|       bill_raw_text|
+--------+----------+---------+----------+---------------+--------------------+
|       1|         0|        0|2023-07-02|              7|itgntlqljxnCgkeax...|
|       2|         0|        0|2023-07-05|              9|ymkndxcgbyyqugbgp...|
|       3|         0|        0|2023-07-05|              7|qwbfznfqwfgkuXrnn...|
|       4|         0|        0|2023-07-03|              5|tybsrjpfucpitwqxk...|
|       5|         0|        0|2023-07-04|              3|crrjscajxlmugzmyg...|
|       6|         0|        0|2023-07-03|              1|blxycrmmzmaiarooj...|
|       7|         0|        0|2023-07-09|              8|ymidneomjcfnircqz...|
|       8|         0|        0|2023-07-02|              5|euavsmtstttcjxipl...|
|       9|         0|        0|2023-07-01|              7|jpwsylnygmxdfnswc...|
|      10|         0|        0|2023-07-0

                                                                                

Let's work only with columns that we need. 

In [11]:
sales_df_filt = sales_df[['date', 'num_pieces_sold']]
sales_df_filt.show()

[Stage 9:>                                                          (0 + 1) / 1]

+----------+---------------+
|      date|num_pieces_sold|
+----------+---------------+
|2023-07-02|              7|
|2023-07-05|              9|
|2023-07-05|              7|
|2023-07-03|              5|
|2023-07-04|              3|
|2023-07-03|              1|
|2023-07-09|              8|
|2023-07-02|              5|
|2023-07-01|              7|
|2023-07-05|              3|
|2023-07-03|              4|
|2023-07-10|              4|
|2023-07-04|              6|
|2023-07-08|              5|
|2023-07-03|              9|
|2023-07-06|              2|
|2023-07-07|              9|
|2023-07-09|              5|
|2023-07-02|              2|
|2023-07-04|              4|
+----------+---------------+
only showing top 20 rows



                                                                                

Finding top-5 days with the most total units sold. 

In [12]:
result_df = sales_df_filt.groupBy('date').agg({'num_pieces_sold': 'sum'}) \
    .select('date', col('sum(num_pieces_sold)').alias('total_units_sold')) \
    .orderBy('total_units_sold', ascending=False).limit(5)

result_df.show()



+----------+----------------+
|      date|total_units_sold|
+----------+----------------+
|2023-07-07|         1104297|
|2023-07-01|         1102564|
|2023-07-04|         1102133|
|2023-07-02|         1101015|
|2023-07-10|         1100401|
+----------+----------------+



                                                                                

Saving results to a parquet file.

In [13]:
result_df.toPandas().to_parquet('task_2_result.parquet')

                                                                                

Stopping Spark.

In [None]:
spark.stop()