# Spark Practise 

## Import libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
import os
import socket


aws_access_key = "your_access_key"
aws_secret_key = "your_secret_key"

APACHE_MASTER_IP = socket.gethostbyname("apache-spark-master-0.apache-spark-headless.apache-spark.svc.cluster.local")
APACHE_MASTER_URL = f"spark://{APACHE_MASTER_IP}:7077"
POD_IP = os.environ["MY_POD_IP"]
SPARK_APP_NAME = f"spark-{os.environ['HOSTNAME']}"
JARS = """/nfs/env/lib/python3.8/site-packages/pyspark/jars/clickhouse-native-jdbc-shaded-2.6.5.jar, 
/nfs/env/lib/python3.8/site-packages/pyspark/jars/hadoop-aws-3.3.4.jar,
/nfs/env/lib/python3.8/site-packages/pyspark/jars/aws-java-sdk-bundle-1.12.433.jar
"""

MEM = "512m"
CORES = 1
 
spark = SparkSession.\
        builder.\
        appName(SPARK_APP_NAME).\
        master(APACHE_MASTER_URL).\
        config("spark.executor.memory", MEM).\
        config("spark.jars", JARS).\
        config("spark.executor.cores", CORES).\
        config("spark.driver.host", POD_IP).\
        config("spark.hadoop.fs.s3a.access.key", aws_access_key). \
        config("spark.hadoop.fs.s3a.secret.key", aws_secret_key). \
        config("fs.s3a.endpoint", "https://storage.yandexcloud.net").  \
        config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"). \
        config("spark.hadoop.fs.s3a.path.style.access", True). \
        config("spark.hadoop.fs.s3a.committer.name", "directory"). \
        config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"). \
        getOrCreate()



23/09/09 13:24:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## Task 1

Find the top 3 sellers with the highest daily goals, and then to calculate their share of the total sales made by all sellers. The share should be calculated as a percentage.

### Solution

Reading data.

In [3]:
sellers_df = spark.read.parquet('s3a://kc-hardda-projects/shared/sellers.parquet')

23/09/09 13:24:13 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


                                                                                

Checking data. 

In [4]:
sellers_df.show()

[Stage 1:>                                                          (0 + 1) / 1]

+---------+-----------+------------+
|seller_id|seller_name|daily_target|
+---------+-----------+------------+
|        0|   seller_0|        2500|
|        1|   seller_1|       16451|
|        2|   seller_2|        2855|
|        3|   seller_3|       19103|
|        4|   seller_4|        8820|
|        5|   seller_5|       14894|
|        6|   seller_6|        7928|
|        7|   seller_7|       17022|
|        8|   seller_8|       19924|
|        9|   seller_9|        6496|
+---------+-----------+------------+



                                                                                

Calculating the total over daily_target column. 

In [5]:
total_target = sellers_df.agg({'daily_target': 'sum'}).collect()[0][0]
total_target

                                                                                

115993

Calculating a seller share of the total sales made by all sallers and showing the top-3 list. 

In [6]:
result_df = sellers_df.withColumn('daily_target_percentage',
                      sellers_df['daily_target'] / total_target) \
    .orderBy('daily_target_percentage', ascending=False).limit(3)

result_df.show()

[Stage 5:>                                                          (0 + 1) / 1]

+---------+-----------+------------+-----------------------+
|seller_id|seller_name|daily_target|daily_target_percentage|
+---------+-----------+------------+-----------------------+
|        8|   seller_8|       19924|    0.17176898605950358|
|        3|   seller_3|       19103|    0.16469097273111308|
|        7|   seller_7|       17022|     0.1467502349279698|
+---------+-----------+------------+-----------------------+



                                                                                

Saving results to a parquet file. 

In [7]:
#pip install pyarrow

In [8]:
fin_df = result_df.toPandas()
fin_df.to_parquet('task_1_result.parquet')

                                                                                

## Task 2

Find top 5 days with the largest amount of sales. 

### Solution

Reading data. 

In [9]:
sales_df = spark.read.parquet('s3a://kc-hardda-projects/shared/sales.parquet')

Checking data. 

In [10]:
sales_df.show()

[Stage 8:>                                                          (0 + 1) / 1]

+--------+----------+---------+----------+---------------+--------------------+
|order_id|product_id|seller_id|      date|num_pieces_sold|       bill_raw_text|
+--------+----------+---------+----------+---------------+--------------------+
|       1|         0|        0|2023-07-02|              7|itgntlqljxnCgkeax...|
|       2|         0|        0|2023-07-05|              9|ymkndxcgbyyqugbgp...|
|       3|         0|        0|2023-07-05|              7|qwbfznfqwfgkuXrnn...|
|       4|         0|        0|2023-07-03|              5|tybsrjpfucpitwqxk...|
|       5|         0|        0|2023-07-04|              3|crrjscajxlmugzmyg...|
|       6|         0|        0|2023-07-03|              1|blxycrmmzmaiarooj...|
|       7|         0|        0|2023-07-09|              8|ymidneomjcfnircqz...|
|       8|         0|        0|2023-07-02|              5|euavsmtstttcjxipl...|
|       9|         0|        0|2023-07-01|              7|jpwsylnygmxdfnswc...|
|      10|         0|        0|2023-07-0

                                                                                

Let's work only with columns that we need. 

In [11]:
sales_df_filt = sales_df[['date', 'num_pieces_sold']]
sales_df_filt.show()

[Stage 9:>                                                          (0 + 1) / 1]

+----------+---------------+
|      date|num_pieces_sold|
+----------+---------------+
|2023-07-02|              7|
|2023-07-05|              9|
|2023-07-05|              7|
|2023-07-03|              5|
|2023-07-04|              3|
|2023-07-03|              1|
|2023-07-09|              8|
|2023-07-02|              5|
|2023-07-01|              7|
|2023-07-05|              3|
|2023-07-03|              4|
|2023-07-10|              4|
|2023-07-04|              6|
|2023-07-08|              5|
|2023-07-03|              9|
|2023-07-06|              2|
|2023-07-07|              9|
|2023-07-09|              5|
|2023-07-02|              2|
|2023-07-04|              4|
+----------+---------------+
only showing top 20 rows



                                                                                

Finding top-5 days with the most total units sold. 

In [12]:
result_df = sales_df_filt.groupBy('date').agg({'num_pieces_sold': 'sum'}) \
    .select('date', col('sum(num_pieces_sold)').alias('total_units_sold')) \
    .orderBy('total_units_sold', ascending=False).limit(5)

result_df.show()



+----------+----------------+
|      date|total_units_sold|
+----------+----------------+
|2023-07-07|         1104297|
|2023-07-01|         1102564|
|2023-07-04|         1102133|
|2023-07-02|         1101015|
|2023-07-10|         1100401|
+----------+----------------+



                                                                                

Saving results to a parquet file.

In [13]:
result_df.toPandas().to_parquet('task_2_result.parquet')

                                                                                

## Task 3

The task is to find the suffix for each product and then calculate the average price for each suffix.  
The results should be sorted by the suffix in ascending order.  
The suffix is defined as the last six characters of the product name.  
The final DataFrame should contain 75,000 rows.

### Solution

Reading data.

In [14]:
products_df = spark.read.parquet('s3a://kc-hardda-projects/shared/products.parquet')

Checking data. 

In [15]:
products_df.show()

[Stage 17:>                                                         (0 + 1) / 1]

+----------+------------+-----+
|product_id|product_name|price|
+----------+------------+-----+
|         0|   product_0|   22|
|         1|   product_1|   30|
|         2|   product_2|   91|
|         3|   product_3|   37|
|         4|   product_4|  145|
|         5|   product_5|  128|
|         6|   product_6|   66|
|         7|   product_7|  145|
|         8|   product_8|   51|
|         9|   product_9|   44|
|        10|  product_10|   53|
|        11|  product_11|   13|
|        12|  product_12|  104|
|        13|  product_13|  102|
|        14|  product_14|   24|
|        15|  product_15|   14|
|        16|  product_16|   38|
|        17|  product_17|   72|
|        18|  product_18|   16|
|        19|  product_19|   46|
+----------+------------+-----+
only showing top 20 rows



                                                                                

Creating product_suffix column. 

In [16]:
products_df = products_df.withColumn('product_suffix', substring(col('product_name'), -6, 6))
products_df.show()

[Stage 18:>                                                         (0 + 1) / 1]

+----------+------------+-----+--------------+
|product_id|product_name|price|product_suffix|
+----------+------------+-----+--------------+
|         0|   product_0|   22|        duct_0|
|         1|   product_1|   30|        duct_1|
|         2|   product_2|   91|        duct_2|
|         3|   product_3|   37|        duct_3|
|         4|   product_4|  145|        duct_4|
|         5|   product_5|  128|        duct_5|
|         6|   product_6|   66|        duct_6|
|         7|   product_7|  145|        duct_7|
|         8|   product_8|   51|        duct_8|
|         9|   product_9|   44|        duct_9|
|        10|  product_10|   53|        uct_10|
|        11|  product_11|   13|        uct_11|
|        12|  product_12|  104|        uct_12|
|        13|  product_13|  102|        uct_13|
|        14|  product_14|   24|        uct_14|
|        15|  product_15|   14|        uct_15|
|        16|  product_16|   38|        uct_16|
|        17|  product_17|   72|        uct_17|
|        18| 

                                                                                

Calculating average price for each product_suffix. 

In [17]:
result_df = products_df.select('product_suffix', 'price') \
    .groupBy('product_suffix').agg({'price': 'avg'}) \
    .orderBy('product_suffix') \
    .withColumnRenamed('avg(price)', 'avg_price') \
    .withColumnRenamed('product_suffix', 'name_suffix')

result_df.show()

[Stage 19:>                                                         (0 + 1) / 1]

+-----------+---------+
|name_suffix|avg_price|
+-----------+---------+
|     _10000|    143.0|
|     _10001|    149.0|
|     _10002|     90.0|
|     _10003|    137.0|
|     _10004|     89.0|
|     _10005|     14.0|
|     _10006|    147.0|
|     _10007|     75.0|
|     _10008|    134.0|
|     _10009|     50.0|
|     _10010|    101.0|
|     _10011|    120.0|
|     _10012|      7.0|
|     _10013|     86.0|
|     _10014|     13.0|
|     _10015|    106.0|
|     _10016|     38.0|
|     _10017|     50.0|
|     _10018|     34.0|
|     _10019|     57.0|
+-----------+---------+
only showing top 20 rows



                                                                                

Showing our final dataframe size. 

In [18]:
result_df.count()

                                                                                

75000

Saving results to a parquet file.

In [19]:
result_df.toPandas().to_parquet('task_3_result.parquet')

                                                                                

## Task 4

We need to find the average daily income for each seller and identify the top 3 sellers with the highest average daily income.  
To solve this problem, we will use all three tables.

### Solution

Reading data. 

In [20]:
products_df = spark.read.parquet('s3a://kc-hardda-projects/shared/products.parquet')
sales_df = spark.read.parquet('s3a://kc-hardda-projects/shared/sales.parquet')
sellers_df = spark.read.parquet('s3a://kc-hardda-projects/shared/sellers.parquet')

Checking data.

In [21]:
products_df.limit(5).show()

[Stage 39:>                                                         (0 + 1) / 1]

+----------+------------+-----+
|product_id|product_name|price|
+----------+------------+-----+
|         0|   product_0|   22|
|         1|   product_1|   30|
|         2|   product_2|   91|
|         3|   product_3|   37|
|         4|   product_4|  145|
+----------+------------+-----+



                                                                                

In [22]:
sales_df.limit(5).show()



+--------+----------+---------+----------+---------------+--------------------+
|order_id|product_id|seller_id|      date|num_pieces_sold|       bill_raw_text|
+--------+----------+---------+----------+---------------+--------------------+
|       1|         0|        0|2023-07-02|              7|itgntlqljxnCgkeax...|
|       2|         0|        0|2023-07-05|              9|ymkndxcgbyyqugbgp...|
|       3|         0|        0|2023-07-05|              7|qwbfznfqwfgkuXrnn...|
|       4|         0|        0|2023-07-03|              5|tybsrjpfucpitwqxk...|
|       5|         0|        0|2023-07-04|              3|crrjscajxlmugzmyg...|
+--------+----------+---------+----------+---------------+--------------------+



                                                                                

In [23]:
sellers_df.limit(5).show()

[Stage 45:>                                                         (0 + 1) / 1]

+---------+-----------+------------+
|seller_id|seller_name|daily_target|
+---------+-----------+------------+
|        0|   seller_0|        2500|
|        1|   seller_1|       16451|
|        2|   seller_2|        2855|
|        3|   seller_3|       19103|
|        4|   seller_4|        8820|
+---------+-----------+------------+



                                                                                

Let's JOIN products and sales dataframes together. 

In [24]:
products_sales = products_df.select('product_id', 'price') \
    .join(sales_df.select('product_id', 'seller_id', 'date', 'num_pieces_sold'),
          on='product_id', how='inner')
products_sales.show()

[Stage 49:>                                                         (0 + 1) / 1]

+----------+-----+---------+----------+---------------+
|product_id|price|seller_id|      date|num_pieces_sold|
+----------+-----+---------+----------+---------------+
|         0|   22|        0|2023-07-02|              7|
|         0|   22|        0|2023-07-05|              9|
|         0|   22|        0|2023-07-05|              7|
|         0|   22|        0|2023-07-03|              5|
|         0|   22|        0|2023-07-04|              3|
|         0|   22|        0|2023-07-03|              1|
|         0|   22|        0|2023-07-09|              8|
|         0|   22|        0|2023-07-02|              5|
|         0|   22|        0|2023-07-01|              7|
|         0|   22|        0|2023-07-05|              3|
|         0|   22|        0|2023-07-03|              4|
|         0|   22|        0|2023-07-10|              4|
|         0|   22|        0|2023-07-04|              6|
|         0|   22|        0|2023-07-08|              5|
|         0|   22|        0|2023-07-03|         

                                                                                

Calculate revenue for each product. 

In [25]:
products_sales = products_sales.withColumn('revenue', col('price') * col('num_pieces_sold'))
products_sales.show()

[Stage 51:>                                                         (0 + 1) / 1]

+----------+-----+---------+----------+---------------+-------+
|product_id|price|seller_id|      date|num_pieces_sold|revenue|
+----------+-----+---------+----------+---------------+-------+
|         0|   22|        0|2023-07-02|              7|    154|
|         0|   22|        0|2023-07-05|              9|    198|
|         0|   22|        0|2023-07-05|              7|    154|
|         0|   22|        0|2023-07-03|              5|    110|
|         0|   22|        0|2023-07-04|              3|     66|
|         0|   22|        0|2023-07-03|              1|     22|
|         0|   22|        0|2023-07-09|              8|    176|
|         0|   22|        0|2023-07-02|              5|    110|
|         0|   22|        0|2023-07-01|              7|    154|
|         0|   22|        0|2023-07-05|              3|     66|
|         0|   22|        0|2023-07-03|              4|     88|
|         0|   22|        0|2023-07-10|              4|     88|
|         0|   22|        0|2023-07-04| 

                                                                                

Let's add sellers names.

In [26]:
sellers_sales = products_sales.select('seller_id', 'date', 'revenue') \
    .join(sellers_df.select('seller_id', 'seller_name'), on='seller_id', how='inner')
sellers_sales.show()

[Stage 54:>                                                         (0 + 1) / 1]

+---------+----------+-------+-----------+
|seller_id|      date|revenue|seller_name|
+---------+----------+-------+-----------+
|        0|2023-07-02|    154|   seller_0|
|        0|2023-07-05|    198|   seller_0|
|        0|2023-07-05|    154|   seller_0|
|        0|2023-07-03|    110|   seller_0|
|        0|2023-07-04|     66|   seller_0|
|        0|2023-07-03|     22|   seller_0|
|        0|2023-07-09|    176|   seller_0|
|        0|2023-07-02|    110|   seller_0|
|        0|2023-07-01|    154|   seller_0|
|        0|2023-07-05|     66|   seller_0|
|        0|2023-07-03|     88|   seller_0|
|        0|2023-07-10|     88|   seller_0|
|        0|2023-07-04|    132|   seller_0|
|        0|2023-07-08|    110|   seller_0|
|        0|2023-07-03|    198|   seller_0|
|        0|2023-07-06|     44|   seller_0|
|        0|2023-07-07|    198|   seller_0|
|        0|2023-07-09|    110|   seller_0|
|        0|2023-07-02|     44|   seller_0|
|        0|2023-07-04|     88|   seller_0|
+---------+

                                                                                

Calculating daily income for each seller. 

In [27]:
daily_sales = sellers_sales.groupBy(['seller_id', 'seller_name', 'date']) \
    .agg({'revenue': 'sum'}).withColumnRenamed('sum(revenue)', 'sum_revenue')
daily_sales.show()



+---------+-----------+----------+-----------+
|seller_id|seller_name|      date|sum_revenue|
+---------+-----------+----------+-----------+
|        5|   seller_5|2023-07-04|     481266|
|        2|   seller_2|2023-07-03|     445869|
|        9|   seller_9|2023-07-06|     460405|
|        5|   seller_5|2023-07-10|     443163|
|        2|   seller_2|2023-07-08|     452653|
|        8|   seller_8|2023-07-02|     449880|
|        9|   seller_9|2023-07-03|     472991|
|        6|   seller_6|2023-07-08|     469470|
|        2|   seller_2|2023-07-04|     469444|
|        4|   seller_4|2023-07-03|     478116|
|        7|   seller_7|2023-07-02|     454340|
|        5|   seller_5|2023-07-03|     436854|
|        4|   seller_4|2023-07-01|     441512|
|        6|   seller_6|2023-07-03|     490072|
|        8|   seller_8|2023-07-01|     461571|
|        4|   seller_4|2023-07-08|     474407|
|        5|   seller_5|2023-07-02|     451272|
|        9|   seller_9|2023-07-07|     472201|
|        6|  

                                                                                

Calculating average daily sales by a seller and showing top 3 sellers with the highest average daily income.

In [28]:
result_df = daily_sales.groupBy(['seller_id', 'seller_name']) \
    .agg({'sum_revenue': 'mean'}) \
    .withColumnRenamed('avg(sum_revenue)', 'avg_daily_revenue') \
    .orderBy('avg_daily_revenue', ascending=False).limit(3)

result_df.show()



+---------+-----------+-----------------+
|seller_id|seller_name|avg_daily_revenue|
+---------+-----------+-----------------+
|        0|   seller_0|     2.29960082E7|
|        8|   seller_8|         468926.8|
|        3|   seller_3|         466907.3|
+---------+-----------+-----------------+



                                                                                

Saving results to a parquet file.

In [29]:
result_df.toPandas().to_parquet('task_4_result.parquet')

                                                                                

Stopping Spark.

In [30]:
spark.stop()