In [1]:
# Data generator: writes a synthetic (id, timestamp) event log to CSV.
from pathlib import Path

import numpy as np
import pandas as pd

SEED = 42                # fixed seed so "Restart & Run All" regenerates the same file
N_ROWS = 100000          # number of log records to generate
N_IDS = 2000             # ids are drawn uniformly from [0, N_IDS)
TS_START = 1562007679    # inclusive lower bound, Unix-epoch seconds
TS_END = 1564606800      # exclusive upper bound, Unix-epoch seconds
LOGS_CSV = Path('data/1-logs.csv')

# A private RandomState keeps the draw reproducible without touching the
# global numpy random state used elsewhere in the kernel.
rng = np.random.RandomState(SEED)
logs = pd.DataFrame(
    {
        'id': rng.randint(0, N_IDS, size=N_ROWS),
        'timestamp': rng.randint(TS_START, TS_END, size=N_ROWS),
    },
    columns=['id', 'timestamp'],
)

# to_csv does not create missing directories, so make sure data/ exists first.
LOGS_CSV.parent.mkdir(parents=True, exist_ok=True)
logs.to_csv(LOGS_CSV, index=False)

In [2]:
import findspark
findspark.init()  # must run before the pyspark imports below so they can be resolved
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [3]:
# Start (or reuse) a local Spark session and load the generated log.
#
# With master "local[2]" the whole job runs inside the single driver JVM on two
# worker threads, so only the driver memory is configured. Executor sizing,
# dynamic allocation and the external shuffle service are cluster-manager
# settings that have no effect in local mode and were dropped.
# NOTE(review): spark.driver.memory is only honoured if no JVM has been started
# in this kernel yet - restart the kernel after changing it.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("SparkFirst")
    .config("spark.driver.memory", "16g")
    .getOrCreate()
)

# The header row supplies the column names; inferSchema makes Spark scan the
# file to type the columns instead of reading everything as strings.
df = spark.read.csv("data/1-logs.csv", header=True, inferSchema=True)

In [4]:
# Average daily session length per id, in hours.
#
# A "session" here is the span between an id's first and last event on one
# calendar day. Step 1 collapses the log to one row per (id, date) holding that
# span in seconds; step 2 averages the spans per id and converts to hours.
#
# A plain groupBy replaces the earlier window + distinct() formulation: it
# yields the same one-row-per-(id, date) result without computing the span for
# every input row and de-duplicating afterwards.
# NOTE: from_unixtime renders the date in the Spark session time zone, so day
# boundaries follow that zone rather than UTC.
event_date = F.from_unixtime(F.col("timestamp"), "yyyy-MM-dd")

daily_sessions = (
    df.groupBy("id", event_date.alias("date"))
      .agg((F.max("timestamp") - F.min("timestamp")).alias("s_len"))
)

(
    daily_sessions
    .groupBy("id")
    .agg(F.round(F.avg("s_len") / 3600, 2).alias("avg_s_len_hours"))
    .orderBy("id")
    .show()
)

+---+---------------+
| id|avg_s_len_hours|
+---+---------------+
|  0|           3.62|
|  1|           6.11|
|  2|           5.84|
|  3|            5.4|
|  4|           6.81|
|  5|           8.17|
|  6|           8.89|
|  7|           4.89|
|  8|            8.7|
|  9|           5.38|
| 10|           4.94|
| 11|           3.64|
| 12|           6.72|
| 13|           8.58|
| 14|            4.8|
| 15|           6.22|
| 16|            7.7|
| 17|           8.63|
| 18|           8.05|
| 19|           9.35|
+---+---------------+
only showing top 20 rows

