In [None]:
from pyspark.sql import SparkSession
import pandas as pd

In [None]:
# Build (or reuse) the local SparkSession used by every cell below.
# Fix: the streaming schema-inference key is `spark.sql.streaming.schemaInference`
# (lower-case "s"); the previous spelling `...SchemaInference` is not a known key,
# so the setting was never applied.
# NOTE(review): with master `local[*]` the executor.* / cores.max settings are
# presumably ignored (they target cluster deployments) — confirm before relying on them.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('test_streaming')
    .config("spark.sql.streaming.schemaInference", "true")
    .config("spark.executor.memory", "3g")
    .config("spark.executor.cores", "4")
    .config("spark.cores.max", "2")
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# Bare expression: the notebook renders the SparkSession object as this cell's output.
spark

In [None]:
# Batch-read a single day's CSV, treating the first line as column names.
# No schema is supplied here; the next cells inspect what Spark assigns.
df = (
    spark.read
    .option("header", "true")
    .csv("data/atictivity_data/2010-12-01.csv")
)

In [None]:
# Print the schema Spark assigned to the CSV that was loaded without an explicit schema.
df.printSchema()

In [None]:
# Bare expression: show the same schema as an object (the cell output) rather than printed text.
df.schema

In [None]:
# Load only the first 10 data rows of the same file with pandas.
# NOTE: this rebinds `df` — from here on it is a pandas DataFrame, not the
# Spark DataFrame created earlier, until `df` is reassigned again.
df = pd.read_csv(
    "data/atictivity_data/2010-12-01.csv",
    nrows=10,
)

In [None]:
# Bare expression: render the pandas sample (read with nrows=10) as the cell output.
df

In [None]:
# Convert the pandas sample into a Spark DataFrame; no schema is passed,
# so Spark derives the column types from the pandas data.
df_spark = spark.createDataFrame(data=df)

In [None]:
# Print the rows of the Spark DataFrame built from the pandas sample.
df_spark.show()

In [None]:
# Print the schema Spark derived from the pandas sample.
df_spark.printSchema()

In [None]:
# Bare expression: show the derived schema as an object (the cell output).
df_spark.schema

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [None]:
# Explicit schema for the activity CSV files: eight nullable columns.
# NOTE(review): these types look like they mirror what was inferred from the
# 10-row pandas sample — confirm they hold for every file in the dataset
# (values that cannot be parsed as the declared type will not load as intended).
schema = (
    StructType()
    .add("InvoiceNo", IntegerType(), True)
    .add("StockCode", StringType(), True)
    .add("Description", StringType(), True)
    .add("Quantity", IntegerType(), True)
    .add("InvoiceDate", StringType(), True)
    .add("UnitPrice", DoubleType(), True)
    .add("CustomerID", DoubleType(), True)
    .add("Country", StringType(), True)
)

In [None]:
# Re-read the single-day CSV as a batch DataFrame, this time applying the
# explicit `schema` and still treating the first line as a header.
# NOTE: rebinds `df` back to a Spark DataFrame.
df = (
    spark.read
    .option("header", "true")
    .schema(schema)
    .csv("data/atictivity_data/2010-12-01.csv")
)

In [None]:
# Print the schema of the batch read that used the explicit `schema`.
df.printSchema()

In [None]:
# Streaming read over every CSV in the data directory, using the explicit
# `schema` and picking up at most one new file per trigger.
# Fix: set header=true, matching the batch read of the same files. Without it
# each file's header line is ingested as a data row, which distorts the
# per-CustomerID counts computed downstream.
# NOTE: despite its name, `static` is a *streaming* DataFrame (built from
# `spark.readStream`); the name is kept because later cells reference it.
static = (
    spark.readStream
    .schema(schema)
    .option("header", "true")
    .option("maxFilesPerTrigger", 1)
    .csv("data/atictivity_data/")
)

In [None]:
# Set Spark SQL's shuffle partition count to 5 for this session; done before the
# streaming aggregation below is defined and started.
# NOTE(review): presumably chosen to keep the local demo lightweight — confirm.
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [None]:
# Aggregation over the streaming input: number of rows per CustomerID.
groupby_customerid = (
    static
    .groupBy("CustomerID")
    .count()
)

In [None]:
# Start the streaming query: results go to the in-memory sink under the query
# name "activity_counts" (queried via spark.sql in a later cell), using
# "complete" output mode.
activity_query = (
    groupby_customerid.writeStream
    .format("memory")
    .outputMode("complete")
    .queryName("activity_counts")
    .start()
)

In [None]:
# Bare expression: list the streaming queries currently active on this session.
spark.streams.active

In [None]:
from time import sleep

In [None]:
# Poll the in-memory result table five times, pausing one second after each
# snapshot, to watch the counts change as the stream processes more files.
for _poll in range(5):
    snapshot = spark.sql("SELECT * FROM activity_counts ")
    snapshot.show()
    sleep(1)

In [None]:
# Block this cell until the streaming query terminates (or the kernel is
# interrupted). The `finally` guarantees the query is stopped on the way out;
# otherwise an interrupted cell leaves the query running in the background and
# re-running the start cell fails because the query name is still active.
try:
    activity_query.awaitTermination()
finally:
    activity_query.stop()