## Test generator

In [7]:
import requests
import json
import pandas as pd

headers = {"Content-type": "application/json", "Accept": "text/plain"}
url = "http://172.18.0.100:8000/api/v1/event/"

data = {"interval_start": "14/11/2023 18:00:00", "interval_mins": 5, "trx_count": 10000}

r = requests.post(url, data=json.dumps(data), headers=headers)
status = r.status_code
result = r.json()

df = pd.DataFrame(result['Records'])
#df.head()
df.to_csv('../data/test.csv', encoding='utf-8', index=False)

## Spark session

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("spark://172.18.0.2:7077") \
    .appName("Analytics") \
    .getOrCreate()

In [2]:
df =  spark.read.format("csv").load("/data/test.csv")

In [3]:
df.head()

Row(_c0='timestamp', _c1='type', _c2='appName', _c3='appInstance', _c4='appID', _c5='probeID', _c6='eventID', _c7='correletionID', _c8='transactionStart', _c9='transactionEnd', _c10='transactionDuration', _c11='clientIPAddress', _c12='clientPort', _c13='serverIPAddress', _c14='serverPort', _c15='ipProtocol', _c16='category', _c17='bytesFromClient', _c18='bytesToClient', _c19='bytesFromServer', _c20='bytesToServer', _c21='subscriberID', _c22='applicationProtocol', _c23='applicationName', _c24='domain', _c25='deviceType', _c26='networkType', _c27='contentType', _c28='lostBytesClient', _c29='lostBytesServer', _c30='srttMsClient', _c31='srttMsServer')

In [4]:
df.createOrReplaceTempView("events")

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

spark = SparkSession \
    .builder \
    .master("spark://spark-master:7077") \
    .appName("events") \
    .getOrCreate()

 
df = spark.read.options(header='true', inferschema='true').csv('/data/test.csv')

In [7]:
df.createOrReplaceTempView("events")

In [15]:
df_subscribers = spark.sql(""" SELECT subscriberID, count(*) FROM events GROUP BY subscriberID ORDER BY 2 DESC""")

In [16]:
df_subscribers.show()

+-------------+--------+
| subscriberID|count(1)|
+-------------+--------+
|2015050628968|       9|
|2015050504796|       7|
|2015050640060|       6|
|2015050320210|       6|
|2015050417451|       6|
|2015050379007|       6|
|2015050557041|       5|
|2015050920091|       5|
|2015050813674|       5|
|2015050229029|       5|
|2015050466436|       5|
|2015050839006|       5|
|2015050541665|       5|
|2015050291965|       5|
|2015050725509|       5|
|2015050108642|       5|
|2015050738174|       5|
|2015050160983|       5|
|2015050118466|       5|
|2015050504473|       5|
+-------------+--------+
only showing top 20 rows



In [None]:
df_locations = spark.sql(""" SELECT subscriberID, count(*) FROM events GROUP BY subscriberID ORDER BY 2 DESC""")