## W205 - Fall 2019 - Project 3 - Understanding User Behavior

### Nobu Yamaguchi

#### Background
- I am a data scientist at a game development company. 
- Our latest mobile game has two events we are interested in tracking: `purchase a sword` & `join guild`
- Each has metadata characteristic of such events.

In [3]:
import json

#### Read from kafka

In [3]:
raw_events = spark.read.format("kafka").option("kafka.bootstrap.servers", "kafka:29092").option("subscribe","events").option("startingOffsets", "earliest").option("endingOffsets", "latest").load() 

#### Explore out events

In [4]:
events = raw_events.select(raw_events.value.cast('string'))

In [5]:
extracted_events = events.rdd.map(lambda x: json.loads(x.value)).toDF()



In [6]:
extracted_events.show()

+------+--------------+-----------+--------------+
|Accept|          Host| User-Agent|    event_type|
+------+--------------+-----------+--------------+
|   */*|localhost:5000|curl/7.47.0|       default|
|   */*|localhost:5000|curl/7.47.0|purchase_sword|
|   */*|localhost:5000|curl/7.47.0|purchase_knife|
|   */*|localhost:5000|curl/7.47.0| purchase_frog|
|   */*|localhost:5000|curl/7.47.0|    ride_horse|
|   */*|localhost:5000|curl/7.47.0|climb_mountain|
|   */*|localhost:5000|curl/7.47.0|    ride_horse|
|   */*|localhost:5000|curl/7.47.0| purchase_frog|
+------+--------------+-----------+--------------+



In [7]:
extracted_events \
        .write \
        .parquet("/tmp/extracted_events")

#### Transform events

In [9]:
@udf('string')
def munge_event(event_as_json):
    event = json.loads(event_as_json)
    event['Host'] = "moe"
    event['Cache-Control'] = "no-cache"
    return json.dumps(event)

raw_events = spark \
        .read \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "kafka:29092") \
        .option("subscribe", "events") \
        .option("startingOffsets", "earliest") \
        .option("endingOffsets", "latest") \
        .load()

munged_events = raw_events \
    .select(raw_events.value.cast('string').alias('raw'),
            raw_events.timestamp.cast('string')) \
    .withColumn('munged', munge_event('raw'))
munged_events.show()

extracted_events = munged_events \
    .rdd \
    .map(lambda r: Row(timestamp=r.timestamp, **json.loads(r.munged))) \
    .toDF()
extracted_events.show()

extracted_events \
    .write \
    .mode("overwrite") \
    .parquet("/tmp/extracted_events")

+--------------------+--------------------+--------------------+
|                 raw|           timestamp|              munged|
+--------------------+--------------------+--------------------+
|{"Host": "localho...|2019-11-10 05:04:...|{"Host": "moe", "...|
|{"Host": "localho...|2019-11-10 05:05:...|{"Host": "moe", "...|
|{"Host": "localho...|2019-11-10 05:06:...|{"Host": "moe", "...|
|{"Host": "localho...|2019-11-10 05:07:...|{"Host": "moe", "...|
|{"Host": "localho...|2019-11-10 05:07:...|{"Host": "moe", "...|
|{"Host": "localho...|2019-11-10 05:07:...|{"Host": "moe", "...|
|{"Host": "localho...|2019-11-10 05:41:...|{"Host": "moe", "...|
|{"Host": "localho...|2019-11-10 05:42:...|{"Host": "moe", "...|
+--------------------+--------------------+--------------------+

+------+-------------+----+-----------+--------------+--------------------+
|Accept|Cache-Control|Host| User-Agent|    event_type|           timestamp|
+------+-------------+----+-----------+--------------+-------------

#### Separate events

In [10]:
# separate.py
import json

from pyspark.sql import Row
from pyspark.sql.functions import udf

@udf('string')
def munge_event(event_as_json):
    event = json.loads(event_as_json)
    event['Host'] = "moe"
    event['Cache-Control'] = "no-cache"
    return json.dumps(event)

raw_events = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "events") \
    .option("startingOffsets", "earliest") \
    .option("endingOffsets", "latest") \
    .load()

munged_events = raw_events \
    .select(raw_events.value.cast('string').alias('raw'),
            raw_events.timestamp.cast('string')) \
    .withColumn('munged', munge_event('raw'))

extracted_events = munged_events \
    .rdd \
    .map(lambda r: Row(timestamp=r.timestamp, **json.loads(r.munged))) \
    .toDF()

sword_purchases = extracted_events \
    .filter(extracted_events.event_type == 'purchase_sword')
sword_purchases.show()
sword_purchases \
    .write \
    .mode("overwrite") \
    .parquet("/tmp/sword_purchases")
    
default_hits = extracted_events \
    .filter(extracted_events.event_type == 'default')
default_hits.show()
default_hits \
    .write \
    .mode("overwrite") \
    .parquet("/tmp/default_hits")

+------+-------------+----+-----------+--------------+--------------------+
|Accept|Cache-Control|Host| User-Agent|    event_type|           timestamp|
+------+-------------+----+-----------+--------------+--------------------+
|   */*|     no-cache| moe|curl/7.47.0|purchase_sword|2019-11-10 05:05:...|
+------+-------------+----+-----------+--------------+--------------------+

+------+-------------+----+-----------+----------+--------------------+
|Accept|Cache-Control|Host| User-Agent|event_type|           timestamp|
+------+-------------+----+-----------+----------+--------------------+
|   */*|     no-cache| moe|curl/7.47.0|   default|2019-11-10 05:04:...|
+------+-------------+----+-----------+----------+--------------------+



#### Filtered event (purchase sword)

In [1]:
import json
from pyspark.sql import Row
from pyspark.sql.functions import udf


@udf('boolean')
def is_purchase(event_as_json):
    event = json.loads(event_as_json)
    if event['event_type'] == 'purchase_sword':
        return True
    return False


raw_events = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "events") \
    .option("startingOffsets", "earliest") \
    .option("endingOffsets", "latest") \
    .load()

purchase_events = raw_events \
    .select(raw_events.value.cast('string').alias('raw'),
            raw_events.timestamp.cast('string')) \
    .filter(is_purchase('raw'))

extracted_purchase_events = purchase_events \
    .rdd \
    .map(lambda r: Row(timestamp=r.timestamp, **json.loads(r.raw))) \
    .toDF()
extracted_purchase_events.printSchema()
extracted_purchase_events.show()

extracted_purchase_events \
    .write \
    .mode('overwrite') \
    .parquet('/tmp/purchases')


root
 |-- Accept: string (nullable = true)
 |-- Host: string (nullable = true)
 |-- User-Agent: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: string (nullable = true)

+------+-----------------+---------------+--------------+--------------------+
|Accept|             Host|     User-Agent|    event_type|           timestamp|
+------+-----------------+---------------+--------------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   *

#### Queries from Spark
(First, I read parquet from '/tmp/purchases'. Next, I ran the sql to select rows where Host is 'user1.comcast.com'. Finally, I showed the summary of dataframe.)

In [2]:
purchases = spark.read.parquet('/tmp/purchases')

purchases.show()

purchases.registerTempTable('purchases')

purchases_by_example2 = spark.sql("select * from purchases where Host = 'user1.comcast.com'")

purchases_by_example2.show()

df = purchases_by_example2.toPandas()

df.describe()

+------+-----------------+---------------+--------------+--------------------+
|Accept|             Host|     User-Agent|    event_type|           timestamp|
+------+-----------------+---------------+--------------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_s

Unnamed: 0,Accept,Host,User-Agent,event_type,timestamp
count,10,10,10,10,10
unique,1,1,1,1,10
top,*/*,user1.comcast.com,ApacheBench/2.3,purchase_sword,2019-11-24 18:08:27.796
freq,10,10,10,10,1


#### Re-run the same one

In [1]:
import json
from pyspark.sql import Row
from pyspark.sql.functions import udf


@udf('boolean')
def is_purchase(event_as_json):
    event = json.loads(event_as_json)
    if event['event_type'] == 'purchase_sword':
        return True
    return False


raw_events = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "events") \
    .option("startingOffsets", "earliest") \
    .option("endingOffsets", "latest") \
    .load()

purchase_events = raw_events \
    .select(raw_events.value.cast('string').alias('raw'),
            raw_events.timestamp.cast('string')) \
    .filter(is_purchase('raw'))

extracted_purchase_events = purchase_events \
    .rdd \
    .map(lambda r: Row(timestamp=r.timestamp, **json.loads(r.raw))) \
    .toDF()
extracted_purchase_events.printSchema()
extracted_purchase_events.show()

extracted_purchase_events \
    .write \
    .mode('overwrite') \
    .parquet('/tmp/purchases')

root
 |-- Accept: string (nullable = true)
 |-- Host: string (nullable = true)
 |-- User-Agent: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: string (nullable = true)

+------+-----------------+---------------+--------------+--------------------+
|Accept|             Host|     User-Agent|    event_type|           timestamp|
+------+-----------------+---------------+--------------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   *

#### Schema on definition

In [2]:
df = spark.read.parquet('/tmp/purchases')

df.registerTempTable('purchases')

query = "create external table purchase_events stored as parquet location '/tmp/purchase_events' as select * from purchases"

spark.sql(query)

DataFrame[]

```
$ docker-compose exec cloudera hadoop fs -ls /tmp/
```
```
Found 5 items
drwxrwxrwt   - mapred mapred              0 2016-04-06 02:26 /tmp/hadoop-yarn
drwx-wx-wx   - hive   supergroup          0 2019-12-07 22:55 /tmp/hive
drwxrwxrwt   - mapred hadoop              0 2016-04-06 02:28 /tmp/logs
drwxr-xr-x   - root   supergroup          0 2019-12-07 23:29 /tmp/purchase_events
drwxr-xr-x   - root   supergroup          0 2019-12-07 23:29 /tmp/purchases
```

```
$ docker-compose exec cloudera hadoop fs -ls /tmp/purchases/
```
```
Found 2 items
-rw-r--r--   1 root supergroup          0 2019-12-07 23:29 /tmp/purchases/_SUCCESS
-rw-r--r--   1 root supergroup       1647 2019-12-07 23:29 /tmp/purchases/part-00000-32965ab8-c450-401a-bffe-8977a5723f4a-c000.snappy.parquet
```

#### Q. Would like to see the table of purchase_sword events.

In [6]:
spark.sql("select * from purchases").show()

+------+-----------------+---------------+--------------+--------------------+
|Accept|             Host|     User-Agent|    event_type|           timestamp|
+------+-----------------+---------------+--------------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_s

In [12]:
spark.sql("select * from purchase_events").show()

+------+-----------------+---------------+--------------+--------------------+
|Accept|             Host|     User-Agent|    event_type|           timestamp|
+------+-----------------+---------------+--------------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_s

>#### A. See the tables above (from 2 tables). The answer is same.

#### Add another event filter (join_guild)

In [4]:
import json
from pyspark.sql import Row
from pyspark.sql.functions import udf


@udf('boolean')
def is_join(event_as_json):
    event = json.loads(event_as_json)
    if event['event_type'] == 'join_guild':
        return True
    return False


raw_events = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "events") \
    .option("startingOffsets", "earliest") \
    .option("endingOffsets", "latest") \
    .load()

join_events = raw_events \
    .select(raw_events.value.cast('string').alias('raw'),
            raw_events.timestamp.cast('string')) \
    .filter(is_join('raw'))

extracted_join_events = join_events \
    .rdd \
    .map(lambda r: Row(timestamp=r.timestamp, **json.loads(r.raw))) \
    .toDF()
extracted_join_events.printSchema()
extracted_join_events.show()

extracted_join_events \
    .write \
    .mode('overwrite') \
    .parquet('/tmp/joins')

root
 |-- Accept: string (nullable = true)
 |-- Host: string (nullable = true)
 |-- User-Agent: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: string (nullable = true)

+------+-----------------+---------------+----------+--------------------+
|Accept|             Host|     User-Agent|event_type|           timestamp|
+------+-----------------+---------------+----------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|2019-12-07 23:10:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|2019-12-07 23:10:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|2019-12-07 23:10:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|2019-12-07 23:10:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|2019-12-07 23:10:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|2019-12-07 23:10:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|2019-12-07 23:10:...|
|   */*|user1.comcast.com|ApacheBench/2.3|joi

In [7]:
df2 = spark.read.parquet('/tmp/joins')

df2.registerTempTable('joins')

query = "create external table join_events stored as parquet location '/tmp/join_events' as select * from joins"

spark.sql(query)

DataFrame[]

```
$ docker-compose exec cloudera hadoop fs -ls /tmp/
```
```
Found 7 items
drwxrwxrwt   - mapred mapred              0 2016-04-06 02:26 /tmp/hadoop-yarn
drwx-wx-wx   - hive   supergroup          0 2019-12-07 22:55 /tmp/hive
drwxr-xr-x   - root   supergroup          0 2019-12-07 23:37 /tmp/join_events
drwxr-xr-x   - root   supergroup          0 2019-12-07 23:34 /tmp/joins
drwxrwxrwt   - mapred hadoop              0 2016-04-06 02:28 /tmp/logs
drwxr-xr-x   - root   supergroup          0 2019-12-07 23:29 /tmp/purchase_events
drwxr-xr-x   - root   supergroup          0 2019-12-07 23:29 /tmp/purchases
```

```
$ docker-compose exec cloudera hadoop fs -ls /tmp/joins/
```
```
Found 2 items
-rw-r--r--   1 root supergroup          0 2019-12-07 23:34 /tmp/joins/_SUCCESS
-rw-r--r--   1 root supergroup       1626 2019-12-07 23:34 /tmp/joins/part-00000-00480990-767d-4aa1-ab33-4f4e46fcbbac-c000.snappy.parquet
```

#### Q. Would like to see the name of the hosts in the join_a_guild events.

In [11]:
spark.sql("select distinct(host) from joins").show()

+-----------------+
|             host|
+-----------------+
|    user2.att.com|
|user1.comcast.com|
+-----------------+



In [13]:
spark.sql("select distinct(host) from join_events").show()

+-----------------+
|             host|
+-----------------+
|    user2.att.com|
|user1.comcast.com|
+-----------------+



>#### A. See the tables above (from 2 tables). The answer is same.

#### Create external table purchases

In [14]:
import json
from pyspark.sql import Row
from pyspark.sql.functions import udf


@udf('boolean')
def is_purchase(event_as_json):
    event = json.loads(event_as_json)
    if event['event_type'] == 'purchase_sword':
        return True
    return False



raw_events = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "events") \
    .option("startingOffsets", "earliest") \
    .option("endingOffsets", "latest") \
    .load()

purchase_events = raw_events \
    .select(raw_events.value.cast('string').alias('raw'),
            raw_events.timestamp.cast('string')) \
    .filter(is_purchase('raw'))

extracted_purchase_events = purchase_events \
    .rdd \
    .map(lambda r: Row(timestamp=r.timestamp, **json.loads(r.raw))) \
    .toDF()
extracted_purchase_events.printSchema()
extracted_purchase_events.show()

extracted_purchase_events.registerTempTable("extracted_purchase_events")

spark.sql("""
    create external table purchases
    stored as parquet
    location '/tmp/purchases'
    as
    select * from extracted_purchase_events
""")


root
 |-- Accept: string (nullable = true)
 |-- Host: string (nullable = true)
 |-- User-Agent: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: string (nullable = true)

+------+-----------------+---------------+--------------+--------------------+
|Accept|             Host|     User-Agent|    event_type|           timestamp|
+------+-----------------+---------------+--------------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-07 23:09:...|
|   *

DataFrame[]

#### Create external table joins

In [15]:
import json
from pyspark.sql import Row
from pyspark.sql.functions import udf


@udf('boolean')
def is_join(event_as_json):
    event = json.loads(event_as_json)
    if event['event_type'] == 'join_guild':
        return True
    return False



raw_events = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "events") \
    .option("startingOffsets", "earliest") \
    .option("endingOffsets", "latest") \
    .load()

join_events = raw_events \
    .select(raw_events.value.cast('string').alias('raw'),
            raw_events.timestamp.cast('string')) \
    .filter(is_join('raw'))

extracted_join_events = join_events \
    .rdd \
    .map(lambda r: Row(timestamp=r.timestamp, **json.loads(r.raw))) \
    .toDF()
extracted_join_events.printSchema()
extracted_join_events.show()

extracted_join_events.registerTempTable("extracted_join_guilds")

spark.sql("""
    create external table joins
    stored as parquet
    location '/tmp/joins'
    as
    select * from extracted_join_guilds
""")


root
 |-- Accept: string (nullable = true)
 |-- Host: string (nullable = true)
 |-- User-Agent: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: string (nullable = true)

+------+-----------------+---------------+----------+--------------------+
|Accept|             Host|     User-Agent|event_type|           timestamp|
+------+-----------------+---------------+----------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|2019-12-07 23:10:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|2019-12-07 23:10:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|2019-12-07 23:10:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|2019-12-07 23:10:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|2019-12-07 23:10:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|2019-12-07 23:10:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|2019-12-07 23:10:...|
|   */*|user1.comcast.com|ApacheBench/2.3|joi

DataFrame[]

#### Run the program below during streaming data

In [25]:
import json

from pyspark.sql.functions import udf, from_json
from pyspark.sql.types import StructType, StructField, StringType


def purchase_sword_event_schema():
    """
    root
    |-- Accept: string (nullable = true)
    |-- Host: string (nullable = true)
    |-- User-Agent: string (nullable = true)
    |-- event_type: string (nullable = true)
    |-- timestamp: string (nullable = true)
    """
    return StructType([
        StructField("Accept", StringType(), True),
        StructField("Host", StringType(), True),
        StructField("User-Agent", StringType(), True),
        StructField("event_type", StringType(), True),
    ])


@udf('boolean')
def is_sword_purchase(event_as_json):
    """udf for filtering events
    """
    event = json.loads(event_as_json)
    if event['event_type'] == 'purchase_sword':
        return True
    return False


raw_events = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "events") \
    .load()

sword_purchases = raw_events \
    .filter(is_sword_purchase(raw_events.value.cast('string'))) \
    .select(raw_events.value.cast('string').alias('raw_event'),
            raw_events.timestamp.cast('string'),
            from_json(raw_events.value.cast('string'),
                      purchase_sword_event_schema()).alias('json')) \
    .select('raw_event', 'timestamp', 'json.*')

sink = sword_purchases \
    .writeStream \
    .format("parquet") \
    .option("checkpointLocation", "/tmp/checkpoints_for_sword_purchases") \
    .option("path", "/tmp/sword_purchases") \
    .trigger(processingTime="10 seconds") \
    .start()

sink.awaitTermination()

KeyboardInterrupt: 

```
$ docker-compose exec cloudera hadoop fs -ls /tmp/sword_purchases
```
```
Found 10 items
drwxr-xr-x   - root supergroup          0 2019-12-07 23:59 /tmp/sword_purchases/_spark_metadata
-rw-r--r--   1 root supergroup       2377 2019-12-07 23:59 /tmp/sword_purchases/part-00000-1cd80acb-be29-421f-9504-366ad83ee0fe-c000.snappy.parquet
-rw-r--r--   1 root supergroup       2566 2019-12-07 23:59 /tmp/sword_purchases/part-00000-25b89790-6715-4e04-9085-40c9eba6b838-c000.snappy.parquet
-rw-r--r--   1 root supergroup       2639 2019-12-07 23:57 /tmp/sword_purchases/part-00000-3c2d2b47-55a4-435f-a3a6-5ee971cc0322-c000.snappy.parquet
-rw-r--r--   1 root supergroup       2381 2019-12-07 23:58 /tmp/sword_purchases/part-00000-3dd16d60-045b-4e00-a844-2e50b0df0714-c000.snappy.parquet
-rw-r--r--   1 root supergroup       2513 2019-12-07 23:59 /tmp/sword_purchases/part-00000-418275e2-942a-4aa0-ab1c-4684c32e3429-c000.snappy.parquet
-rw-r--r--   1 root supergroup        688 2019-12-07 23:57 /tmp/sword_purchases/part-00000-4cdf50b0-f6fe-4fa5-866d-a0d778cc96c9-c000.snappy.parquet
-rw-r--r--   1 root supergroup       2378 2019-12-07 23:57 /tmp/sword_purchases/part-00000-4e5a34d5-c0f0-4227-b0ea-7a4849fc8f3e-c000.snappy.parquet
-rw-r--r--   1 root supergroup       2571 2019-12-07 23:58 /tmp/sword_purchases/part-00000-6a559b52-0a73-4e61-9be0-5878fd299e05-c000.snappy.parquet
-rw-r--r--   1 root supergroup       2440 2019-12-07 23:58 /tmp/sword_purchases/part-00000-d83457f0-b2cc-4410-a3eb-53f889f91416-c000.snappy.parquet
```

In [30]:
df = spark.read.parquet('/tmp/sword_purchases')

df.registerTempTable('sword_purchases')

query = "create external table sword_purchase_events stored as parquet location '/tmp/sword_purchase_events' as select * from sword_purchases"

spark.sql(query)

DataFrame[]

#### Q. Would like to see how many events are generated.

In [32]:
spark.sql('select count(*) from sword_purchase_events').show()

+--------+
|count(1)|
+--------+
|    1530|
+--------+



In [34]:
spark.sql('select count(*) from sword_purchases').show()

+--------+
|count(1)|
+--------+
|    1530|
+--------+



>#### 1530

#### Run the program below during streaming data

In [35]:
import json

from pyspark.sql.functions import udf, from_json
from pyspark.sql.types import StructType, StructField, StringType


def join_guild_event_schema():
    """
    root
    |-- Accept: string (nullable = true)
    |-- Host: string (nullable = true)
    |-- User-Agent: string (nullable = true)
    |-- event_type: string (nullable = true)
    |-- timestamp: string (nullable = true)
    """
    return StructType([
        StructField("Accept", StringType(), True),
        StructField("Host", StringType(), True),
        StructField("User-Agent", StringType(), True),
        StructField("event_type", StringType(), True),
    ])


@udf('boolean')
def is_guild_join(event_as_json):
    """udf for filtering events
    """
    event = json.loads(event_as_json)
    if event['event_type'] == 'join_guild':
        return True
    return False


raw_events = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "events") \
    .load()

guild_joins = raw_events \
    .filter(is_guild_join(raw_events.value.cast('string'))) \
    .select(raw_events.value.cast('string').alias('raw_event'),
            raw_events.timestamp.cast('string'),
            from_json(raw_events.value.cast('string'),
                      join_guild_event_schema()).alias('json')) \
    .select('raw_event', 'timestamp', 'json.*')

sink = guild_joins \
    .writeStream \
    .format("parquet") \
    .option("checkpointLocation", "/tmp/checkpoints_for_guild_joins") \
    .option("path", "/tmp/guild_joins") \
    .trigger(processingTime="10 seconds") \
    .start()

sink.awaitTermination()

KeyboardInterrupt: 

In [36]:
df2 = spark.read.parquet('/tmp/guild_joins')

df2.registerTempTable('guild_joins')

query = "create external table guild_join_events stored as parquet location '/tmp/guild_join_events' as select * from guild_joins"

spark.sql(query)

DataFrame[]

#### Q. Would like to see how many guild join events are generated.

In [37]:
spark.sql('select count(*) from guild_join_events').show()

+--------+
|count(1)|
+--------+
|     210|
+--------+



In [38]:
spark.sql('select count(*) from guild_joins').show()

+--------+
|count(1)|
+--------+
|     210|
+--------+



>#### 210

#### Q. Would like to know where the host was.

In [40]:
spark.sql('select distinct(host) from guild_join_events').show()

+-----------------+
|             host|
+-----------------+
|user1.comcast.com|
+-----------------+



In [39]:
spark.sql('select distinct(host) from guild_joins').show()

+-----------------+
|             host|
+-----------------+
|user1.comcast.com|
+-----------------+



>#### user1.comcast.com

In [10]:
import json

from pyspark.sql.functions import udf, from_json
from pyspark.sql.types import StructType, StructField, StringType


def purchase_sword_event_schema():
    """
    root
    |-- Accept: string (nullable = true)
    |-- Host: string (nullable = true)
    |-- User-Agent: string (nullable = true)
    |-- event_type: string (nullable = true)
    |-- timestamp: string (nullable = true)
    """
    return StructType([
        StructField("Accept", StringType(), True),
        StructField("Host", StringType(), True),
        StructField("User-Agent", StringType(), True),
        StructField("event_type", StringType(), True),
    ])


@udf('boolean')
def is_sword_purchase(event_as_json):
    """udf for filtering events
    """
    event = json.loads(event_as_json)
    if event['event_type'] == 'purchase_sword':
        return True
    return False

raw_events = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "events") \
    .load()

sword_purchases = raw_events \
    .filter(is_sword_purchase(raw_events.value.cast('string'))) \
    .select(raw_events.value.cast('string').alias('raw_event'),
            raw_events.timestamp.cast('string'),
            from_json(raw_events.value.cast('string'),
                      purchase_sword_event_schema()).alias('json')) \
    .select('raw_event', 'timestamp', 'json.*')

query = sword_purchases \
    .writeStream \
    .format("console") \
    .start()

query.awaitTermination()


KeyboardInterrupt: 