## W205 - Fall 2019 - Project 3 - Understanding User Behavior

### Nobu Yamaguchi

#### Background
- I am a data scientist at a game development company. 
- Our latest mobile game has two events we are interested in tracking: `buy a sword` & `join guild`
- Each has metadata characteristic of such events.

In [3]:
import json

#### Read from kafka

In [3]:
raw_events = spark.read.format("kafka").option("kafka.bootstrap.servers", "kafka:29092").option("subscribe","events").option("startingOffsets", "earliest").option("endingOffsets", "latest").load() 

#### Explore out events

In [4]:
events = raw_events.select(raw_events.value.cast('string'))

In [5]:
extracted_events = events.rdd.map(lambda x: json.loads(x.value)).toDF()



In [6]:
extracted_events.show()

+------+--------------+-----------+--------------+
|Accept|          Host| User-Agent|    event_type|
+------+--------------+-----------+--------------+
|   */*|localhost:5000|curl/7.47.0|       default|
|   */*|localhost:5000|curl/7.47.0|purchase_sword|
|   */*|localhost:5000|curl/7.47.0|purchase_knife|
|   */*|localhost:5000|curl/7.47.0| purchase_frog|
|   */*|localhost:5000|curl/7.47.0|    ride_horse|
|   */*|localhost:5000|curl/7.47.0|climb_mountain|
|   */*|localhost:5000|curl/7.47.0|    ride_horse|
|   */*|localhost:5000|curl/7.47.0| purchase_frog|
+------+--------------+-----------+--------------+



In [7]:
extracted_events \
        .write \
        .parquet("/tmp/extracted_events")

#### Transform events

In [9]:
@udf('string')
def munge_event(event_as_json):
    event = json.loads(event_as_json)
    event['Host'] = "moe"
    event['Cache-Control'] = "no-cache"
    return json.dumps(event)

raw_events = spark \
        .read \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "kafka:29092") \
        .option("subscribe", "events") \
        .option("startingOffsets", "earliest") \
        .option("endingOffsets", "latest") \
        .load()

munged_events = raw_events \
    .select(raw_events.value.cast('string').alias('raw'),
            raw_events.timestamp.cast('string')) \
    .withColumn('munged', munge_event('raw'))
munged_events.show()

extracted_events = munged_events \
    .rdd \
    .map(lambda r: Row(timestamp=r.timestamp, **json.loads(r.munged))) \
    .toDF()
extracted_events.show()

extracted_events \
    .write \
    .mode("overwrite") \
    .parquet("/tmp/extracted_events")

+--------------------+--------------------+--------------------+
|                 raw|           timestamp|              munged|
+--------------------+--------------------+--------------------+
|{"Host": "localho...|2019-11-10 05:04:...|{"Host": "moe", "...|
|{"Host": "localho...|2019-11-10 05:05:...|{"Host": "moe", "...|
|{"Host": "localho...|2019-11-10 05:06:...|{"Host": "moe", "...|
|{"Host": "localho...|2019-11-10 05:07:...|{"Host": "moe", "...|
|{"Host": "localho...|2019-11-10 05:07:...|{"Host": "moe", "...|
|{"Host": "localho...|2019-11-10 05:07:...|{"Host": "moe", "...|
|{"Host": "localho...|2019-11-10 05:41:...|{"Host": "moe", "...|
|{"Host": "localho...|2019-11-10 05:42:...|{"Host": "moe", "...|
+--------------------+--------------------+--------------------+

+------+-------------+----+-----------+--------------+--------------------+
|Accept|Cache-Control|Host| User-Agent|    event_type|           timestamp|
+------+-------------+----+-----------+--------------+-------------

#### Separate events

In [10]:
# separate.py
import json

from pyspark.sql import Row
from pyspark.sql.functions import udf

@udf('string')
def munge_event(event_as_json):
    event = json.loads(event_as_json)
    event['Host'] = "moe"
    event['Cache-Control'] = "no-cache"
    return json.dumps(event)

raw_events = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "events") \
    .option("startingOffsets", "earliest") \
    .option("endingOffsets", "latest") \
    .load()

munged_events = raw_events \
    .select(raw_events.value.cast('string').alias('raw'),
            raw_events.timestamp.cast('string')) \
    .withColumn('munged', munge_event('raw'))

extracted_events = munged_events \
    .rdd \
    .map(lambda r: Row(timestamp=r.timestamp, **json.loads(r.munged))) \
    .toDF()

sword_purchases = extracted_events \
    .filter(extracted_events.event_type == 'purchase_sword')
sword_purchases.show()
sword_purchases \
    .write \
    .mode("overwrite") \
    .parquet("/tmp/sword_purchases")
    
default_hits = extracted_events \
    .filter(extracted_events.event_type == 'default')
default_hits.show()
default_hits \
    .write \
    .mode("overwrite") \
    .parquet("/tmp/default_hits")

+------+-------------+----+-----------+--------------+--------------------+
|Accept|Cache-Control|Host| User-Agent|    event_type|           timestamp|
+------+-------------+----+-----------+--------------+--------------------+
|   */*|     no-cache| moe|curl/7.47.0|purchase_sword|2019-11-10 05:05:...|
+------+-------------+----+-----------+--------------+--------------------+

+------+-------------+----+-----------+----------+--------------------+
|Accept|Cache-Control|Host| User-Agent|event_type|           timestamp|
+------+-------------+----+-----------+----------+--------------------+
|   */*|     no-cache| moe|curl/7.47.0|   default|2019-11-10 05:04:...|
+------+-------------+----+-----------+----------+--------------------+



#### Filtered event (purchase sword)

In [1]:
import json
from pyspark.sql import Row
from pyspark.sql.functions import udf


@udf('boolean')
def is_purchase(event_as_json):
    event = json.loads(event_as_json)
    if event['event_type'] == 'purchase_sword':
        return True
    return False


raw_events = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "events") \
    .option("startingOffsets", "earliest") \
    .option("endingOffsets", "latest") \
    .load()

purchase_events = raw_events \
    .select(raw_events.value.cast('string').alias('raw'),
            raw_events.timestamp.cast('string')) \
    .filter(is_purchase('raw'))

extracted_purchase_events = purchase_events \
    .rdd \
    .map(lambda r: Row(timestamp=r.timestamp, **json.loads(r.raw))) \
    .toDF()
extracted_purchase_events.printSchema()
extracted_purchase_events.show()

extracted_purchase_events \
    .write \
    .mode('overwrite') \
    .parquet('/tmp/purchases')


root
 |-- Accept: string (nullable = true)
 |-- Host: string (nullable = true)
 |-- User-Agent: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: string (nullable = true)

+------+-----------------+---------------+--------------+--------------------+
|Accept|             Host|     User-Agent|    event_type|           timestamp|
+------+-----------------+---------------+--------------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   *

#### Queries from Spark
(First, I read parquet from '/tmp/purchases'. Next, I ran the sql to select rows where Host is 'user1.comcast.com'. Finally, I showed the summary of dataframe.)

In [2]:
purchases = spark.read.parquet('/tmp/purchases')

purchases.show()

purchases.registerTempTable('purchases')

purchases_by_example2 = spark.sql("select * from purchases where Host = 'user1.comcast.com'")

purchases_by_example2.show()

df = purchases_by_example2.toPandas()

df.describe()

+------+-----------------+---------------+--------------+--------------------+
|Accept|             Host|     User-Agent|    event_type|           timestamp|
+------+-----------------+---------------+--------------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-11-24 18:08:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_s

Unnamed: 0,Accept,Host,User-Agent,event_type,timestamp
count,10,10,10,10,10
unique,1,1,1,1,10
top,*/*,user1.comcast.com,ApacheBench/2.3,purchase_sword,2019-11-24 18:08:27.796
freq,10,10,10,10,1


In [2]:
import json
from pyspark.sql import Row
from pyspark.sql.functions import udf


@udf('boolean')
def is_purchase(event_as_json):
    event = json.loads(event_as_json)
    if event['event_type'] == 'purchase_sword':
        return True
    return False


raw_events = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "events") \
    .option("startingOffsets", "earliest") \
    .option("endingOffsets", "latest") \
    .load()

purchase_events = raw_events \
    .select(raw_events.value.cast('string').alias('raw'),
            raw_events.timestamp.cast('string')) \
    .filter(is_purchase('raw'))

extracted_purchase_events = purchase_events \
    .rdd \
    .map(lambda r: Row(timestamp=r.timestamp, **json.loads(r.raw))) \
    .toDF()
extracted_purchase_events.printSchema()
extracted_purchase_events.show()

extracted_purchase_events \
    .write \
    .mode('overwrite') \
    .parquet('/tmp/purchases')

root
 |-- Accept: string (nullable = true)
 |-- Host: string (nullable = true)
 |-- User-Agent: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: string (nullable = true)

+------+-----------------+---------------+--------------+--------------------+
|Accept|             Host|     User-Agent|    event_type|           timestamp|
+------+-----------------+---------------+--------------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-06 04:43:...|
|   *

#### Schema on definition

In [3]:
df = spark.read.parquet('/tmp/purchases')

df.registerTempTable('purchases')

query = "create external table purchase_events stored as parquet location '/tmp/purchase_events' as select * from purchases"

spark.sql(query)

DataFrame[]

```
$ docker-compose exec cloudera hadoop fs -ls /tmp/
```
```
Found 5 items
drwxrwxrwt   - mapred mapred              0 2016-04-06 02:26 /tmp/hadoop-yarn
drwx-wx-wx   - hive   supergroup          0 2019-12-06 04:06 /tmp/hive
drwxrwxrwt   - mapred hadoop              0 2016-04-06 02:28 /tmp/logs
drwxr-xr-x   - root   supergroup          0 2019-12-06 04:21 /tmp/purchase_events
drwxr-xr-x   - root   supergroup          0 2019-12-06 04:18 /tmp/purchases
```

```
$ docker-compose exec cloudera hadoop fs -ls /tmp/purchases/
```
```
Found 2 items
-rw-r--r--   1 root supergroup          0 2019-12-06 04:45 /tmp/purchases/_SUCCESS
-rw-r--r--   1 root supergroup       1657 2019-12-06 04:45 /tmp/purchases/part-00000-0b79c2c0-8a64-4296-a6d7-62774ce4b4df-c000.snappy.parquet
```

#### Add another event filter (ride_horse)

In [4]:
import json
from pyspark.sql import Row
from pyspark.sql.functions import udf


@udf('boolean')
def is_ride(event_as_json):
    event = json.loads(event_as_json)
    if event['event_type'] == 'ride_horse':
        return True
    return False


raw_events = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "events") \
    .option("startingOffsets", "earliest") \
    .option("endingOffsets", "latest") \
    .load()

purchase_events = raw_events \
    .select(raw_events.value.cast('string').alias('raw'),
            raw_events.timestamp.cast('string')) \
    .filter(is_ride('raw'))

extracted_purchase_events = purchase_events \
    .rdd \
    .map(lambda r: Row(timestamp=r.timestamp, **json.loads(r.raw))) \
    .toDF()
extracted_purchase_events.printSchema()
extracted_purchase_events.show()

extracted_purchase_events \
    .write \
    .mode('overwrite') \
    .parquet('/tmp/rides')

root
 |-- Accept: string (nullable = true)
 |-- Host: string (nullable = true)
 |-- User-Agent: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: string (nullable = true)

+------+-----------------+---------------+----------+--------------------+
|Accept|             Host|     User-Agent|event_type|           timestamp|
+------+-----------------+---------------+----------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|ride_horse|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|ride_horse|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|ride_horse|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|ride_horse|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|ride_horse|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|ride_horse|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|ride_horse|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|rid

In [5]:
df2 = spark.read.parquet('/tmp/rides')

df2.registerTempTable('rides')

query = "create external table ride_events stored as parquet location '/tmp/ride_events' as select * from rides"

spark.sql(query)

DataFrame[]

```
$ docker-compose exec cloudera hadoop fs -ls /tmp/
```
```
Found 7 items
drwxrwxrwt   - mapred mapred              0 2016-04-06 02:26 /tmp/hadoop-yarn
drwx-wx-wx   - hive   supergroup          0 2019-12-06 04:39 /tmp/hive
drwxrwxrwt   - mapred hadoop              0 2016-04-06 02:28 /tmp/logs
drwxr-xr-x   - root   supergroup          0 2019-12-06 04:45 /tmp/purchase_events
drwxr-xr-x   - root   supergroup          0 2019-12-06 04:45 /tmp/purchases
drwxr-xr-x   - root   supergroup          0 2019-12-06 04:45 /tmp/ride_events
drwxr-xr-x   - root   supergroup          0 2019-12-06 04:45 /tmp/rides
```

```
$ docker-compose exec cloudera hadoop fs -ls /tmp/rides/
```
```
Found 2 items
-rw-r--r--   1 root supergroup          0 2019-12-06 04:45 /tmp/rides/_SUCCESS
-rw-r--r--   1 root supergroup       1623 2019-12-06 04:45 /tmp/rides/part-00000-669d6a5a-2ca8-4c3a-b90a-0122025dc974-c000.snappy.parquet
```

In [6]:
import json
from pyspark.sql import Row
from pyspark.sql.functions import udf


@udf('boolean')
def is_purchase(event_as_json):
    event = json.loads(event_as_json)
    if event['event_type'] == 'purchase_sword':
        return True
    return False



raw_events = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "events") \
    .option("startingOffsets", "earliest") \
    .option("endingOffsets", "latest") \
    .load()

purchase_events = raw_events \
    .select(raw_events.value.cast('string').alias('raw'),
            raw_events.timestamp.cast('string')) \
    .filter(is_purchase('raw'))

extracted_purchase_events = purchase_events \
    .rdd \
    .map(lambda r: Row(timestamp=r.timestamp, **json.loads(r.raw))) \
    .toDF()
extracted_purchase_events.printSchema()
extracted_purchase_events.show()

extracted_purchase_events.registerTempTable("extracted_purchase_events")

spark.sql("""
    create external table purchases
    stored as parquet
    location '/tmp/purchases'
    as
    select * from extracted_purchase_events
""")


root
 |-- Accept: string (nullable = true)
 |-- Host: string (nullable = true)
 |-- User-Agent: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: string (nullable = true)

+------+-----------------+---------------+--------------+--------------------+
|Accept|             Host|     User-Agent|    event_type|           timestamp|
+------+-----------------+---------------+--------------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2019-12-06 04:43:...|
|   *

DataFrame[]

In [7]:
import json
from pyspark.sql import Row
from pyspark.sql.functions import udf


@udf('boolean')
def is_ride(event_as_json):
    event = json.loads(event_as_json)
    if event['event_type'] == 'ride_horse':
        return True
    return False



raw_events = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "events") \
    .option("startingOffsets", "earliest") \
    .option("endingOffsets", "latest") \
    .load()

ride_events = raw_events \
    .select(raw_events.value.cast('string').alias('raw'),
            raw_events.timestamp.cast('string')) \
    .filter(is_ride('raw'))

extracted_ride_events = ride_events \
    .rdd \
    .map(lambda r: Row(timestamp=r.timestamp, **json.loads(r.raw))) \
    .toDF()
extracted_ride_events.printSchema()
extracted_ride_events.show()

extracted_ride_events.registerTempTable("extracted_ride_horses")

spark.sql("""
    create external table rides
    stored as parquet
    location '/tmp/rides'
    as
    select * from extracted_ride_horses
""")


root
 |-- Accept: string (nullable = true)
 |-- Host: string (nullable = true)
 |-- User-Agent: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: string (nullable = true)

+------+-----------------+---------------+----------+--------------------+
|Accept|             Host|     User-Agent|event_type|           timestamp|
+------+-----------------+---------------+----------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|ride_horse|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|ride_horse|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|ride_horse|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|ride_horse|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|ride_horse|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|ride_horse|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|ride_horse|2019-12-06 04:43:...|
|   */*|user1.comcast.com|ApacheBench/2.3|rid

DataFrame[]

In [10]:
import json

from pyspark.sql.functions import udf, from_json
from pyspark.sql.types import StructType, StructField, StringType


def purchase_sword_event_schema():
    """
    root
    |-- Accept: string (nullable = true)
    |-- Host: string (nullable = true)
    |-- User-Agent: string (nullable = true)
    |-- event_type: string (nullable = true)
    |-- timestamp: string (nullable = true)
    """
    return StructType([
        StructField("Accept", StringType(), True),
        StructField("Host", StringType(), True),
        StructField("User-Agent", StringType(), True),
        StructField("event_type", StringType(), True),
    ])


@udf('boolean')
def is_sword_purchase(event_as_json):
    """udf for filtering events
    """
    event = json.loads(event_as_json)
    if event['event_type'] == 'purchase_sword':
        return True
    return False

raw_events = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "events") \
    .load()

sword_purchases = raw_events \
    .filter(is_sword_purchase(raw_events.value.cast('string'))) \
    .select(raw_events.value.cast('string').alias('raw_event'),
            raw_events.timestamp.cast('string'),
            from_json(raw_events.value.cast('string'),
                      purchase_sword_event_schema()).alias('json')) \
    .select('raw_event', 'timestamp', 'json.*')

query = sword_purchases \
    .writeStream \
    .format("console") \
    .start()

query.awaitTermination()


KeyboardInterrupt: 

In [None]:
import json

from pyspark.sql.functions import udf, from_json
from pyspark.sql.types import StructType, StructField, StringType


def purchase_sword_event_schema():
    """
    root
    |-- Accept: string (nullable = true)
    |-- Host: string (nullable = true)
    |-- User-Agent: string (nullable = true)
    |-- event_type: string (nullable = true)
    |-- timestamp: string (nullable = true)
    """
    return StructType([
        StructField("Accept", StringType(), True),
        StructField("Host", StringType(), True),
        StructField("User-Agent", StringType(), True),
        StructField("event_type", StringType(), True),
    ])


@udf('boolean')
def is_sword_purchase(event_as_json):
    """udf for filtering events
    """
    event = json.loads(event_as_json)
    if event['event_type'] == 'purchase_sword':
        return True
    return False


raw_events = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "events") \
    .load()

sword_purchases = raw_events \
    .filter(is_sword_purchase(raw_events.value.cast('string'))) \
    .select(raw_events.value.cast('string').alias('raw_event'),
            raw_events.timestamp.cast('string'),
            from_json(raw_events.value.cast('string'),
                      purchase_sword_event_schema()).alias('json')) \
    .select('raw_event', 'timestamp', 'json.*')

sink = sword_purchases \
    .writeStream \
    .format("parquet") \
    .option("checkpointLocation", "/tmp/checkpoints_for_sword_purchases") \
    .option("path", "/tmp/sword_purchases") \
    .trigger(processingTime="10 seconds") \
    .start()

sink.awaitTermination()

```
$ docker-compose exec cloudera hadoop fs -ls /tmp/sword_purchases
```
```
Found 3 items
drwxr-xr-x   - root supergroup          0 2019-12-06 05:27 /tmp/sword_purchases/_spark_metadata
-rw-r--r--   1 root supergroup        688 2019-12-06 05:27 /tmp/sword_purchases/part-00000-ad0bdd9f-fb91-4f41-a067-27c4b736e019-c000.snappy.parquet
-rw-r--r--   1 root supergroup       2459 2019-12-06 05:27 /tmp/sword_purchases/part-00000-ccedace1-506b-4560-bb92-3bf6b7b23868-c000.snappy.parquet
```