## Take in raw data from kafka to spark

In [1]:
assessments_kafka = spark \
  .read \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "kafka:29092") \
  .option("subscribe", "assessments") \
  .option("startingOffsets", "earliest") \
  .option("endingOffsets", "latest") \
  .load()

In [2]:
assessments_kafka.cache()

DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]

## Show raw data schema

In [3]:
assessments_kafka.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



## Check raw data type

In [4]:
type(assessments_kafka)

pyspark.sql.dataframe.DataFrame

## Check topic and value

In [5]:
assessments_kafka.show()

+----+--------------------+-----------+---------+------+--------------------+-------------+
| key|               value|      topic|partition|offset|           timestamp|timestampType|
+----+--------------------+-----------+---------+------+--------------------+-------------+
|null|[7B 22 6B 65 65 6...|assessments|        0|     0|1969-12-31 23:59:...|            0|
|null|[7B 22 6B 65 65 6...|assessments|        0|     1|1969-12-31 23:59:...|            0|
|null|[7B 22 6B 65 65 6...|assessments|        0|     2|1969-12-31 23:59:...|            0|
|null|[7B 22 6B 65 65 6...|assessments|        0|     3|1969-12-31 23:59:...|            0|
|null|[7B 22 6B 65 65 6...|assessments|        0|     4|1969-12-31 23:59:...|            0|
|null|[7B 22 6B 65 65 6...|assessments|        0|     5|1969-12-31 23:59:...|            0|
|null|[7B 22 6B 65 65 6...|assessments|        0|     6|1969-12-31 23:59:...|            0|
|null|[7B 22 6B 65 65 6...|assessments|        0|     7|1969-12-31 23:59:...|   

## Import pyspark libraries

In [6]:
import sys 
import json
from pyspark.sql import Row
from pyspark.sql.functions import from_json, col, lit, countDistinct, avg, col
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, LongType


## Cast values as strings

In [7]:
assessments_true = assessments_kafka.select(assessments_kafka.value.cast('string'))

## Extract Json data

In [8]:
assessments_exr = spark.read.json(assessments_true.rdd.map(lambda x: x.value))

In [9]:
assessments_exr.show(1)

+--------------------+-------------+--------------------+-----------------+--------------------+-----------------+------------+--------------------+--------------------+--------------------+
|        base_exam_id|certification|           exam_name|  keen_created_at|             keen_id|   keen_timestamp|max_attempts|           sequences|          started_at|        user_exam_id|
+--------------------+-------------+--------------------+-----------------+--------------------+-----------------+------------+--------------------+--------------------+--------------------+
|37f0a30a-7464-11e...|        false|Normal Forms and ...|1516717442.735266|5a6745820eb8ab000...|1516717442.735266|         1.0|[1,[false,2,1,1,4...|2018-01-23T14:23:...|6d4089e4-bde5-4a2...|
+--------------------+-------------+--------------------+-----------------+--------------------+-----------------+------------+--------------------+--------------------+--------------------+
only showing top 1 row



## The structure from the previous show() function is not easy to read. Contruct/register a table for better schema structure to view.

In [10]:
assessments_exr.registerTempTable('assessments_exr')
df_asmt = assessments_exr

In [11]:
df_asmt.printSchema()

root
 |-- base_exam_id: string (nullable = true)
 |-- certification: string (nullable = true)
 |-- exam_name: string (nullable = true)
 |-- keen_created_at: string (nullable = true)
 |-- keen_id: string (nullable = true)
 |-- keen_timestamp: string (nullable = true)
 |-- max_attempts: string (nullable = true)
 |-- sequences: struct (nullable = true)
 |    |-- attempt: long (nullable = true)
 |    |-- counts: struct (nullable = true)
 |    |    |-- all_correct: boolean (nullable = true)
 |    |    |-- correct: long (nullable = true)
 |    |    |-- incomplete: long (nullable = true)
 |    |    |-- incorrect: long (nullable = true)
 |    |    |-- submitted: long (nullable = true)
 |    |    |-- total: long (nullable = true)
 |    |    |-- unanswered: long (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- questions: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- options: arra

## Now query the nested json data to answer some basic questions:
- How many assesstments are in the dataset?


In [12]:
df_asmt.count()

3280

- How many people took Learning Git?

In [13]:
# check if learning git is in exam name
df_asmt.select("exam_name").show(10)

+--------------------+
|           exam_name|
+--------------------+
|Normal Forms and ...|
|Normal Forms and ...|
|The Principles of...|
|The Principles of...|
|Introduction to B...|
|        Learning Git|
|Git Fundamentals ...|
|Introduction to P...|
|Intermediate Pyth...|
|Introduction to P...|
+--------------------+
only showing top 10 rows



In [14]:
# filter and count learning git frequency

df_asmt.filter(df_asmt["exam_name"] == "Learning Git").count()


394

- What is the least common course taken? And the most common?

In [15]:
df_asmt.groupBy("exam_name").count().sort(col("count").desc()).show(truncate=False)

+--------------------------------------------------------------+-----+
|exam_name                                                     |count|
+--------------------------------------------------------------+-----+
|Learning Git                                                  |394  |
|Introduction to Python                                        |162  |
|Introduction to Java 8                                        |158  |
|Intermediate Python Programming                               |158  |
|Learning to Program with R                                    |128  |
|Introduction to Machine Learning                              |119  |
|Software Architecture Fundamentals Understanding the Basics   |109  |
|Beginning C# Programming                                      |95   |
|Learning Eclipse                                              |85   |
|Learning Apache Maven                                         |80   |
|Beginning Programming with JavaScript                         |79   |
|Maste

- Which exam has the highest unanswered rate 

In [16]:
# try using spark.sql to calculate percentages

spark.sql("select exam_name, \
            sum(sequences.counts.unanswered)/sum(sequences.counts.total) \
            as per_unanswered \
            from assessments_exr \
            group by exam_name \
            order by per_unanswered desc"\
          ).show(20)

+--------------------+--------------------+
|           exam_name|      per_unanswered|
+--------------------+--------------------+
|       View Updating|                0.25|
|Learning Spring P...|                0.25|
|Native Web Apps f...|                0.25|
|Mastering Advance...| 0.17647058823529413|
|Networking for Pe...| 0.13333333333333333|
|          Great Bash| 0.12857142857142856|
|Hibernate and JPA...|               0.125|
|Learning C# Best ...|                0.12|
|Amazon Web Servic...|  0.1111111111111111|
|Design Patterns i...| 0.10666666666666667|
|              TCP/IP| 0.08571428571428572|
|Cloud Computing W...| 0.07352941176470588|
|Software Architec...| 0.07339449541284404|
|Amazon Web Servic...| 0.06818181818181818|
|Learning C# Desig...| 0.06521739130434782|
|Learning Apache C...|              0.0625|
|        Learning Git| 0.06040609137055838|
|Python Data Struc...|  0.0603448275862069|
|JavaScript: The G...|  0.0603448275862069|
|Learning iPython ...|0.05882352

- Which exam has the highest imcomplete rate

In [17]:
spark.sql("select exam_name, \
            sum(sequences.counts.incomplete)/sum(sequences.counts.total) \
            as per_incomp \
            from assessments_exr \
            group by exam_name \
            order by per_incomp desc"\
          ).show(20)

+--------------------+-------------------+
|           exam_name|         per_incomp|
+--------------------+-------------------+
|       View Updating|                0.5|
|Building Web Serv...| 0.4166666666666667|
|  Learning Java EE 7|                0.4|
|Web & Native Work...|              0.375|
|I'm a Software Ar...|0.36666666666666664|
|Cloud Computing W...|0.35294117647058826|
|Arduino Prototypi...| 0.3333333333333333|
| Mastering Web Views| 0.3333333333333333|
|Introduction to A...|0.30952380952380953|
|Amazon Web Servic...| 0.3055555555555556|
|Modeling for Soft...|               0.25|
|Normal Forms and ...|               0.25|
|Introduction to D...|0.23920265780730898|
|The Principles of...|0.22727272727272727|
|Architectural Con...|               0.22|
|Design Patterns i...|0.21333333333333335|
|An Introduction t...|                0.2|
|Client-Side Data ...|                0.2|
|SQL: Beyond the B...|0.18181818181818182|
|Event-Driven Micr...|               0.18|
+----------

- Which exams are the most difficult (lowest correct rate)

In [18]:
spark.sql("select exam_name, \
            sum(sequences.counts.correct)/sum(sequences.counts.total) \
            as per_cort \
            from assessments_exr \
            group by exam_name \
            order by per_cort asc"\
          ).show(20)

+--------------------+-------------------+
|           exam_name|           per_cort|
+--------------------+-------------------+
|Example Exam For ...|               null|
|Client-Side Data ...|                0.2|
|Native Web Apps f...|               0.25|
|       View Updating|               0.25|
|Arduino Prototypi...| 0.3333333333333333|
|Mastering Advance...| 0.3602941176470588|
|           Nullology|              0.375|
|Building Web Serv...| 0.4166666666666667|
|Web & Native Work...| 0.4166666666666667|
| Mastering Web Views| 0.4166666666666667|
|Cloud Computing W...| 0.4264705882352941|
|         Offline Web| 0.4358974358974359|
|Learning C# Best ...|0.46285714285714286|
|Design Patterns i...| 0.4666666666666667|
|Software Architec...| 0.4793577981651376|
|  Learning Java EE 7|               0.48|
|Data Visualizatio...|0.49193548387096775|
|Using Web Components|                0.5|
|Amazon Web Servic...|                0.5|
|Learning Data Str...|                0.5|
+----------

- Which courses have the higherest average attempts

In [19]:
spark.sql("select exam_name, \
            avg(max_attempts)\
            as avg_max_atp \
            from assessments_exr \
            group by exam_name \
            order by avg_max_atp desc"\
          ).show(20)

+--------------------+-----------+
|           exam_name|avg_max_atp|
+--------------------+-----------+
|Learning Spring P...|        1.0|
|Networking for Pe...|        1.0|
|Learning iPython ...|        1.0|
|Introduction to P...|        1.0|
|Introduction to A...|        1.0|
|Learning Data Mod...|        1.0|
|Introduction to J...|        1.0|
|Learning Apache H...|        1.0|
|Learning C# Best ...|        1.0|
|Mastering Python ...|        1.0|
|Introduction to B...|        1.0|
|       View Updating|        1.0|
|A Practical Intro...|        1.0|
|Introduction to A...|        1.0|
|Intermediate C# P...|        1.0|
|I'm a Software Ar...|        1.0|
|JavaScript Templa...|        1.0|
|        Learning DNS|        1.0|
|Starting a Grails...|        1.0|
|Being a Better In...|        1.0|
+--------------------+-----------+
only showing top 20 rows



- Which questions are the most difficult

In [20]:
# show data structure and content
temp = spark.sql("select sequences.questions.id as q_id, \
            sequences.questions.user_correct \
            from assessments_exr" \
          ).show(5)

+--------------------+--------------------+
|                q_id|        user_correct|
+--------------------+--------------------+
|[7a2ed6d3-f492-49...|[false, false, tr...|
|[95194331-ac43-45...|[true, false, fal...|
|[b9ff2e88-cf9d-4b...|[false, true, tru...|
|[1f7c5def-904b-48...|[true, false, tru...|
|[620c924f-6bd8-11...|[false, true, tru...|
+--------------------+--------------------+
only showing top 5 rows



One could unwrap the user_correct col and count the ture vs. false ratio and determien the most difficult questions. 

- Who take more than one courses

In [21]:
spark.sql("select user_exam_id, \
            count(exam_name) as course_count \
            from assessments_exr \
            group by user_exam_id \
            order by course_count desc" \
          ).show(10)

+--------------------+------------+
|        user_exam_id|course_count|
+--------------------+------------+
|6132da16-2c0c-436...|           3|
|cdc5859d-b332-4fb...|           3|
|66d91177-c436-4ee...|           3|
|028ad26f-a89f-4a6...|           3|
|fa23b287-0d0a-468...|           3|
|b7ac6d15-97e1-4e9...|           3|
|a45b5ee6-a4ed-4b1...|           3|
|d4ab4aeb-1368-486...|           3|
|bd96cfbe-1532-4ba...|           3|
|a7e6fc04-245f-4e3...|           3|
+--------------------+------------+
only showing top 10 rows



Looks like some users took as much as 3 courses.

## Save into HDFS

In [22]:
df_asmt.write.parquet("/tmp/assessments_hdfs")

In [None]:
# ran this in terminal: 
docker-compose exec cloudera hadoop fs -ls /tmp/

# and get: 
#drwxr-xr-x   - root   supergroup          0 2021-03-08 05:43 /tmp/assessments_hdfs
#drwxrwxrwt   - mapred mapred              0 2018-02-06 18:27 /tmp/hadoop-yarn
#drwx-wx-wx   - root   supergroup          0 2021-03-08 05:40 /tmp/hive

In [27]:
# test read
sqlContext.read.parquet("/tmp/assessments_hdfs").show(10) 

+--------------------+-------------+--------------------+------------------+--------------------+------------------+------------+--------------------+--------------------+--------------------+
|        base_exam_id|certification|           exam_name|   keen_created_at|             keen_id|    keen_timestamp|max_attempts|           sequences|          started_at|        user_exam_id|
+--------------------+-------------+--------------------+------------------+--------------------+------------------+------------+--------------------+--------------------+--------------------+
|37f0a30a-7464-11e...|        false|Normal Forms and ...| 1516717442.735266|5a6745820eb8ab000...| 1516717442.735266|         1.0|[1,[false,2,1,1,4...|2018-01-23T14:23:...|6d4089e4-bde5-4a2...|
|37f0a30a-7464-11e...|        false|Normal Forms and ...| 1516717377.639827|5a674541ab6b0a000...| 1516717377.639827|         1.0|[1,[false,1,2,1,4...|2018-01-23T14:21:...|2fec1534-b41f-441...|
|4beeac16-bb83-4d5...|        false