# Docker Code

Here is the whole process of my project with all the used code included without all the mistakes and noise that is included in the history.txt

## Create Docker-Compose.yml

In [None]:
#create file
touch docker-compose.ysl

#edit file
vi docker-compose.ysl

#feel free to bring in images once created and edited (or copy paste a old file and then do pulls.

docker pull midsw205/base:latest
docker pull midsw205/base:0.1.8
docker pull midsw205/base:0.1.9
docker pull redis
docker pull confluentinc/cp-zookeeper:latest
docker pull confluentinc/cp-kafka:latest
docker pull midsw205/spark-python:0.0.5
docker pull midsw205/spark-python:0.0.6
docker pull midsw205/cdh-minimal:latest
docker pull midsw205/hadoop:0.0.2
docker pull midsw205/presto:0.0.1

## Docker Pulls

In [None]:
#make directory
mkdir ~/w205/project-2-oscarcasas

#cd into directory
cd ~/w205/project-2-oscarcasas

#add datafile
curl -L -o assessment-attempts-20180128-121051-nested.json https://goo.gl/ME6hjp

#Prune network, if still not removed reset instance
docker network ls

#check network
docker network ls

#spin dicker up
docker-compose up -d

RETURN**
Creating network "project-2-oscarcasas_default" with the default driver
Creating project-2-oscarcasas_zookeeper_1 ... done
Creating project-2-oscarcasas_cloudera_1  ... done
Creating project-2-oscarcasas_mids_1      ... done
Creating project-2-oscarcasas_spark_1     ... done
Creating project-2-oscarcasas_kafka_1     ... done

#check containers
docker-compose ps

RETURN**
project-2-oscarcasas_cloudera_1    cdh_startup_script.sh       Up      11000/tcp, 11443/tcp, 19888/tcp, 50070/tcp, 8020/tcp,    
                                                                       8088/tcp, 8888/tcp, 9090/tcp                             
project-2-oscarcasas_kafka_1       /etc/confluent/docker/run   Up      29092/tcp, 9092/tcp                                      
project-2-oscarcasas_mids_1        /bin/bash                   Up      8888/tcp                                                 
project-2-oscarcasas_spark_1       docker-entrypoint.sh bash   Up                                                               
project-2-oscarcasas_zookeeper_1   /etc/confluent/docker/run   Up      2181/tcp, 2888/tcp, 32181/tcp, 3888/tcp

## Kafka | Zookeeper | Docker

In [None]:
#Check logs (a lot of logs)
docker-compose logs -f kafka

#Check hadoop
docker-compose exec cloudera hadoop fs -ls /tmp/

RETURN**
Found 2 items
drwxrwxrwt   - mapred mapred              0 2018-02-06 18:27 /tmp/hadoop-yarn
drwx-wx-wx   - root   supergroup          0 2021-11-05 04:51 /tmp/hive

#Create topic Exams from data
docker-compose exec kafka kafka-topics --create --topic Exams --partitions 1 --replication-factor 1 --if-not-exists --bootstrap-server localhost:29092

RETURNS**
Created topic Exams.

#Publish messages to kafka via kafkacat no limiter on number
docker-compose exec mids bash -c "cat assessment-attempts-20180128-121051-nested.json | jq '.[]' -c | kafkacat -P -b kafka:29092 -t Exams"

#Consume the messages 
docker-compose exec mids bash -c "kafkacat -b kafka:29092 -t Exams"

RESULT**
Whole JSON File

# Call pyspark off docker container
docker-compose exec spark pyspark

## Pyspark

In [4]:
#read in kafka messages that were consumed by Exams topic
raw_exams = spark.read.format("kafka").option("kafka.bootstrap.servers", /
"kafka:29092").option("subscribe","Exams").option("startingOffsets", /
"earliest").option("endingOffsets", "latest").load()

#cache it
raw_exams.cache()

#investigate schema
raw_exams.printSchema()

#Cast each message as a string
exams = raw_exams.select(raw_exams.value.cast('string'))

#RETURNS**
#DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]

#write to parquet
exams.write.parquet("/tmp/exams")






SyntaxError: invalid syntax (2677391188.py, line 2)

In [None]:
#from another terminal window look at it on hadoop
docker-compose exec cloudera hadoop fs -ls /tmp/

RESULTS***

Found 3 items
drwxr-xr-x   - root   supergroup          0 2021-11-05 05:07 /tmp/exams
drwxrwxrwt   - mapred mapred              0 2018-02-06 18:27 /tmp/hadoop-yarn
drwx-wx-wx   - root   supergroup          0 2021-11-05 04:51 /tmp/hive
    

docker-compose exec cloudera hadoop fs -ls /tmp/exams/

RESULTS***

Found 2 items
-rw-r--r--   1 root supergroup          0 2021-11-05 05:07 /tmp/exams/_SUCCESS
-rw-r--r--   1 root supergroup    2513397 2021-11-05 05:07 /tmp/exams/part-00000-5ccf8a3d-8d71-44a0-8749-b7931511a5da-c00
0.snappy.parquet

In [5]:
#back in the original terminal
exams.show()

RESULTS**
+--------------------+
|               value|
+--------------------+
|{"keen_timestamp"...|
|{"keen_timestamp"...|
|{"keen_timestamp"...|
|{"keen_timestamp"...|
|{"keen_timestamp"...|
|{"keen_timestamp"...|
|{"keen_timestamp"...|
|{"keen_timestamp"...|
|{"keen_timestamp"...|
|{"keen_timestamp"...|
|{"keen_timestamp"...|
|{"keen_timestamp"...|
|{"keen_timestamp"...|
|{"keen_timestamp"...|
|{"keen_timestamp"...|
|{"keen_timestamp"...|
|{"keen_timestamp"...|
|{"keen_timestamp"...|
|{"keen_timestamp"...|
|{"keen_timestamp"...|
+--------------------+
only showing top 20 rows
  
#Not useful at all need more investigating
print(exams.head())
  
RESULTS***
  
Row(value='{"keen_timestamp":"1516717442.735266","max_attempts":"1.0","started_at":"2018-01-23T14:23:19.082Z","base_exam_id":"37
f0a30a-7464-11e6-aa92-a8667f27e5dc","user_exam_id":"6d4089e4-bde5-4a22-b65f-18bce9ab79c8","sequences":{"questions":[{"user_incom
plete":true,"user_correct":false,"options":[{"checked":true,"at":"2018-01-23T14:23:24.670Z","id":"49c574b4-5c82-4ffd-9bd1-c3358f
af850d","submitted":1,"correct":true},{"checked":true,"at":"2018-01-23T14:23:25.914Z","id":"f2528210-35c3-4320-acf3-9056567ea19f
","submitted":1,"correct":true},{"checked":false,"correct":true,"id":"d1bf026f-554f-4543-bdd2-54dcf105b826"}],"user_submitted":t
rue,"id":"7a2ed6d3-f492-49b3-b8aa-d080a8aad986","user_result":"missed_some"},{"user_incomplete":false,"user_correct":false,"opti
ons":[{"checked":true,"at":"2018-01-23T14:23:30.116Z","id":"a35d0e80-8c49-415d-b8cb-c21a02627e2b","submitted":1},{"checked":fals
e,"correct":true,"id":"bccd6e2e-2cef-4c72-8bfa-317db0ac48bb"},{"checked":true,"at":"2018-01-23T14:23:41.791Z","id":"7e0b639a-2ef
8-4604-b7eb-5018bd81a91b","submitted":1,"correct":true}],"user_submitted":true,"id":"bbed4358-999d-4462-9596-bad5173a6ecb","user
_result":"incorrect"},{"user_incomplete":false,"user_correct":true,"options":[{"checked":false,"at":"2018-01-23T14:23:52.510Z","
id":"a9333679-de9d-41ff-bb3d-b239d6b95732"},{"checked":false,"id":"85795acc-b4b1-4510-bd6e-41648a3553c9"},{"checked":true,"at":"
2018-01-23T14:23:54.223Z","id":"c185ecdb-48fb-4edb-ae4e-0204ac7a0909","submitted":1,"correct":true},{"checked":true,"at":"2018-0
1-23T14:23:53.862Z","id":"77a66c83-d001-45cd-9a5a-6bba8eb7389e","submitted":1,"correct":true}],"user_submitted":true,"id":"e6ad8
644-96b1-4617-b37b-a263dded202c","user_result":"correct"},{"user_incomplete":false,"user_correct":true,"options":[{"checked":fal
se,"id":"59b9fc4b-f239-4850-b1f9-912d1fd3ca13"},{"checked":false,"id":"2c29e8e8-d4a8-406e-9cdf-de28ec5890fe"},{"checked":false,"
id":"62feee6e-9b76-4123-bd9e-c0b35126b1f1"},{"checked":true,"at":"2018-01-23T14:24:00.807Z","id":"7f13df9c-fcbe-4424-914f-2206f1
06765c","submitted":1,"correct":true}],"user_submitted":true,"id":"95194331-ac43-454e-83de-ea8913067055","user_result":"correct"
}],"attempt":1,"id":"5b28a462-7a3b-42e0-b508-09f3906d1703","counts":{"incomplete":1,"submitted":4,"incorrect":1,"all_correct":fa
lse,"correct":2,"total":4,"unanswered":0}},"keen_created_at":"1516717442.735266","certification":"false","keen_id":"5a6745820eb8
ab00016be1f1","exam_name":"Normal Forms and All That Jazz Master Class"}')

#impor json
import json

#look at one value
Peak = json.loads(exams.select('value').take(1)[0].value)
    
#investigate
Peak['sequences']['counts']

RETURNS**
{'incomplete': 1, 'submitted': 4, 'incorrect': 1, 'all_correct': False, 'correct': 2, 'total': 4, 'unanswered': 0}
    
#further
Peak['sequences']['counts']['correct']
    
RETURNS**
2

#Deal with unicode
import sys
sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf8', buffering=1)
    
#check
print(sys.stdout.encoding)

RETURNS**
utf8
    

SyntaxError: invalid syntax (442237244.py, line 4)

In [None]:
#NOTE:This led me through a very long journey to nowhere, I have no doubt there is a way to do it 
#I just couldn't figure it out and will reflect that in my report.
#This solution creates a easier (slower run if big data) to interpret list of lists which can be converted to a dataframe easily
df = pd.DataFrame(columns=['Class', 'User_id', 'Correct', 'Total'])
for i in range(exams.count()):
    df.loc[i] = [json.loads(exams.select('value').take(3280)[i+1].value)['exam_name'],
               json.loads(exams.select('value').take(3280)[i+1].value)['user_exam_id'],
               json.loads(exams.select('value').take(3280)[i+1].value)['sequences']['counts']['correct'],
               json.loads(exams.select('value').take(3280)[i+1].value)['sequences']['counts']['total']]
    print(i)

#Make Parquet File delete
parquet_df = spark.createDataFrame(df,["Class","User_id","Correct","Total"])

#Write Parquet
parquet_df.write.parquet("/tmp/pass_class")

#Investigate
parquet_df.show()


+--------------------+--------------------+-------+-----+
|               Class|             User_id|Correct|Total|
+--------------------+--------------------+-------+-----+
|Normal Forms and ...|6d4089e4-bde5-4a2...|      2|    4|
|Normal Forms and ...|6d4089e4-bde5-4a2...|      2|    4|
|Normal Forms and ...|6d4089e4-bde5-4a2...|      2|    4|
|Normal Forms and ...|6d4089e4-bde5-4a2...|      2|    4|
|Normal Forms and ...|6d4089e4-bde5-4a2...|      2|    4|
|Normal Forms and ...|6d4089e4-bde5-4a2...|      2|    4|
|Normal Forms and ...|6d4089e4-bde5-4a2...|      2|    4|
|Normal Forms and ...|6d4089e4-bde5-4a2...|      2|    4|
|Normal Forms and ...|6d4089e4-bde5-4a2...|      2|    4|
|Normal Forms and ...|6d4089e4-bde5-4a2...|      2|    4|
|Normal Forms and ...|6d4089e4-bde5-4a2...|      2|    4|
|Normal Forms and ...|6d4089e4-bde5-4a2...|      2|    4|
|Normal Forms and ...|6d4089e4-bde5-4a2...|      2|    4|
|Normal Forms and ...|6d4089e4-bde5-4a2...|      2|    4|
|Normal Forms and ...|6d4089e4-bde5-4a2...|      2|    4|
|Normal Forms and ...|6d4089e4-bde5-4a2...|      2|    4|
|Normal Forms and ...|6d4089e4-bde5-4a2...|      2|    4|
|Normal Forms and ...|6d4089e4-bde5-4a2...|      2|    4|
|Normal Forms and ...|6d4089e4-bde5-4a2...|      2|    4|
|Normal Forms and ...|6d4089e4-bde5-4a2...|      2|    4|
+--------------------+--------------------+-------+-----+

#clearly it was summing up by the wrong iterable in the json, but we can just fix that by grouping by the User_id


parquet_df.registerTempTable('commits')



In [None]:
docker-compose exec cloudera hadoop fs -ls /tmp/
Found 5 items
drwxr-xr-x   - root   supergroup          0 2021-11-05 05:59 /tmp/ex_exams
drwxr-xr-x   - root   supergroup          0 2021-11-05 05:07 /tmp/exams
drwxrwxrwt   - mapred mapred              0 2018-02-06 18:27 /tmp/hadoop-yarn
drwx-wx-wx   - root   supergroup          0 2021-11-05 04:51 /tmp/hive
drwxr-xr-x   - root   supergroup          0 2021-11-05 07:27 /tmp/pass_class
jupyter@tensorflow-2-3-20210907-151006:~/w205/project-2-oscarcasas$ docker-compose exec cloudera hadoop fs -ls /tmp/p
ass_class
Found 5 items
-rw-r--r--   1 root supergroup          0 2021-11-05 07:27 /tmp/pass_class/_SUCCESS
-rw-r--r--   1 root supergroup       1441 2021-11-05 07:27 /tmp/pass_class/part-00000-e9cf588c-802c-4068-857c-1422eb3
d7c4f-c000.snappy.parquet
-rw-r--r--   1 root supergroup       1441 2021-11-05 07:27 /tmp/pass_class/part-00001-e9cf588c-802c-4068-857c-1422eb3
d7c4f-c000.snappy.parquet
-rw-r--r--   1 root supergroup       1441 2021-11-05 07:27 /tmp/pass_class/part-00002-e9cf588c-802c-4068-857c-1422eb3
d7c4f-c000.snappy.parquet
-rw-r--r--   1 root supergroup       1441 2021-11-05 07:27 /tmp/pass_class/part-00003-e9cf588c-802c-4068-857c-1422eb3
d7c4f-c000.snappy.parquet

In [None]:
exams.rdd.map(lambda x: json.loads(exams.select('value').take(x)[0].value)['exam_name']).toDF().show()

In [None]:
import json

exams.rdd.map(lambda x: json.loads(x.value)).toDF().show()

ex_exams = exams.rdd.map(lambda x: json.loads(x.value)).toDF()

#output big dataframes with a lot of information we dont need

ex_exams.printSchema()

RESULT**

|-- base_exam_id: string (nullable = true)
 |-- certification: string (nullable = true)
 |-- exam_name: string (nullable = true)
 |-- keen_created_at: string (nullable = true)
 |-- keen_id: string (nullable = true)
 |-- keen_timestamp: string (nullable = true)
 |-- max_attempts: string (nullable = true)
 |-- sequences: map (nullable = true)
 |    |-- key: string
 |    |-- value: array (valueContainsNull = true)
 |    |    |-- element: map (containsNull = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: boolean (valueContainsNull = true)
 |-- started_at: string (nullable = true)
 |-- user_exam_id: string (nullable = true)

#save to parquet
ex_exams.write.parquet("/tmp/ex_exams")

#Registed the table 
ex_exams.registerTempTable('exam_com')

#after all queries are coded exit pyspark and close docker-compose
exit()

docker-compose down