# Home Exercises
Complete these PySpark exercises to learn Spark basics, using the GitHub events dataset (https://api.github.com/events).


###  Get JSON data from GitHub RESTful service using Python

In [127]:
import requests

# Collect the first three pages of the public GitHub events feed into one flat list.
json_data = []
for page in range(1, 4):
    # The timeout keeps the cell from hanging indefinitely on a stalled connection.
    response = requests.get("https://api.github.com/events", params={'page': page}, timeout=30)
    # Fail loudly on HTTP errors (e.g. rate limiting): an error body is a JSON
    # object, and merging a dict into a list would silently add its key strings.
    response.raise_for_status()
    events = response.json()
    if not isinstance(events, list):
        raise ValueError("Expected a JSON array of events for page %d, got %s"
                         % (page, type(events).__name__))
    json_data.extend(events)

### Task: Write JSON data to .json file
file name: github_events.json

Please provide the code for the following task:

In [145]:
import json

# Serialise the collected events as a single JSON document followed by a newline.
serialized_events = json.dumps(json_data)
with open('github_events.json', 'w') as events_file:
    events_file.write(serialized_events)
    events_file.write('\n')

## Import Pyspark package

In [146]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

### Initialize logs 

In [147]:
import logging
# Configure root logging at INFO with a "time - logger name - level - message" layout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
# Named logger used by the notebook's exercise cells.
logger = logging.getLogger('Spark Home Exercises')

### Initialize SparkSession

In [132]:
# Build (or reuse) the notebook's SparkSession with Hive support enabled.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting — confirm it is needed.
spark = (
    SparkSession.builder
    .appName("Spark Exercise")
    .config("spark.some.config.option", "some-value")
    .enableHiveSupport()
    .getOrCreate()
)

### Read the Github events json file github_events.json

In [133]:
# Load the saved events file with the JSON data source; the schema is inferred from the data.
df = spark.read.format("json").load("github_events.json")

### Task: Show the existing schema on the current DataFrame

Please provide the code for the following task:

In [134]:
# Log a short banner, then print the inferred schema tree of the events DataFrame.
schema_banner = "Let's take a look at the general schema."
logger.info(schema_banner)
df.printSchema()

2019-07-14 01:33:30,437 - Spark Exercise - Main File - INFO - Let's take a look at the general schema.


root
 |-- actor: struct (nullable = true)
 |    |-- avatar_url: string (nullable = true)
 |    |-- display_login: string (nullable = true)
 |    |-- gravatar_id: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- login: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- id: string (nullable = true)
 |-- org: struct (nullable = true)
 |    |-- avatar_url: string (nullable = true)
 |    |-- gravatar_id: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- login: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- payload: struct (nullable = true)
 |    |-- action: string (nullable = true)
 |    |-- before: string (nullable = true)
 |    |-- comment: struct (nullable = true)
 |    |    |-- _links: struct (nullable = true)
 |    |    |    |-- html: struct (nullable = true)
 |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- pull_request: struct (nul

### Task: Print all the data
Please provide the code for the following task:

In [140]:
logger.info("Initial dataframe:")
# show() prints only the first 20 rows by default; passing the row count prints
# every event. Cell values are still truncated to keep the table readable.
df.show(df.count())

2019-07-14 01:36:32,270 - Spark Exercise - Main File - INFO - Initial dataframe:


+--------------------+--------------------+-----------+--------------------+-----------------------+------+--------------------+----------------+
|               actor|          created_at|         id|                 org|                payload|public|                repo|            type|
+--------------------+--------------------+-----------+--------------------+-----------------------+------+--------------------+----------------+
|[https://avatars....|2019-07-13T22:28:19Z|10006379157|                null|   [, 88d7f73fc1d7f1...|  true|[178435208, ablak...|       PushEvent|
|[https://avatars....|2019-07-13T22:28:19Z|10006379156|                null|   [,,,,,,,,, master...|  true|[196771631, TomRo...|     CreateEvent|
|[https://avatars....|2019-07-13T22:28:19Z|10006379155|                null|   [, 14803e30e03a2c...|  true|[196738859, crarn...|       PushEvent|
|[https://avatars....|2019-07-13T22:28:19Z|10006379154|                null|   [, 3606aff0fc22f1...|  true|[196685910, dong-

### Task: Create a DataFrame and register it as a temporary view 'rawTable'
Please provide the code for the following task:

In [136]:
# Register the events DataFrame as the temporary view 'rawTable' so it can be queried with spark.sql.
df.createOrReplaceTempView("rawTable")

### Extract every event whose type is PullRequestEvent from the 'rawTable' view

In [141]:
# Keep only pull-request events by querying the 'rawTable' view with Spark SQL.
q1_sql_statement = "SELECT * FROM rawTable WHERE type = 'PullRequestEvent'"
q1_df = spark.sql(q1_sql_statement)
# Preview the matching events (show() prints the first 20 rows).
q1_df.show(n=20)

+--------------------+--------------------+-----------+--------------------+--------------------+------+--------------------+----------------+
|               actor|          created_at|         id|                 org|             payload|public|                repo|            type|
+--------------------+--------------------+-----------+--------------------+--------------------+------+--------------------+----------------+
|[https://avatars....|2019-07-13T22:28:19Z|10006379140|                null|[opened,,,,,,,,,,...|  true|[169997356, sheki...|PullRequestEvent|
|[https://avatars....|2019-07-13T22:28:18Z|10006379124|                null|[opened,,,,,,,,,,...|  true|[90206982, y-take...|PullRequestEvent|
|[https://avatars....|2019-07-13T22:28:18Z|10006379120|                null|[closed,,,,,,,,,,...|  true|[169802845, Hoish...|PullRequestEvent|
|[https://avatars....|2019-07-13T22:28:17Z|10006379089|                null|[closed,,,,,,,,,,...|  true|[164262051, jonyb...|PullRequestEvent|


### Clean the dataset, selecting only these fields for each event: 
* created_at as created_at, repo.name as repo_name, actor.login as username, 
* payload.pull_request.user.login as pr_username
* payload.pull_request.created_at as pr_created_at, 
* payload.pull_request.head.repo.language as pr_repo_language

In [138]:
# (nested source field, flat output column name) pairs for the cleaned dataset.
pr_field_aliases = [
    ("repo.name", "repo_name"),
    ("actor.login", "username"),
    ("payload.pull_request.user.login", "pr_username"),
    ("payload.pull_request.created_at", "pr_created_at"),
    ("payload.pull_request.head.repo.language", "pr_repo_language"),
]

# Keep the event timestamp as-is and flatten each nested field under its alias.
q2_df = q1_df.select(
    F.col("created_at"),
    *[F.col(source_path).alias(column_name) for source_path, column_name in pr_field_aliases]
)

q2_df.show(n=20)

+--------------------+--------------------+---------------+---------------+--------------------+----------------+
|          created_at|           repo_name|       username|    pr_username|       pr_created_at|pr_repo_language|
+--------------------+--------------------+---------------+---------------+--------------------+----------------+
|2019-07-13T22:28:19Z|        shekit/peeqo|           ascg|           ascg|2019-07-13T22:28:19Z|      JavaScript|
|2019-07-13T22:28:18Z|    y-takey/qiitaros|dependabot[bot]|dependabot[bot]|2019-07-13T22:28:18Z|      JavaScript|
|2019-07-13T22:28:18Z|Hoishin/eslint-co...|        Hoishin|  renovate[bot]|2019-07-13T22:18:05Z|      JavaScript|
|2019-07-13T22:28:17Z|jonybur/f1-teleme...|        jonybur|        jonybur|2019-07-13T22:14:32Z|      TypeScript|
|2019-07-13T22:28:16Z|HelloClick/helloc...|treyssatvincent|treyssatvincent|2019-07-13T21:39:34Z|             PHP|
|2019-07-13T22:28:14Z|          us3r/sigma|      pull[bot]|      pull[bot]|2019-07-13T22

### For each event, add another field, called pr_repo_language_type, based on the following criteria:
* Procedural -> Basic, C
* Object Oriented -> C#, C++, Java, Python
* Functional -> Lisp, Haskell, Scala
* Data Science -> R, Jupyter Notebook, Julia
* Others -> all other languages not mentioned above

In [142]:
# Classify each pull request's repository language into a language family.
# A flat when().when()...otherwise() chain with isin() replaces the nested
# otherwise(when(...)) form; branches are evaluated top to bottom and the
# first match wins. Rows whose language is null or not listed get 'Others'.
pr_language = F.col('pr_repo_language')
q3_df = q2_df.withColumn(
    'pr_repo_language_type',
    F.when(pr_language.isin('Basic', 'C'), F.lit('Procedural'))
     .when(pr_language.isin('C#', 'C++', 'Java', 'Python'), F.lit('Object Oriented'))
     .when(pr_language.isin('Lisp', 'Haskell', 'Scala'), F.lit('Functional'))
     .when(pr_language.isin('R', 'Jupyter Notebook', 'Julia'), F.lit('Data Science'))
     .otherwise(F.lit('Others')),
)
q3_df.show(40)

+--------------------+--------------------+---------------+---------------+--------------------+----------------+---------------------+
|          created_at|           repo_name|       username|    pr_username|       pr_created_at|pr_repo_language|pr_repo_language_type|
+--------------------+--------------------+---------------+---------------+--------------------+----------------+---------------------+
|2019-07-13T22:28:19Z|        shekit/peeqo|           ascg|           ascg|2019-07-13T22:28:19Z|      JavaScript|               Others|
|2019-07-13T22:28:18Z|    y-takey/qiitaros|dependabot[bot]|dependabot[bot]|2019-07-13T22:28:18Z|      JavaScript|               Others|
|2019-07-13T22:28:18Z|Hoishin/eslint-co...|        Hoishin|  renovate[bot]|2019-07-13T22:18:05Z|      JavaScript|               Others|
|2019-07-13T22:28:17Z|jonybur/f1-teleme...|        jonybur|        jonybur|2019-07-13T22:14:32Z|      TypeScript|               Others|
|2019-07-13T22:28:16Z|HelloClick/helloc...|treys