#### Imports and Data extracts from GitHub-Archive

In [2]:
import requests
import gzip
import json
import traceback


from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql import types as T

In [3]:
# events_list is a module-level list of dicts. extract_data() appends one
# parsed JSON event to it for every line of each downloaded GitHub-Archive file.
events_list = []


def extract_data(date="2015-01-01", hours=range(12), timeout=60):
    """Download hourly GH Archive dumps and load their events into events_list.

    For every hour in ``hours`` the gzip-compressed, line-delimited JSON file
    for ``date`` is downloaded, saved in the current working directory under
    its original file name, and each line is parsed and appended to the
    module-level ``events_list``.

    Args:
        date: Day to download, formatted as ``YYYY-MM-DD``. Defaults to the
            day the notebook analyses.
        hours: Iterable of hour numbers to download. Defaults to the first
            twelve hours of the day.
        timeout: Seconds to wait for the server before giving up on a request.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    for hour in hours:
        url = 'http://data.gharchive.org/' + date + '-' + str(hour) + '.json.gz'
        # A timeout keeps a stalled download from hanging the notebook cell.
        r = requests.get(url, timeout=timeout)
        # Fail here with a clear HTTP error instead of saving an error page
        # to disk and crashing later inside gzip with a confusing message.
        r.raise_for_status()
        file_path = "./" + url.split("/")[-1]

        with open(file_path, 'wb') as f:
            f.write(r.content)

        with gzip.open(file_path, 'rb') as f:
            for line in f:
                # Skip blank lines so they cannot break json.loads.
                if not line.strip():
                    continue
                events_list.append(json.loads(line.decode("utf-8")))


In [4]:
# Run the download/parse step; this fills the module-level events_list.
extract_data()

###### master_df : Master Dataframe with the raw data from GitHub extract.

In [6]:
# Creating a DataFrame directly from events_list would require an explicit
# schema, and the events are nested more than 7 layers deep with hundreds of
# fields. Instead the events are dumped as JSON into DBFS (DataBricks File
# System) and read back, letting Spark infer the schema automatically.

# Dump the archive extract into DBFS as JSON Lines (one event per line).
# Spark's JSON reader is line-delimited by default, so this avoids handing it
# a single enormous line containing the whole array.
file_path = "/dbfs/mnt/2015-01-01.json"
with open(file_path, 'w') as fout:
    fout.writelines(json.dumps(event) + "\n" for event in events_list)

# Create the DataFrame from the JSON file (same location, Spark-style path).
file_path = "dbfs:/mnt/2015-01-01.json"
master_df = spark.read.format("json").load(file_path)

In [7]:
# master_df.printSchema()
# master_df.count()

#### Table 1: main_df
###### main_df is created from master_df with these columns:

* actor_id
* org_id
* event_id
* created_at
* repo_id
* event_type

In [9]:
# Flatten the nested identifiers of each event into top-level columns.
# Each pair is (source field in master_df, column name in main_df).
_main_fields = [
    ("actor.id", "actor_id"),
    ("org.id", "org_id"),
    ("id", "event_id"),
    ("created_at", "created_at"),
    ("repo.id", "repo_id"),
    ("type", "event_type"),
]
main_df = master_df.select(*[F.col(_src).alias(_dst) for _src, _dst in _main_fields])
# main_df.show()

##### Data wrangling & exploration.
* Cast the id columns to long and the hour column to integer.
* Extracted the hour from the `created_at` timestamp into a `created_hour` column.
* Imputed missing values with sentinel values (`-1` for integer fields, `"Missing"` for string fields).

In [11]:
# Pull the hour out of created_at: take the text after "T", then the text
# before the first ":".
# NOTE(review): assumes created_at looks like "YYYY-MM-DDTHH:MM:SSZ" - confirm.
_time_part = F.split(F.col("created_at"), "T").getItem(1)
_hour_part = F.split(_time_part, ":").getItem(0)
main_df = main_df.withColumn("created_hour", _hour_part).drop("created_at")

# Identifier columns become longs; the hour becomes an integer.
for _id_column in ("actor_id", "org_id", "event_id", "repo_id"):
    main_df = main_df.withColumn(_id_column, F.col(_id_column).cast(T.LongType()))
main_df = main_df.withColumn("created_hour", F.col("created_hour").cast(T.IntegerType()))

# Replace nulls with sentinels: -1 for numeric columns, "Missing" for strings.
_null_defaults = {
    'actor_id': -1,
    'org_id': -1,
    'event_id': -1,
    'repo_id': -1,
    'event_type': "Missing",
    'created_hour': -1,
}
main_df = main_df.fillna(_null_defaults)

# print(main_df.show())
# print(main_df.printSchema())
# print(main_df.describe().show())


##### Data Aggregation on main_df

In [13]:
# Print the number of distinct values for each key column of main_df.
for _label, _column in (
    ("Unique Actors: ", 'actor_id'),
    ("Unique Orgs: ", 'org_id'),
    ("Unique Repos: ", 'repo_id'),
    ("Unique Event Types: ", 'event_type'),
):
    print(_label, main_df.select(_column).distinct().count())

# collect() already returns a list of Row objects, one per distinct event type.
event_types = main_df.select('event_type').distinct().collect()
print(event_types)

In [14]:
# Number of events recorded for each event type.
_events_per_type = main_df.groupby("event_type").agg(F.count("event_id"))
_events_per_type.show()

In [15]:
# Percentage of distinct actors that appear on at least one event carrying an
# organization id (org_id != -1), out of all distinct actors.

_org_actor_count = main_df.filter(main_df.org_id != -1).select('actor_id').distinct().count()
_all_actor_count = main_df.select('actor_id').distinct().count()
print((_org_actor_count / _all_actor_count) * 100)

In [16]:
# Percentage of distinct repos that appear on at least one event carrying an
# organization id (org_id != -1), out of all distinct repos.

_org_repo_count = main_df.filter(main_df.org_id != -1).select('repo_id').distinct().count()
_all_repo_count = main_df.select('repo_id').distinct().count()
print((_org_repo_count / _all_repo_count) * 100)

#### Table 2: org_df
###### org_df has the columns:
* org_id
* name
* org_url

In [18]:
# One row per organization id, taken from the org struct of every event.
_org_columns = [
    F.col("org.id").alias("org_id"),
    F.col("org.login").alias("name"),
    F.col("org.url").alias("org_url"),
]
org_df = master_df.select(*_org_columns).dropDuplicates(["org_id"])

# Events without an organization collapse into a single sentinel row.
_org_defaults = {'org_id': -1, 'name': "Missing", 'org_url': "Missing"}
org_df = org_df.fillna(_org_defaults)

# org_df.printSchema()
# org_df.describe().show()
# org_df.sample(fraction = 0.01).show()

#### payload_df
###### payload_df is the most important table: it holds the event payloads, including repository data, with 19 top-level fields and hundreds of nested fields

In [20]:
# Keep the event id and expand every field of the payload struct into its own
# column, so the nested fields can be explored for ones of interest.
_event_id_column = F.col("id").alias("event_id")
_payload_columns = F.col("payload.*")
payload_df = master_df.select(_event_id_column, _payload_columns)

# payload_df.printSchema()
# payload_df.show()
# payload_df.describe().show()

#### Table 3: repo_df
###### From payload_df, repository details are of interest to us. 
###### So repo_df with below columns is created
* event_id
* repo_name
* org_url
* repo_id
* watchers_count
* forks_count
* language
* pull_req_id

In [22]:
# Repository details come from the base repo of each pull request payload.
_repo_columns = [
    F.col("event_id").alias("event_id"),
    F.col("pull_request.base.repo.name").alias("repo_name"),
    F.col("pull_request.base.repo.owner.organizations_url").alias("org_url"),
    F.col("pull_request.base.repo.id").alias("repo_id"),
    F.col("pull_request.base.repo.watchers_count"),
    F.col("pull_request.base.repo.forks_count"),
    F.col("pull_request.base.repo.language"),
    F.col("pull_request.id").alias("pull_req_id"),
]

# dropna() removes every row that has a null in any of the selected columns.
repo_df = payload_df.select(*_repo_columns).dropna()
repo_df = repo_df.withColumn("event_id", F.col("event_id").cast(T.LongType()))

# repo_df.show()
# repo_df.printSchema()
# repo_df.describe().show()

In [23]:
# Print summary statistics for the columns of repo_df.
repo_df.describe().show()

##### Aggregating and exploring the repo_df

In [25]:
# Print the number of distinct values for each key column of repo_df.
for _label, _column in (
    ("Unique Pull Reqs: ", "pull_req_id"),
    ("Unique Repos: ", "repo_id"),
    ("Unique Events: ", "event_id"),
    ("Unique Languages: ", "language"),
):
    print(_label, repo_df.select(F.col(_column)).distinct().count())

# collect() already returns a list of Row objects, one per distinct language.
languages = repo_df.select(F.col("language")).distinct().collect()
# print(languages)

###### Creating top500_repos_df from repo_df, main_df, org_df

In [27]:
# Distinct (repo_id, org_id) pairs observed in main_df.
repo_org_df = main_df.select("repo_id", "org_id").dropDuplicates()

# Register both DataFrames as temp views so they can be joined with Spark SQL.
repo_df.createOrReplaceTempView("repo")
repo_org_df.createOrReplaceTempView("repo_org")

# Attach the organization id to every repo_df row with a matching repo_id.
_repo_with_org_sql = "select o.org_id, r.* from repo r INNER JOIN repo_org o ON r.repo_id == o.repo_id"
temp1 = spark.sql(_repo_with_org_sql)

In [28]:
# Register the inputs of the next join as temp views.
org_df.createOrReplaceTempView("org")
temp1.createOrReplaceTempView("temp1")

# Attach the organization name to every temp1 row with a matching org_id.
_repo_with_org_name_sql = "select o.name, t.* from temp1 t INNER JOIN org o ON t.org_id == o.org_id"
temp2 = spark.sql(_repo_with_org_name_sql)

In [29]:
# Top 500 organization-owned repositories, ranked by watchers and then forks.
#
# A single orderBy with both keys is used deliberately: chaining two orderBy
# calls makes the second sort replace the first, so the forks_count ordering
# would be discarded rather than used as a tie-breaker.
#
# NOTE(review): temp2 appears to hold one row per pull-request event, so the
# sums below grow with the number of events per repo - confirm whether max()
# of the counts would be the intended popularity measure.
top500_repos_df = temp2.where(F.col("name") != "Missing")\
                       .groupby("repo_id")\
                       .agg({'watchers_count': 'sum', 'forks_count': 'sum'})\
                       .withColumnRenamed('sum(watchers_count)', 'watchers_count')\
                       .withColumnRenamed('sum(forks_count)', 'forks_count')\
                       .orderBy(['watchers_count', 'forks_count'], ascending=[False, False])\
                       .limit(500)

#top500_repos_df.show()

###### Creating top10_orgs_df from top500_repos_df, org_df and aggregating it by organization

In [31]:
# Register the inputs of the owner lookup as temp views.
temp2.createOrReplaceTempView("temp2")
top500_repos_df.createOrReplaceTempView("top500_repos_df")

# Attach the owning organization's name to each top-500 repository row.
_repo_owner_sql = "select t2.name, t1.* from top500_repos_df t1 INNER JOIN temp2 t2 ON t1.repo_id == t2.repo_id"
repo_owner_df = spark.sql(_repo_owner_sql)

In [32]:
# Ten organizations owning the most repositories in repo_owner_df.
#
# Distinct repo ids are counted because repo_owner_df can contain the same
# repository on several rows; a plain count would measure joined rows, not
# the number of popular repositories per organization.
top10_orgs_df = repo_owner_df.groupby("name")\
                             .agg(F.countDistinct("repo_id").alias("popular_repo_count"))\
                             .orderBy('popular_repo_count', ascending=False)\
                             .limit(10)

# top10_orgs_df.show()

###### Dumping the DataFrames as csv files to create visualizations in Tableau.

In [34]:
# Export each DataFrame as CSV for Tableau.
# * Spark writes use the "dbfs:/" scheme, matching the path style used when
#   the JSON extract is loaded; "/dbfs/..." is the local-file form used with
#   open().
# * header=True writes the column names so Tableau can label the fields.
# * mode="overwrite" lets the cell be re-run without failing on existing output.
for _frame, _csv_name in (
    (main_df, 'main'),
    (repo_df, 'repo'),
    (org_df, 'org'),
    (top500_repos_df, 'top500_repos'),
    (top10_orgs_df, 'top10_orgs'),
):
    _frame.write.csv('dbfs:/mnt/' + _csv_name + '.csv', header=True, mode='overwrite')

##### A few event related questions can be answered from the main_df table 


* ###### There are more than 20 possible types of GitHub events (as per the GitHub Archive data). In the 12 hours of data analyzed, there are 14 distinct event types. 
* ###### The Data summary on repo, user, organization and event level can also be captured.
* ###### What are the top events by volume? 
* ###### Do the top 5 events differ between org_users and individual_users?

##### Insights about Active users - individual and org_users - can be answered from main_df and org_df table. 

* ###### There are two types of user segments in GitHub-Archive data (The archive doesn't contain any details about GitHub Enterprise). 
* ###### Org_users are part of an organization that pays to collaborate with its team through GitHub. Individual users are the other type of user, who do not pay for the GitHub service. 
* ###### How do the event activity types and volumes differ between org_users and individual_users? 
* ###### Do org_users or individual users trigger more events?

##### Information about popular repositories, languages and repo owners can be found from repo_df.

* ###### What are the most popular repositories in GitHub? 
* ###### How do we decide the popular repositories? 
* ###### Which organizations own the most popular repositories?
* ###### Which languages are popular amongst the developers?