Dataset website : https://ghtorrent.org/downloads.html

In [1]:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("GHTorrent_log_analytics").setMaster("local[*]")
sc = SparkContext(conf=conf)
print(sc)

22/11/06 16:54:44 WARN Utils: Your hostname, pc resolves to a loopback address: 127.0.1.1; using 192.168.170.52 instead (on interface wlp3s0)
22/11/06 16:54:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/06 16:54:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/11/06 16:54:46 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
<SparkContext master=local[*] appName=GHTorrent_log_analytics>


In [2]:
# Read and load data to spark
rdd = sc.textFile("./data/ghtorrent-logs.txt.gz")

In [3]:
# Repartition data
print("Before repartition : ", rdd.getNumPartitions())
rdd = rdd.repartition(8) # shuffle all data
print("After repartition : ", rdd.getNumPartitions())
print(sc.defaultParallelism)

Before repartition :  1
After repartition :  8
8


In [4]:
# Count the number of records and get twenty records randomly
print("Number of records : ", rdd.count())
rdd.takeSample(False, 20, 1234)

                                                                                

Number of records :  9669788


                                                                                

['DEBUG, 2017-03-23T12:01:39+00:00, ghtorrent-3 -- retriever.rb: Commit aselimkaya/AddingJsonFileToDB -> 57d70de5dc5c9346cfd2a567c449caab0b5fe31b exists',
 'DEBUG, 2017-03-23T10:30:37+00:00, ghtorrent-8 -- ght_data_retrieval.rb: Processing event: PullRequestEvent-5531790485',
 'WARN, 2017-03-23T11:03:54+00:00, ghtorrent-29 -- ghtorrent.rb: Found user MCFBDBIA with same email greenkeeper[bot]@users.noreply.github.com as non existing user greenkeeper[bot]. Assigning user greenkeeper[bot] to MCFBDBIA',
 'INFO, 2017-03-23T10:34:58+00:00, ghtorrent-28 -- api_client.rb: Successful request. URL: https://api.github.com/repos/camphor-/hanreki/pulls/8/commits?per_page=100, Remaining: 298, Total: 96 ms',
 'WARN, 2017-03-23T10:44:45+00:00, ghtorrent-43 -- ght_data_retrieval.rb: Error processing event. Type: CreateEvent, ID: 5531188992, Time: 51461 ms',
 'DEBUG, 2017-03-23T11:11:11+00:00, ghtorrent-29 -- retriever.rb: Commit arungmca/deeplearning4j -> 73ce276ce08d5e4adde55d949105834cc3008217 exists

In [5]:
# Get the number of lines with both Transaction and Repo information

import re
def collect_words(line):
    return re.compile('\w+').findall(line.lower())

rdd_transaction = rdd.filter(lambda x: "transaction" in collect_words(x))
rdd_repo = rdd.filter(lambda x: "repo" in collect_words(x))

rdd_intersect = rdd_transaction.intersection(rdd_repo)
rdd_intersect.count()

                                                                                

19

In [6]:
rdd_intersect.collect()

                                                                                

['DEBUG, 2017-03-23T09:13:17+00:00, ghtorrent-9 -- ghtorrent.rb: Repo xuminwlt/tcc-transaction exists',
 'DEBUG, 2017-03-23T09:13:16+00:00, ghtorrent-9 -- ghtorrent.rb: Repo xuminwlt/tcc-transaction exists',
 'DEBUG, 2017-03-23T09:13:17+00:00, ghtorrent-9 -- retriever.rb: Repo changmingxie -> tcc-transaction exists',
 'DEBUG, 2017-03-23T09:27:31+00:00, ghtorrent-24 -- ghtorrent.rb: Repo changmingxie/tcc-transaction exists',
 'DEBUG, 2017-03-23T13:03:33+00:00, ghtorrent-42 -- ghtorrent.rb: Repo jwpttcg66/redis-game-transaction exists',
 'DEBUG, 2017-03-23T09:13:26+00:00, ghtorrent-9 -- ghtorrent.rb: Repo xuminwlt/tcc-transaction exists',
 'DEBUG, 2017-03-23T11:09:37+00:00, ghtorrent-1 -- ghtorrent.rb: Repo pilkyoon/spring-transaction exists',
 'DEBUG, 2017-03-23T09:13:16+00:00, ghtorrent-9 -- ghtorrent.rb: Repo changmingxie/tcc-transaction exists',
 'INFO, 2017-03-23T09:13:16+00:00, ghtorrent-9 -- retriever.rb: Added repo xuminwlt -> tcc-transaction',
 'DEBUG, 2017-03-23T09:13:17+00:00,

In [7]:
# Get the number of lines including we link for warn logging levels

def get_urls(line):
    regex = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    return re.findall(regex, line)

rdd.filter(lambda x: x.split(",")[0] == 'WARN') \
    .filter(lambda x: len(get_urls(x)) > 1 ) \
    .count()


                                                                                

57

In [8]:
# The most active downloader id for Failed connections
rdd_failed =  rdd.filter(lambda x: 'failed' in collect_words(x))

rdd_active = rdd_failed.map(lambda x: (x.replace(' --',',').split(',')[2].split('-')[1],1))
rdd_active.reduceByKey(lambda x,y: x+y).sortBy(lambda x:x[1], ascending=False).first()

                                                                                

('13', 79654)

In [9]:
# The most active repository

def get_words(line):
    return re.compile(' \w+ ').findall(line.lower())
    
rdd.filter(lambda x: ' repo ' in get_words(x)) \
    .map(lambda x: (x.lower().split('repo ')[1].split(" ")[0],1)) \
    .reduceByKey(lambda x,y: x+y) \
    .sortBy(lambda x: x[1], ascending= False) \
    .first()

                                                                                

('ovyx/hammerheadn', 22447)

In [10]:
# Get the number of Failed HTTP requests per hour.
def get_hour(line):
    return line.split(", ")[1].split("T")[1].split(":")[0]

rdd.filter(lambda x: 'failed' in collect_words(x)) \
            .map(lambda x: (get_hour(x),1)) \
            .reduceByKey(lambda x,y: x+y) \
            .sortByKey() \
            .collect()

                                                                                

[('00', 5088),
 ('01', 5088),
 ('02', 98),
 ('03', 98),
 ('04', 100),
 ('05', 100),
 ('06', 98),
 ('07', 100),
 ('08', 100),
 ('09', 5325),
 ('10', 9991),
 ('11', 10567),
 ('12', 7500),
 ('13', 5712),
 ('14', 5144),
 ('15', 5090),
 ('16', 5090),
 ('17', 5090),
 ('18', 5090),
 ('19', 5092),
 ('20', 5772),
 ('21', 5091),
 ('22', 5092),
 ('23', 5088)]