In [12]:
import json
import pandas as pd
import gzip
import os
from collections import Counter
from github import Github
import datetime

In [10]:
# Load every public GitHub event recorded by GH Archive on 2018-07-16.
# Files from: wget http://data.gharchive.org/2018-07-16-{0..23}.json.gz
# i.e. one gzipped, newline-delimited JSON archive per hour (24 files).
#
# The directory holding the archives can be overridden with the
# GHARCHIVE_DATA_DIR environment variable; the default is the original
# author's local checkout.
data_dir = os.environ.get('GHARCHIVE_DATA_DIR', '/Users/amy/git/MetaGithub/data')

events = []
for hour in range(24):
    archive_path = os.path.join(data_dir, '2018-07-16-%d.json.gz' % hour)
    # gzip.open defaults to binary mode; json.loads accepts bytes (Python 3.6+)
    with gzip.open(archive_path) as f:
        for line in f:
            line = line.strip()
            if line:  # skip blank lines instead of failing in json.loads
                events.append(json.loads(line))

In [11]:
# Tally how many events of each type occurred during the loaded window
Counter(evt['type'] for evt in events)

Counter({'CommitCommentEvent': 2262,
         'CreateEvent': 183861,
         'DeleteEvent': 30330,
         'ForkEvent': 38550,
         'GollumEvent': 6843,
         'IssueCommentEvent': 93487,
         'IssuesEvent': 48759,
         'MemberEvent': 6089,
         'PublicEvent': 1402,
         'PullRequestEvent': 81249,
         'PullRequestReviewCommentEvent': 29417,
         'PushEvent': 698124,
         'ReleaseEvent': 5771,
         'WatchEvent': 107964})

In [69]:
# Peek at the first loaded event; the trailing semicolon suppresses the cell output
events[0];

In [13]:
# Collect every WatchEvent.
# Note: a WatchEvent is emitted when a repository is *starred*, not watched.
starred = [evt for evt in events if evt['type'] == 'WatchEvent']

# Number of star events per repository during this window:
# maps repo name --> count of star events.
# Counter keeps first-seen key order; dict() hands back a plain dictionary.
repo2count = dict(Counter(star['repo']['name'] for star in starred))


In [103]:
# Keep the names of repos that were starred strictly more than k times
# during the loaded window.
k = 1000
popular_repos = [name for name, star_count in repo2count.items() if star_count > k]

len(popular_repos)

2

In [106]:
# For each popular repo, tally the reactions left on its issues: the number of
# (+1, -1, laugh, confused, heart, hooray) reactions, the number of issues
# with no reaction at all (no_reaction), and the number of issues seen
# (num_issues).

# SECURITY: a personal access token was previously hardcoded on this line.
# Treat that token as leaked and revoke it. The token is now read from the
# GITHUB_TOKEN environment variable so it never lands in the notebook.
github_token = os.environ.get('GITHUB_TOKEN')
if not github_token:
    raise RuntimeError(
        'Set the GITHUB_TOKEN environment variable to a GitHub personal access token'
    )
g = Github(github_token)

# map repo name --> counts of [+1, -1, laugh, confused, heart, hooray, no_reaction, num_issues]
repo2info = {}
my_date = datetime.datetime(year=2018, month=6, day=1)  # starting time from which to collect data

for repo_name in popular_repos:

    # get the repository
    repo = g.get_repo(repo_name)

    # get the issues for this repository
    # NOTE(review): presumably this returns only open issues and also includes
    # pull requests (GitHub API defaults) -- confirm that is what is wanted.
    issues = repo.get_issues(since=my_date)

    # initialize count dictionary for this repo
    info = {'+1': 0, '-1': 0, 'laugh': 0, 'confused': 0, 'heart': 0, 'hooray': 0, 'no_reaction': 0, 'num_issues': 0}
    repo2info[repo_name] = info

    # for each issue, get reactions
    for issue in issues:

        # get reactions for this issue
        reactions = issue.get_reactions()

        # count each reaction; .get() tolerates reaction types that are not in
        # the initial dictionary (GitHub also defines e.g. 'rocket' and 'eyes')
        # instead of raising KeyError
        num_reactions = 0
        for r in reactions:
            info[r.content] = info.get(r.content, 0) + 1
            num_reactions += 1

        # no reactions found (counted locally rather than by reading the
        # paginated list's private, name-mangled element cache)
        if num_reactions == 0:
            info['no_reaction'] += 1

        # increment number of issues
        info['num_issues'] += 1

            

In [107]:
# Display the per-repo reaction tallies collected in the previous cell
repo2info

{'KieSun/InterviewMap': {'+1': 3,
  '-1': 0,
  'confused': 0,
  'heart': 0,
  'hooray': 0,
  'laugh': 0,
  'no_reaction': 7,
  'num_issues': 10},
 'donnemartin/system-design-primer': {'+1': 26,
  '-1': 0,
  'confused': 0,
  'heart': 0,
  'hooray': 0,
  'laugh': 0,
  'no_reaction': 11,
  'num_issues': 18}}