In [5]:
import json
import gzip
import os
from collections import Counter
from github import Github
import datetime
import sys
import pickle
from matplotlib import pyplot as plt
import pandas as pd

In [6]:
## load archive data
# Source files: wget http://data.gharchive.org/2018-07-16-{0..23}.json.gz
# i.e. the GH Archive dumps for the 24 hours of 2018-07-16: one gzip file per
# hour, one JSON-encoded event per line.
events = []
for hour in range(0, 24):
    # Open in text mode with an explicit encoding so json.loads receives str
    # instead of raw bytes (bytes input needs Python 3.6+ and relies on
    # encoding auto-detection).
    with gzip.open('/Users/amy/git/MetaGithub/data/2018-07-16-%d.json.gz' % hour,
                   'rt', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:  # skip blank lines instead of failing in json.loads
                events.append(json.loads(line))

In [7]:
# tally how many events of each type were loaded
Counter(evt['type'] for evt in events)

Counter({'CommitCommentEvent': 2262,
         'CreateEvent': 183861,
         'DeleteEvent': 30330,
         'ForkEvent': 38550,
         'GollumEvent': 6843,
         'IssueCommentEvent': 93487,
         'IssuesEvent': 48759,
         'MemberEvent': 6089,
         'PublicEvent': 1402,
         'PullRequestEvent': 81249,
         'PullRequestReviewCommentEvent': 29417,
         'PushEvent': 698124,
         'ReleaseEvent': 5771,
         'WatchEvent': 107964})

In [21]:
# display the first loaded event to inspect the structure of a record
events[0]

{'actor': {'avatar_url': 'https://avatars.githubusercontent.com/u/8587468?',
  'display_login': 'dhruvdhody-huawei',
  'gravatar_id': '',
  'id': 8587468,
  'login': 'dhruvdhody-huawei',
  'url': 'https://api.github.com/users/dhruvdhody-huawei'},
 'created_at': '2018-07-16T00:00:00Z',
 'id': '7968171066',
 'payload': {'before': '28d1b1ea1ac223abc32f10615481774665fbc374',
  'commits': [{'author': {'email': 'c1299ee609067fe82b3838a8d655fb28525d1b02@gmail.com',
     'name': 'Dhruv Dhody'},
    'distinct': True,
    'message': 'Add files via upload',
    'sha': '966109e8910165dc2e16e4831596cff2249ebc7b',
    'url': 'https://api.github.com/repos/dhruvdhody-huawei/102/commits/966109e8910165dc2e16e4831596cff2249ebc7b'}],
  'distinct_size': 1,
  'head': '966109e8910165dc2e16e4831596cff2249ebc7b',
  'push_id': 2721067372,
  'ref': 'refs/heads/master',
  'size': 1},
 'public': True,
 'repo': {'id': 139835450,
  'name': 'dhruvdhody-huawei/102',
  'url': 'https://api.github.com/repos/dhruvdhody-hu

In [22]:
# Select the WatchEvents. Despite the name, a WatchEvent is emitted when a
# repository is starred, not when it is watched.
starred = [evt for evt in events if evt['type'] == 'WatchEvent']

# Tally the star events per repository for this time window.
# repo2count: repo name --> number of star events
repo2count = {}
for event in starred:
    repo_name = event['repo']['name']
    # first occurrence starts from 0, later ones add to the running total
    repo2count[repo_name] = repo2count.get(repo_name, 0) + 1


In [4]:
# Select the CommitCommentEvents.
comment_events = [evt for evt in events if evt['type'] == 'CommitCommentEvent']

# Group the comment bodies by repository for this time window.
# repo2comments: repo name --> list of comment bodies, in event order
repo2comments = {}
for event in comment_events:
    repo_name = event['repo']['name']
    comment = event['payload']['comment']['body']
    # setdefault creates the list the first time a repo is seen
    repo2comments.setdefault(repo_name, []).append(comment)
    
    

In [6]:
# Save the repo2comments dictionary (repo name --> list of commit comment bodies).
# The `with` block guarantees the file is flushed and closed even if
# pickle.dump raises part-way through.
with open('/Users/amy/git/MetaGithub/data/repo2commitComments.7.16.2018.pickle', 'wb') as my_file:
    pickle.dump(repo2comments, my_file)

In [23]:
# Keep the names of the repos that were starred more than k times during
# this time window (same order as repo2count).
k = 10
popular_repos = [name for name, star_count in repo2count.items() if star_count > k]

len(popular_repos)

1032

In [222]:
# For each popular repo, tally the reactions left on its issue comments:
# one counter per reaction type, plus the total number of comments and the
# number of comments that received no reaction at all.
# NOTE: reactions are fetched separately for every comment, so over all
# popular repos this exceeds the GitHub API rate limit.

# GitHub client authenticated with a personal access token. The token is read
# from the environment so that it is never stored in the notebook.
# NOTE(review): the token that used to be hard-coded in this cell must be
# treated as leaked and revoked.
github_token = os.environ.get('GITHUB_TOKEN')
if not github_token:
    raise RuntimeError('Set the GITHUB_TOKEN environment variable to a GitHub personal access token')
g = Github(github_token)

# maps repo name --> dict of counts (see the keys initialised below)
repo2info = {}
my_date = datetime.datetime(year=2018, month=7, day=1)  # starting time from which to collect data

for repo_name in popular_repos:

    # get the repository
    repo = g.get_repo(repo_name)

    # get the issue comments for this repository
    issue_comments = repo.get_issues_comments(since=my_date)

    # Initialise the count dictionary for this repo. 'no_reaction' is never
    # incremented; it is kept only so the set of keys stays unchanged. The
    # meaningful counter is 'num_comments_with_no_reaction'.
    info = {'+1': 0, '-1': 0, 'laugh': 0, 'confused': 0, 'heart': 0, 'hooray': 0,
            'no_reaction': 0, 'num_comments_with_no_reaction': 0, 'num_comments': 0}
    repo2info[repo_name] = info

    for comment in issue_comments:

        # get the reactions for this comment
        reactions = comment.get_reactions()

        # Count each reaction. .get() keeps the loop from crashing with a
        # KeyError on reaction types that are not pre-seeded above.
        n_reactions = 0
        for r in reactions:
            info[r.content] = info.get(r.content, 0) + 1
            n_reactions += 1

        # Count comments without any reaction, using our own tally rather
        # than the paginated list's private internals.
        if n_reactions == 0:
            info['num_comments_with_no_reaction'] += 1

        # increment number of comments
        info['num_comments'] += 1

            

In [9]:
# compare avg sentiment vs. n_stars

# Load the precomputed sentiment data.
# presumably repo name --> {sentiment score: number of comments}; verify
# against the code that produced the pickle.
# NOTE: pickle.load can execute arbitrary code, so only load files that you
# produced yourself.
# The `with` block closes the file once loading is done.
with open('/Users/amy/git/MetaGithub/data/repo2sentiment.7.16.2018.pickle', 'rb') as my_file:
    repo2sentiment = pickle.load(my_file)

In [18]:
# Build the data frame: one row per repo in repo2sentiment, holding its star
# count, its number of commit comments and the count-weighted mean of the
# sentiment keys.
names = []
n_stars = []
avg_sent = []
n_comments = []
for repo_name in repo2sentiment.keys():

    names.append(repo_name)
    # repos without any star event in repo2count get 0
    n_stars.append(repo2count[repo_name] if repo_name in repo2count else 0)

    # count maps a sentiment key (convertible with int()) to an occurrence count
    count = repo2sentiment[repo_name]
    total_sum = sum(count[key] * int(key) for key in count)
    n = sum(count[key] for key in count)

    # the mean is undefined when there are no comments at all
    avg_sent.append(total_sum / n if n != 0 else None)
    n_comments.append(n)

df = pd.DataFrame({
    'name': names,
    'n_stars': n_stars,
    'n_commit_comments': n_comments,
    'avg_commit_comment_sentiment': avg_sent,
})

In [20]:
# Write the table to disk first, then preview it: only the last expression of
# a cell is displayed, so df.head() has to come last to show anything.
# NOTE(review): the file name says 7.16.17 while the other data files use
# 7.16.2018. This looks like a typo; the path is left unchanged so that
# anything already reading it keeps working.
df.to_csv('/Users/amy/git/MetaGithub/data/df.7.16.17.csv')
df.head()