In [1]:
import json
import gzip
import os
from collections import Counter
import datetime
import pickle
import sys
import pandas as pd
from pycorenlp import StanfordCoreNLP

In [2]:
## read JSON data
# change the file path accordingly
# test data: file from wget http://data.gharchive.org/2018-07-16-15.json.gz
# i.e. archive for 24 hours on 7/16/18

######### CHANGE ME #############
# specify date and hours for the archive data; assumes that the data are downloaded in the given path
# also assumes the data is from july, 2018
my_path = '/Users/amy/git/MetaGithub/data/' 
date_start = 16 # inclusive
date_end = 17 # exclusive
hour_start = 15 # inclusive
hour_end = 16 # exclusive
#################################

events = []

# for every date
for date in range(date_start, date_end):
    
    # format date string so that 1 is encoded as '01', etc.
    my_date = str(date) if date > 9 else '0'+ str(date)
    
    # for every hour
    for hour in range(hour_start, hour_end):
        
        with gzip.open('%s2018-07-%s-%d.json.gz' %(my_path, my_date, hour)) as f:
            for line in f:
                events.append(json.loads(line.strip()))

In [3]:
## get all commit comment events
comment_events = [event for event in events if event['type'] == 'CommitCommentEvent']

# aggregate comments for each repo
# maps repo_name --> list of comments made during this time 
repo2comments = {}

for event in comment_events:
    
    repo_name = event['repo']['name']
    comment = event['payload']['comment']['body']
    
    if repo_name not in repo2comments:
        repo2comments[repo_name] = [comment]
    else:
        repo2comments.get(repo_name).append(comment)
    

In [4]:
## initialize stanford NLP server
# make sure you have downloaded the stanford nlp package. Try: pip install pycorenlp
# before executing this cell, go to your_path/stanford-corenlp-full-2018-02-27/ and run
# java -mx5g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -timeout 10000

nlp = StanfordCoreNLP('http://localhost:9000')


# toy example for using stanford NLP
res = nlp.annotate("I love you. I hate him. You are nice. He is dumb",
                   properties={
                       'annotators': 'sentiment',
                       'outputFormat': 'json',
                       'timeout': 10000,
                   })
for s in res["sentences"]:
    print("%d: '%s': %s %s" % (
        s["index"],
        " ".join([t["word"] for t in s["tokens"]]),
        s["sentimentValue"], s["sentiment"]))

0: 'I love you .': 3 Positive
1: 'I hate him .': 1 Negative
2: 'You are nice .': 3 Positive
3: 'He is dumb': 1 Negative


In [None]:
## compute the mean sentiment value for all comments for each repo
# this is pretty slow
# sentiment value range: 0 (very negative), 1 (negative), 2 (neutral), 3 (positive), 4 (very positive)
# maps repo_name --> average sentiment
repo2sentiment = {}

i = 0 # count number of repos that have been processed

# for every repo
for repo_name in repo2comments.keys():

    # track progress
    if i % 50 == 0:
        print(i)
    
    # init count dictionary
    count = {'0':0, '1':0, '2':0, '3':0, '4':0}

    # for every comment in the repository
    for comment in repo2comments[repo_name]:
        
        # run sentiment analysis
        res = nlp.annotate(comment, properties={'annotators': 'sentiment', 'outputFormat': 'json', 'timeout': 10000})

        # skip if timeout has occurred
        if res == 'CoreNLP request timed out. Your document may be too long.':
            continue
            
        # add up sentiment values
        for s in res["sentences"]:
            count[s["sentimentValue"]] += 1

        
    # set dicitonary
    repo2sentiment[repo_name] = count # missing value
    
    i += 1



0
50


In [6]:
# make data frame with the following columns: repo_name, n_commit_comments, avg_sentiment
names = []
avg_sent = []
n_comments = []
for repo_name in repo2sentiment.keys():
    
    names.append(repo_name)
    
    count = repo2sentiment[repo_name]
    total_sum = 0
    n = 0
    for key in count:
        total_sum += (count[key] * int(key))
        n += count[key]
    if n != 0:
        avg_sent.append(total_sum / n)
    else:
        avg_sent.append(None)
    n_comments.append(n)
    
df = pd.DataFrame({'name': names, 'n_commit_comments': n_comments, 'avg_commit_comment_sentiment':avg_sent})
df.head()

Unnamed: 0,avg_commit_comment_sentiment,n_commit_comments,name
0,1.142857,7,armbian/build
1,1.4,10,Soulofdestiny/os-autoinst-distri-opensuse
2,1.0,2,ilonazakharova/java_course
3,1.333333,9,RAIL-UGA/rail-uga-stellar
4,3.0,1,rathena/rathena


In [58]:
# save dataframe as csv file
out_path = '/Users/amy/git/MetaGithub/sentiment.2018.7.16.15.csv'
df.to_csv(out_path)