# Exploratory Data Analysis - GitHub repo's issues

In [418]:
import json
import numpy as np
import pandas as pd
import datetime


import warnings
warnings.filterwarnings('ignore')

# # Suppress pandas's warning
# warnings.simplefilter(action='ignore', category=Warning)

## https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas
# pd.options.mode.chained_assignment = None  # default='warn'


## Data requirements

Below is a list of interested data that we will use in the initial analysis
* How long the issues open
* First comment response time
* Number of comments
* Number of participants
* Labels associated with each issue
* Milestone

Later will be done in the later part - Find data for each year where
* Top words in question
* Top words in answers
* Top participants + Type and company

## Cleaning data - High-level data

* Extract only interested columns which are
[
'issue_id',
'title',
'contents',
'authorLogin',
'authorAssociation',
'createdAt',
'closedAt',
'closed',
'comments_count',
'comments_data',
'participants_count',
'labels',
'milestone'
]

* Cast datetime columns to ['createdAt', 'closedAt'] datetime 
* Create a new column 'closedDuration_days' to store a timediff between the columns ['closedAt', 'createdAt'] in days
* Additional columns for comments data
  * Create a new column 'firstCommentCreatedAt' to store a datetime object the first comment is created
  * Create a new column 'firstCommentDuration_days' to store a timediff between the columns 'createdAt' and the creation time of the first comment in days
  * New columns for the first comment author info ['firstCommentAuthor', 'firstCommentAuthorAssociation'] 
  * New columns for total number of unique comment authors based on authorAssociation ['num_unique_comment_author_MEMBER', 'num_unique_comment_author_CONTRIBUTOR', 'num_unique_comment_author_COLLABORATOR', 'num_unique_comment_author_NONE']      
* Extract 'milestone' title and replace the 'milestone' column with this value
* Perform one-hot encoding to the label column

In [404]:
def get_comment_info_columns(row):
    '''
    Function to extract data from a dataframe's row that will be used in the function 'create_highlevel_df'
    
    Input argument:
        row: A row of data frame containing github issue info
        
    Output argument:
        a list containing the following information
            [
                'firstCommentCreatedAt', 
                'firstCommentAuthor', 
                'firstCommentAuthorAssociation',
                'num_unique_comment_author_MEMBER',
                'num_unique_comment_author_CONTRIBUTOR',
                'num_unique_comment_author_COLLABORATOR',
                'num_unique_comment_author_NONE'
            ]
        
    '''
    
    # If the comments_count is greater than 0
    if row['comments_count'] > 0:
        
        # --- Extract data for 'firstCommentCreatedAt', 'firstCommentAuthor', 'firstCommentAuthorAssociation'
        
        out = [
            row['comments_data'][0]['createdAt'],
            row['comments_data'][0]['authorLogin'], 
            row['comments_data'][0]['authorAssociation']
        ]
        
        # --- Extract data for 'num_unique_comment_*'
        # We need to extract data from the 'comments_data' column
        
        # Get a list of all comment authors first
        list_comment_authors = []
        for cur_comment in row['comments_data']:
            cur_author = {
                'login': cur_comment['authorLogin'],
                'association': cur_comment['authorAssociation']
            }
            list_comment_authors.append(cur_author)
            
        unique_comment_authors = [dict(y) for y in set(tuple(x.items()) for x in list_comment_authors)]
        
        # Then, get count for each type
        dict_author_assoc = {
            'MEMBER': 0,
            'CONTRIBUTOR': 0,
            'COLLABORATOR': 0,
            'NONE': 0
        }
        
        for cur_comment_author in unique_comment_authors:
            cur_assoc = cur_comment_author['association']
            dict_author_assoc[cur_assoc] += 1
            
        # Append data to the output list
        out = out + [
            dict_author_assoc['MEMBER'],
            dict_author_assoc['CONTRIBUTOR'],
            dict_author_assoc['COLLABORATOR'],
            dict_author_assoc['NONE']
        ]
    else:
        # Otherwise, output a default output list
        out = [None, None, None, 0, 0, 0, 0]

        
    return out

In [405]:
def create_highlevel_df(json_path):
    '''
    Extract github issues data from the specified path and output an extracted summary data as a new dataframe
    '''
    
    # Load json data and create dataframe
    with open(json_path) as json_file:  
        data_raw = json.load(json_file)

    df_raw = pd.DataFrame.from_dict(data_raw)
    
    # Let's get only columns that we need for the analysis now
    new_cols = [
        'issue_id',
        'title',
        'contents',
        'authorLogin',
        'authorAssociation',
        'createdAt',
        'closedAt',
        'closed',
        'comments_count',
        'participants_count',
        'comments_data',
        'labels',
        'milestone'
     ]
    
    df_out = df_raw[new_cols]
    
    # --- Cast columns to datetime
    df_out[['createdAt', 'closedAt']] = df_out[['createdAt', 'closedAt']].apply(pd.to_datetime)
    
    # --- Find closedDuration_days
    get_diff_days = lambda s: (s.dt.total_seconds() / (24 * 60 * 60)).round(2)
#     df_out['closedDuration_days'] = ((df_out['closedAt'] - df_out['createdAt']).dt.total_seconds() / (24 * 60 * 60)).round(2)
    df_out['closedDuration_days'] = get_diff_days(df_out['closedAt'] - df_out['createdAt'])

    # --- Find comment info
    cols_first_comments = [
        'firstCommentCreatedAt', 
        'firstCommentAuthor', 
        'firstCommentAuthorAssociation',
        'num_unique_comment_author_MEMBER',
        'num_unique_comment_author_CONTRIBUTOR',
        'num_unique_comment_author_COLLABORATOR',
        'num_unique_comment_author_NONE'
    ]
    df_out[cols_first_comments] = pd.DataFrame(
                    df_out.apply(lambda row: get_comment_info_columns(row), axis=1).values.tolist(), 
                     index= df_out.index)

    df_out['firstCommentCreatedAt'] = pd.to_datetime(df_out['firstCommentCreatedAt'])
    df_out['firstCommentDuration_days'] = get_diff_days(df_out['firstCommentCreatedAt'] - df_out['createdAt'])
    
    # --- Extract 'milestone' title and replace the 'milestone' colummn with this value
    df_out['milestone'] = df_out['milestone'].apply(lambda x: x['title'] if x else '')
    
    # --- Perform one hot encoding for the labels columns
    
    # Get a list of labels
    list_labels = []
    for index, row in df_out.iterrows():
        list_labels = list_labels + row['labels']

    list_labels = list(set(list_labels))
    
    # Then, create new columns for each of those labels
    
    # Create a dictionary to map those labels with a boolean value for each record in the dataset
    dict_labels = dict((el,[]) for el in list_labels)
    for index, row in df_out.iterrows():
        for cur_label in list_labels:
            cur_bool = False
            if cur_label in row['labels']:
                cur_bool = True

            dict_labels[cur_label].append(cur_bool)
    
    # Then, create a dataframe for it and merge the newly created dataframe with the current output dataframe
    df_out = pd.concat([df_out, pd.DataFrame.from_dict(dict_labels)], axis=1)
    
    # Finally, prepend those newly added label columns with 'Label_'
    dict_rename = {}
    for cur_label in list_labels:
        dict_rename[cur_label] = 'label_' + cur_label
        
    df_out = df_out.rename(columns=dict_rename)

    
    # --- Reorder columns
    final_cols = [
        'issue_id',
        'title',
        'contents',
        'authorLogin',
        'authorAssociation',
        'createdAt',
        'closed',
        'closedAt',        
        'closedDuration_days',
        'milestone',
        'participants_count',
        'comments_count',
        'firstCommentCreatedAt',
        'firstCommentDuration_days',
        'firstCommentAuthor',
        'firstCommentAuthorAssociation',
        'num_unique_comment_author_MEMBER',
        'num_unique_comment_author_CONTRIBUTOR',
        'num_unique_comment_author_COLLABORATOR',
        'num_unique_comment_author_NONE',
        'labels'
    ]
    
    # Then, append the one hot encoding's label columns
    final_cols = final_cols + ['label_' + cur_label for cur_label in list_labels]
    df_out = df_out[final_cols]
    
    return df_out



In [417]:
list_libs = [
    'qunit',
    'mocha',
    'jest',
    'jasmine',
    'funcunit',
    'puppeteer',
    'cypress'
]

for cur_lib in list_libs:
    print('***** [{}] START repo#{}" *****'.format(str(datetime.datetime.now()), cur_lib))
        
    json_path = '../data/github_repo_issues_{}.json'.format(cur_lib)
    df_cur = create_highlevel_df(json_path) 
    
    print('|-- Total records: ', df_cur.shape[0])
    
    # --- Save data to csv
    csv_filename = 'temp/repo_issue_summary_{}.csv'.format(cur_lib)
    print('|-- Save data to "{}"...'.format(csv_filename))
    df_cur.to_csv(csv_filename, index=False)
    
    # --- Save data to xlsx since the csv data is not read properly in Tableau due to a usage of ',' in the contents
    xlsx_filename = "temp/repo_issue_summary_{}.xlsx".format(cur_lib)
    print('|-- Save data to "{}"...'.format(xlsx_filename))
    
    writer = pd.ExcelWriter(xlsx_filename,
                engine='xlsxwriter',
                options={'remove_timezone': True})

    df_cur.to_excel(writer, index=False, sheet_name='Sheet1')

    workbook  = writer.book
    worksheet = writer.sheets['Sheet1']

    writer.save()

    

***** [2019-07-06 13:37:14.132598] START repo#qunit" *****
|-- Total records:  708
|-- Save data to "temp/repo_issue_summary_qunit.csv"...
|-- Save data to "temp/repo_issue_summary_qunit.xlsx"...


Ajax%20urls%20are%20wrapped%20in%20the%20url%20function%20in%20unit/ajax.js.%20%20Currently,%20instead%20of%20constructing%20the%20correct%20url,%20many%20of%20them%20end%20up%20as%20get%20params%20with%20each%20letter%20of%20the%20string%20a%20different%20param.%20%20You'll%20see%20in%20the%20second%20photo%20%221=d&2=a&3=b....%20because%20it's%20doing%20a%20for-each%20for%20the%20string.%20%20Perhaps%20something%20got%20broken%20during%20the%20recent%20?filter=*%20url%20changes.' with link or location/anchor > 255 characters since it exceeds Excel's limit for URLS
  force_unicode(url))
Since%20we%20verify%20inherited%20properties%20anyway,%20we%20should%20replace%20the%20weird%20constructor-based%20%22function%22%20callback%20with%20a%20simple%20return%20false%20(relying%20solely%20on%20the%20strict%20equality%20check%20in%20innerEquiv).' with link or location/anchor > 255 characters since it exceeds Excel's limit for URLS
  force_unicode(url))
  force_unicode(url))


***** [2019-07-06 13:37:15.272010] START repo#mocha" *****
|-- Total records:  2356
|-- Save data to "temp/repo_issue_summary_mocha.csv"...
|-- Save data to "temp/repo_issue_summary_mocha.xlsx"...



This%20works%20in%20Safari%205.1.3%20(OS%20X%2010.7.3)
It%20fails%20in%20Chrome%2017.0.9%20(OS%20X%2010.7.3)
Looks%20ugly%20in%20Firefox,%20but%20if%20I%20recall%20reading,%20Mocha%20only%20supports%20Webkit%20browsers.

Any%20thoughts%20on%20the%20Chrome/Safari%20discrepancy?' with link or location/anchor > 255 characters since it exceeds Excel's limit for URLS
  force_unicode(url))
  force_unicode(url))
work%20at%203.1.0
/Users/tz/Workspaces/eggjs/aliyun-egg/node_modules/.3.1.1@mocha/lib/runnable.js:52
%20%20this.async%20=%20fn%20&&%20fn.length;
%20%20%20%20%20%20%20%20%20%20%20%20%20%5e

TypeError:%20Cannot%20set%20property%20async%20of%20' with link or location/anchor > 255 characters since it exceeds Excel's limit for URLS
  force_unicode(url))


***** [2019-07-06 13:37:20.002149] START repo#jest" *****
|-- Total records:  4530
|-- Save data to "temp/repo_issue_summary_jest.csv"...
|-- Save data to "temp/repo_issue_summary_jest.xlsx"...


  force_unicode(url))
the%20error%20returned%20from%20jest%20is

Error:%20Invariant%20Violation:%20The%20style-loader%20must%20be%20configured%20with%20reference-counted%20API.
at%20invariant%20(../node_modules/fbjs/lib/invariant.js:39:15)
at%20WithStyles.componentWillMount%20(decorators/withStyles.js:86:47)

This%20is%20a%20build%20based%20on%20react-starter-kit.' with link or location/anchor > 255 characters since it exceeds Excel's limit for URLS
  force_unicode(url))
jest-bug-1
The%20setup:

We%20have%20two%20%22libraries%22%20called%20'a'%20and%20'b'.%20Each%20imports%20a%20corresponding%20%22helper%22.
Then%20we%20run%20jest
test.js%20will%20mock%20'a'%20and%20unmock%20'b'.%20It%20will%20later%20use%20'b',%20expecting%20to%20see%20the%20unmocked%20version%20of%20b's%20helper.

What%20happens

Mocking%20'a'%20will%20store%20'./helper'%20in%20Jest's%20internal%20shouldMockModuleCache.
when%20we%20later%20come%20to%20use%20'b',%20Jest%20will%20discover%20that%20'./helper'%20is%20alr

***** [2019-07-06 13:37:28.377727] START repo#jasmine" *****
|-- Total records:  1198
|-- Save data to "temp/repo_issue_summary_jasmine.csv"...
|-- Save data to "temp/repo_issue_summary_jasmine.xlsx"...


If%20using%20Jasmine%201.3.1,%20I%20use
describe(%22TryTry%22,%20function()%20%7b

%20%20%20%20var%20i;

%20%20%20%20function%20checkForSituation(a)%20%7b
%20%20%20%20%20%20%20%20//%20say,%20if%20this%20is%20made%20into%20a%20function%20because%20
%20%20%20%20%20%20%20%20//%20%20%20there%20are%20a%20lot%20of%20processing

%20%20%20%20%20%20%20%20console.log(%22THERE%22,%20a);%20
%20%20%20%20%20%20%20%20expect(foo(3,%20a)).toEqual(%203%20+%20a%20);
%20%20%20%20%7d

%20%20%20%20for%20(i%20=%200;%20i%20%3c%205;%20i++)%20%7b
%20%20%20%20%20%20%20%20console.log(%22HERE%22,%20i);%20

%20%20%20%20%20%20%20%20it(%22should%20add%20for%20%22%20+%20i,%20function()%20%7b

%20%20%20%20%20%20%20%20%20%20%20%20checkForSituation(i);

%20%20%20%20%20%20%20%20%7d);

%20%20%20%20%7d

%7d);

and%20foo%20is%20just:
function%20foo(a,%20b)%20%7b
%20%20%20%20return%20a%20+%20b;
%7d

I%20would%20expect%20it%20to%20check%20for%200%20to%204,%20and%20print%20out
HERE%200
THERE%200
HERE%201
THERE%201
%20%20...

bu

***** [2019-07-06 13:37:30.018581] START repo#funcunit" *****
|-- Total records:  154
|-- Save data to "temp/repo_issue_summary_funcunit.csv"...
|-- Save data to "temp/repo_issue_summary_funcunit.xlsx"...
***** [2019-07-06 13:37:30.258316] START repo#puppeteer" *****
|-- Total records:  3067
|-- Save data to "temp/repo_issue_summary_puppeteer.csv"...
|-- Save data to "temp/repo_issue_summary_puppeteer.xlsx"...


  force_unicode(url))
Please%20can%20we%20have%20a%20version%20of%20puppeteer%20which%20uses%20the%20new%20Chromium?%20This%20will%20hopefully%20solve%20so%20many%20headaches%20with%20running%20puppeteer%20inside%20Docker%20containers.' with link or location/anchor > 255 characters since it exceeds Excel's limit for URLS
  force_unicode(url))
However,%20when%20I%20attempt%20to%20call%20select()%20with%20Puppeteer%20on%20an%20element%20with%20no%20value%20attribute,%20it%20is%20unable%20to%20select%20it%20based%20upon%20the%20value%20contained%20within%20the%20text%20node.
Steps%20to%20reproduce

Puppeteer%20version:%20puppeteer@1.14.0
Platform%20/%20OS%20version:%20Arch%20Linux
URLs%20(if%20applicable):%20n/app%20(internal/confidential%20URLs)
Node.js%20version:%20v11.13.0

What%20steps%20will%20reproduce%20the%20problem?
HTML
%3cselect%20id=%22example%22%3e
%20%20%3coption%3eApples%3c/option%3e
%20%20%3coption%3eBananas%3c/option%3e
%20%20%3coption%3eCoconuts%3c/option%3e
%3c/select%3

***** [2019-07-06 13:37:34.584714] START repo#cypress" *****
|-- Total records:  3541
|-- Save data to "temp/repo_issue_summary_cypress.csv"...
|-- Save data to "temp/repo_issue_summary_cypress.xlsx"...


In [410]:
df_cur.head()

Unnamed: 0,issue_id,title,contents,authorLogin,authorAssociation,createdAt,closed,closedAt,closedDuration_days,milestone,...,label_topic: cy.wait() timeout,label_OS: FreeBSD,label_external: dashboard,label_pkg/driver,label_topic: plugins ⚙️,label_topic: visibility 👁,label_topic: drag-and-drop,label_external: documentation,label_topic: actionability,label_topic: aw snap :slightly_frowning_face:
0,1,cy.contains() breaks with single quotes,When passing in a string containing a single q...,jennifer-shehane,MEMBER,2015-03-27 15:48:51+00:00,True,2015-03-27 23:20:21+00:00,0.31,,...,False,False,False,False,False,False,False,False,False,False
1,2,Integration with Travis CI,Would love support and documentation for autom...,lorennorman,CONTRIBUTOR,2015-03-27 16:42:26+00:00,True,2015-09-14 04:53:51+00:00,170.51,,...,False,False,False,False,False,False,False,False,False,False
2,3,cy.visit() will insert a trailing slash which ...,Example:\nhttp://localhost:5000/#/map?/,brandon-beacher,NONE,2015-03-27 16:44:19+00:00,True,2015-03-27 23:21:03+00:00,0.28,,...,False,False,False,False,False,False,False,False,False,False
3,4,When doing 2 requests to the same cy.route ali...,"cy.route(/accounts/, {}).as(""accountsGet"")\n\n...",jennifer-shehane,MEMBER,2015-03-27 17:31:34+00:00,True,2015-04-27 04:13:20+00:00,30.45,,...,False,False,False,False,False,False,False,False,False,False
4,5,allow for aliasing of things beside DOM elements,I'd like to be able to alias things other than...,jennifer-shehane,MEMBER,2015-03-27 19:06:23+00:00,True,2015-04-24 16:16:35+00:00,27.88,,...,False,False,False,False,False,False,False,False,False,False
