# Retrieving Detailed Commit Info
This notebook explores `pydriller` and the information that it mines from GitHub repositories, particularly when it comes to commits.

# Imports

In [7]:
from networkxgmml import XGMMLReader
import pandas as pd
from pydriller import RepositoryMining

# Load the graph

In [3]:
graph_filename = "data/12_02_graph.xgmml"

with open(graph_filename, "rb") as graph_file:
     G = XGMMLReader(graph_file)

# Create a dataframe of just the commits

In [6]:
commits_df=pd.concat([pd.DataFrame([[u]+list(G.node[u].values())], columns=["commit id"]+list(G.node[u].keys())) for u in G.nodes() if "commits" in u], 
                     ignore_index=True).set_index("commit id")
commits_df.head()

Unnamed: 0_level_0,label,name,owner,repo_id,url,additions,author_key,closed_date,created_date,deletions,...,operation,body,published_date,tag,username,comment_key,content,date,hash,message
commit id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
commits/698ac82a-63f8-5c16-80ec-f533f9458123,Default-Label,master,kelektiv,d8433c8d-b38d-4b30-b4f6-6a3eb0b65f93,https://github.com/kelektiv/node-cron,1,,2019-11-22T02:19:17+00:00,2016-12-31T22:40:41+00:00,1,...,MODIFY,@ncb000gt removed in commit `98dad71`,2017-02-19T17:51:10+00:00,v1.0.2,woyuen,,1,2010-04-13T05:32:04+00:00,0b3e87179cd862a8b2432ccce65be80dd52a2aeb,Init. Not working. Signed-off-by: Nick Campbe...
commits/02b0d7f6-17dc-5592-8b8b-43161a93185a,Default-Label,master,kelektiv,d8433c8d-b38d-4b30-b4f6-6a3eb0b65f93,https://github.com/kelektiv/node-cron,1,,2019-11-22T02:19:17+00:00,2016-12-31T22:40:41+00:00,1,...,MODIFY,@ncb000gt removed in commit `98dad71`,2017-02-19T17:51:10+00:00,v1.0.2,woyuen,,1,2010-04-15T05:17:39+00:00,fff5a24c47df08bd0edf47a26c2ca4ee18a14594,Added functional bcrypt algo with node hooks. ...
commits/7c0f6f9f-988c-50a6-b390-77382f6ce375,Default-Label,master,kelektiv,d8433c8d-b38d-4b30-b4f6-6a3eb0b65f93,https://github.com/kelektiv/node-cron,1,,2019-11-22T02:19:17+00:00,2016-12-31T22:40:41+00:00,1,...,MODIFY,@ncb000gt removed in commit `98dad71`,2017-02-19T17:51:10+00:00,v1.0.2,woyuen,,1,2010-04-15T05:18:16+00:00,fcc1760e6b79a6369200fb37685122e503f5298a,"Removed bCrypt.js, not using it anymore. Moved..."
commits/63a3e10b-d557-54a9-bbea-4c0529776ca9,Default-Label,master,kelektiv,d8433c8d-b38d-4b30-b4f6-6a3eb0b65f93,https://github.com/kelektiv/node-cron,1,,2019-11-22T02:19:17+00:00,2016-12-31T22:40:41+00:00,1,...,MODIFY,@ncb000gt removed in commit `98dad71`,2017-02-19T17:51:10+00:00,v1.0.2,woyuen,,1,2010-04-15T05:22:27+00:00,bc9125a0b93a157cdb24a01139e0a42c76a72b86,"Line breaks, and license split out. Signed-of..."
commits/040dbf01-e8cf-5782-b62d-b06bd7ceb4c0,Default-Label,master,kelektiv,d8433c8d-b38d-4b30-b4f6-6a3eb0b65f93,https://github.com/kelektiv/node-cron,1,,2019-11-22T02:19:17+00:00,2016-12-31T22:40:41+00:00,1,...,MODIFY,@ncb000gt removed in commit `98dad71`,2017-02-19T17:51:10+00:00,v1.0.2,woyuen,,1,2010-04-15T05:25:26+00:00,a58323d73ce39066be3f6e4b25298ab78082c8e0,Links in the markdown Signed-off-by: Nick Cam...


In [72]:
commits_df.groupby("url").size()

url
https://github.com/kelektiv/node-cron    789
dtype: int64

In [67]:
len(commits_df)

789

In [68]:
len(commits)

411

# Mine the commits
These will be saved into a dictionary that maps the commit hash to the mined Commit object

In [8]:
commits = {commit.hash: commit for commit in RepositoryMining('https://github.com/kelektiv/node-cron').traverse_commits()}

In [9]:
commits_df.iloc[0].hash

'0b3e87179cd862a8b2432ccce65be80dd52a2aeb'

In [66]:
commits[commits_df.iloc[0].hash]

KeyError: '0b3e87179cd862a8b2432ccce65be80dd52a2aeb'

##### Ignore this for now, let's get a commit hash that exists in both datasets

In [23]:
shared_hash = list(set(commits.keys()) & set(commits_df["hash"]))[0]
shared_hash

'cb2a771ed5c2828e15f40ab32d59a163ee249e22'

In [24]:
commits[shared_hash]

<pydriller.domain.commit.Commit at 0x7f71ad287048>

Documentation for [Commit](https://pydriller.readthedocs.io/en/latest/reference.html#pydriller.domain.commit.Commit)

In [27]:
[attr for attr in dir(commits[shared_hash]) if not attr.startswith("_")]

['author',
 'author_date',
 'author_timezone',
 'branches',
 'committer',
 'committer_date',
 'committer_timezone',
 'hash',
 'in_main_branch',
 'merge',
 'modifications',
 'msg',
 'parents',
 'project_name',
 'project_path']

In [36]:
commits[shared_hash].hash

'cb2a771ed5c2828e15f40ab32d59a163ee249e22'

In [29]:
commits[shared_hash].author

<pydriller.domain.developer.Developer at 0x7f71ad18f9b0>

In [32]:
commits[shared_hash].msg

'added time function'

# Mine the Modifications
Each commit may have a list of Modifications, which detail the files changed within the commit

In [30]:
modifications = {commit.hash: commit.modifications for commit in RepositoryMining('https://github.com/kelektiv/node-cron').traverse_commits()}

In [31]:
modifications[shared_hash]

[<pydriller.domain.commit.Modification at 0x7f71acf85ef0>,
 <pydriller.domain.commit.Modification at 0x7f71acf85e48>]

Documentation for [Modification](https://pydriller.readthedocs.io/en/latest/modifications.html)

In [34]:
[attr for attr in dir(modifications[shared_hash][0]) if not attr.startswith("_")]

['added',
 'change_type',
 'complexity',
 'diff',
 'filename',
 'methods',
 'new_path',
 'nloc',
 'old_path',
 'removed',
 'source_code',
 'source_code_before',
 'token_count']

In [37]:
[modification.filename for modification in modifications[shared_hash]]

['cron.js', 'package.json']

In [38]:
[modification.change_type for modification in modifications[shared_hash]]

[<ModificationType.MODIFY: 5>, <ModificationType.MODIFY: 5>]

In [39]:
[modification.complexity for modification in modifications[shared_hash]]

[43, 0]

In [64]:
print(modifications[shared_hash][0].diff)

@@ -312,14 +312,19 @@ exports.job = function(cronTime, onComplete)
   return new CronJob(cronTime, onComplete);
 }
 
+exports.time = function(cronTime)
+{
+    return new CronTime(cronTime);
+}
+
 exports.sendAt = function(cronTime)
 {
-    return new CronTime(cronTime).sendAt();
+    return exports.time(cronTime).sendAt();
 }
 
 exports.timeout = function(cronTime)
 {
-    return new CronTime(cronTime).timeout();
+    return exports.time(cronTime).timeout();
 }
 
 



In [42]:
[modification.nloc for modification in modifications[shared_hash]]

[183, 37]

In [43]:
[modification.added for modification in modifications[shared_hash]]

[7, 1]

In [44]:
[modification.removed for modification in modifications[shared_hash]]

[2, 1]

In [45]:
print(modifications[shared_hash][0].source_code)

/**
 * cron.js
 * ---
 * VERSION 0.1
 * ---
 * @author James Padolsey
 * ---
 * Dual licensed under the MIT and GPL licenses.
 *    - http://www.opensource.org/licenses/mit-license.php
 *    - http://www.gnu.org/copyleft/gpl.html
 */


function CronTime(time) {

  this.source = time;

  this.second     = {};
  this.minute     = {};
  this.hour       = {};
  this.dayOfWeek  = {};
  this.dayOfMonth = {};
  this.month      = {};

  this._parse();

};

CronTime.map = ['second', 'minute', 'hour', 'dayOfMonth', 'month', 'dayOfWeek'];
CronTime.constraints = [ [0, 59], [0, 59], [0, 23], [1, 31], [0, 11], [1, 7] ];
CronTime.aliases = {
    jan:0, feb:1, mar:2, apr:3, may:4, jun:5, jul:6, aug:7, sep:8, oct:9, nov:10, dec:11,
    sun:1, mon:2, tue:3, wed:4, thu:5, fri:6, sat:7
};


CronTime.prototype = {

  /**
   * calculates the next send time
   */

  sendAt: function(start) {

    var date = start ? start : new Date();

    //add 1 second so next time isn't now (can cause timeout to be 0)
   

In [46]:
modifications[shared_hash][0].methods

[<pydriller.domain.commit.Method at 0x7f71acf980f0>,
 <pydriller.domain.commit.Method at 0x7f71acf98b70>,
 <pydriller.domain.commit.Method at 0x7f71acf98ef0>,
 <pydriller.domain.commit.Method at 0x7f71acf98160>,
 <pydriller.domain.commit.Method at 0x7f71acf98d68>,
 <pydriller.domain.commit.Method at 0x7f71acf98438>,
 <pydriller.domain.commit.Method at 0x7f71acf98208>,
 <pydriller.domain.commit.Method at 0x7f71acf98a90>,
 <pydriller.domain.commit.Method at 0x7f71acf98f98>,
 <pydriller.domain.commit.Method at 0x7f71acf98860>,
 <pydriller.domain.commit.Method at 0x7f71ad73abe0>,
 <pydriller.domain.commit.Method at 0x7f71ad73a438>,
 <pydriller.domain.commit.Method at 0x7f71ad73aa90>,
 <pydriller.domain.commit.Method at 0x7f71ad73a940>,
 <pydriller.domain.commit.Method at 0x7f71ad73acc0>,
 <pydriller.domain.commit.Method at 0x7f71ad73aeb8>,
 <pydriller.domain.commit.Method at 0x7f71ad73aa20>,
 <pydriller.domain.commit.Method at 0x7f71ad73ae10>,
 <pydriller.domain.commit.Method at 0x7f71ad73

In [47]:
[attr for attr in dir(modifications[shared_hash][0].methods[0]) if not attr.startswith("_")]

['complexity',
 'end_line',
 'fan_in',
 'fan_out',
 'filename',
 'general_fan_out',
 'length',
 'long_name',
 'name',
 'nloc',
 'parameters',
 'start_line',
 'token_count',
 'top_nesting_level']

In [48]:
[method.complexity for method in modifications[shared_hash][0].methods]

[1, 9, 1, 1, 1, 3, 3, 2, 3, 4, 2, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1]

In [65]:
[(method.filename, method.long_name, method.name, method.parameters) for method in modifications[shared_hash][0].methods]

[('cron.js', 'CronTime ( time )', 'CronTime', ['time']),
 ('cron.js', 'sendAt ( start )', 'sendAt', ['start']),
 ('cron.js', 'timeout ( )', 'timeout', []),
 ('cron.js', 'toString ( )', 'toString', []),
 ('cron.js', 'toJSON ( )', 'toJSON', []),
 ('cron.js', "'_wcOrAll' ( type )", "'_wcOrAll'", ['type']),
 ('cron.js', "'_hasAll' ( type )", "'_hasAll'", ['type']),
 ('cron.js', 'function ( alias )', 'function', ['alias']),
 ('cron.js', '_parse ( )', '_parse', []),
 ('cron.js',
  'function ( $0 , lower , upper , step )',
  'function',
  ['$0', 'lower', 'upper', 'step']),
 ('cron.js',
  '_parseField ( field , type , constraints )',
  '_parseField',
  ['field', 'type', 'constraints']),
 ('cron.js',
  'CronJob ( cronTime , onComplete )',
  'CronJob',
  ['cronTime', 'onComplete']),
 ('cron.js', 'addCallback ( callback )', 'addCallback', ['callback']),
 ('cron.js', '_callback ( )', '_callback', []),
 ('cron.js', 'function ( self )', 'function', ['self']),
 ('cron.js', 'start ( )', 'start', []),


In [53]:
[(method.start_line, method.end_line) for method in modifications[shared_hash][0].methods]

[(14, 27),
 (43, 98),
 (104, 107),
 (113, 121),
 (126, 129),
 (135, 146),
 (151, 162),
 (171, 181),
 (168, 190),
 (208, 226),
 (195, 234),
 (239, 247),
 (255, 259),
 (264, 271),
 (283, 294),
 (277, 296),
 (302, 305),
 (310, 313),
 (315, 318),
 (320, 323),
 (325, 328)]

In [55]:
[(method.fan_in, method.fan_out) for method in modifications[shared_hash][0].methods]

[(0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0)]

(I have no idea what fan in and fan out is ¯\\\_(ツ)_/¯)

In [52]:
[method.nloc for method in modifications[shared_hash][0].methods]

[10, 43, 4, 9, 4, 10, 9, 7, 10, 10, 13, 6, 3, 5, 6, 5, 4, 4, 4, 4, 4]

In [58]:
[method.token_count for method in modifications[shared_hash][0].methods]

[59,
 304,
 28,
 54,
 11,
 50,
 66,
 36,
 86,
 84,
 89,
 37,
 21,
 33,
 23,
 38,
 11,
 16,
 12,
 17,
 17]

In [59]:
[method.top_nesting_level for method in modifications[shared_hash][0].methods]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]