# SIESTA 2019, IJM, Predicting Bug Fixing Commits

In [1]:
from sqlalchemy import create_engine
import pandas as pd

In [2]:
# Load the `sql` extension, which provides the %sql / %%sql magics used in the cells below.
%load_ext sql
# NOTE(review): presumably turns off automatic commits after each statement; this notebook
# only issues SELECT queries — confirm that nothing relies on commits.
%config SqlMagic.autocommit=False
# Return query results as pandas DataFrames so they can be processed with pandas further down.
%config SqlMagic.autopandas=True

In [3]:
# Show up to 100 characters per cell so that long file paths and commit messages stay readable.
pd.options.display.max_colwidth = 100

In [4]:
# Connect to the PostgreSQL database `retrofit` on localhost:5432 as user `mpinzger`.
# NOTE(review): user name and host are hard-coded; consider reading the connection URL from
# the environment so the notebook can run on other machines.
%sql postgresql://mpinzger@localhost:5432/retrofit

'Connected: mpinzger@retrofit'

## Get bug-fixing commits

In [59]:
%%sql bug_fixing_revisions << SELECT f.*, co.commit_dst, co.commit_msg, co.timestamp
-- One row per file revision whose commit message looks like a bug fix.
-- A message qualifies when it contains bug(s), issue(s) or fix/fixed as a whole word
-- (case-insensitive), or a hash sign followed by digits such as #123.
-- Rows are returned in chronological commit order.
FROM change_schema.filerevision AS f
INNER JOIN change_schema.commit AS co
    ON f.revision_id = co.id
WHERE co.commit_msg ~* '[[:<:]]bugs?[[:>:]]'
   OR co.commit_msg ~* '[[:<:]]issues?[[:>:]]'
   OR co.commit_msg ~* '[[:<:]]fix(ed)?[[:>:]]'
   OR co.commit_msg ~* '#\d+'
ORDER BY co.timestamp;

 * postgresql://mpinzger@localhost:5432/retrofit
134 rows affected.
Returning data to local variable bug_fixing_revisions


In [241]:
#bug_fixing_revisions.head(3)

## Get counts per filerevision and action type

In [87]:
%%sql changes_per_action << SELECT co.timestamp, f.id, f.filename, c.action, COUNT(c.id) AS count_changes
-- Number of recorded changes per file revision and change action,
-- returned in chronological commit order.
FROM change_schema.changes AS c
INNER JOIN change_schema.filerevision AS f
    ON c.filerevision_id = f.id
INNER JOIN change_schema.commit AS co
    ON f.revision_id = co.id
GROUP BY co.timestamp, f.id, f.filename, c.action
ORDER BY co.timestamp, f.id, c.action;

 * postgresql://mpinzger@localhost:5432/retrofit
3583 rows affected.
Returning data to local variable changes_per_action


In [243]:
#changes_per_action.tail(3)

### Add buggy flag

In [92]:
# Flag every row whose file revision id also occurs in the bug-fixing query result.
fixing_revision_ids = bug_fixing_revisions['id']
changes_per_action['buggy'] = changes_per_action['id'].isin(fixing_revision_ids)

In [103]:
# Spot check: is file revision 1009 among the bug-fixing revisions?
1009 in bug_fixing_revisions['id'].values

False

In [242]:
# Preview the first three rows, now including the `buggy` column.
changes_per_action.iloc[:3]

Unnamed: 0,timestamp,id,filename,action,count_changes,buggy
0,2012-03-26 15:38:09,95,android/src/main/java/retrofit/android/ShakeDetector.java,INS,472,False
1,2012-03-26 15:38:09,96,android/src/main/java/retrofit/io/QueueFile.java,INS,1487,False
2,2012-03-26 15:38:09,440,io/src/main/java/retrofit/io/Files.java,INS,195,False


## Create dataset with counts per action type as separate columns

In [229]:
# Index the per-action counts by file revision id (an id repeats once per action type).
changes_per_action_idx = changes_per_action.set_index(keys='id')

In [230]:
# Build a wide table: one row per file revision id, one column per change action.
# Start from a column-less frame indexed by the distinct revision ids.
revision_ids = changes_per_action_idx.index.unique()
counts = pd.DataFrame({'id': revision_ids}).set_index('id')

for action_type in ('INS', 'DEL', 'MOV', 'UPD'):
    # Change counts of this action, named after the action so the joined column carries that name.
    is_action = changes_per_action_idx['action'] == action_type
    action_counts = changes_per_action_idx[is_action]['count_changes'].rename(action_type)
    # Revisions without any change of this action get 0 instead of NaN.
    counts = counts.join(action_counts, how='left').fillna(0.0)

### Add revision information 

In [231]:
# Collapse the per-action rows into one row per file revision.
# After aggregation `action` holds the number of per-action rows of the revision and
# `count_changes` the total number of changes over all actions.
revision_keys = ['id', 'filename', 'timestamp', 'buggy']
unique_revisions = (
    changes_per_action
    .groupby(revision_keys, as_index=False)
    .agg({'action': 'count', 'count_changes': 'sum'})
)

In [232]:
# Preview the first three aggregated revisions.
unique_revisions.iloc[:3]

Unnamed: 0,id,filename,timestamp,buggy,action,count_changes
0,1,retrofit-converters/moshi/src/main/java/retrofit/MoshiConverterFactory.java,2015-11-24 12:54:38,False,1,4
1,2,retrofit-converters/moshi/src/main/java/retrofit2/converter/moshi/MoshiConverterFactory.java,2016-09-01 19:52:20,False,3,32
2,3,retrofit-converters/moshi/src/main/java/retrofit2/converter/moshi/MoshiRequestBodyConverter.java,2016-09-01 19:52:20,False,4,25


In [233]:
# Index the revisions by id so they can be aligned with `counts` on the index.
unique_revisions = unique_revisions.set_index('id')

In [235]:
# Attach file name, commit timestamp, buggy flag and per-revision totals to the per-action counts.
# Columns left over from an earlier run of this cell are dropped first: merging twice would
# otherwise produce suffixed duplicates (filename_x, filename_y, ...) and break the later sort
# on 'timestamp'. On a first run the drop is a no-op.
# validate='one_to_one' makes pandas raise if either index holds a revision id more than once,
# instead of silently duplicating rows.
counts = (
    counts
    .drop(columns=unique_revisions.columns, errors='ignore')
    .merge(unique_revisions, left_index=True, right_index=True, how='left', validate='one_to_one')
)

In [244]:
# Order the dataset chronologically by commit timestamp.
counts = counts.sort_values(by='timestamp')

In [245]:
# Preview the five earliest file revisions of the final dataset.
counts.iloc[:5]

Unnamed: 0_level_0,INS,DEL,MOV,UPD,filename,timestamp,buggy,action,count_changes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
95,472.0,0.0,0.0,0.0,android/src/main/java/retrofit/android/ShakeDetector.java,2012-03-26 15:38:09,False,1,472
1827,110.0,0.0,0.0,0.0,http/src/main/java/retrofit/http/Server.java,2012-03-26 15:38:09,False,1,110
1828,74.0,0.0,0.0,0.0,http/src/main/java/retrofit/http/TypedBytesBody.java,2012-03-26 15:38:09,False,1,74
1829,9.0,0.0,0.0,0.0,http/src/main/java/retrofit/http/SingleEntity.java,2012-03-26 15:38:09,False,1,9
1830,158.0,0.0,0.0,0.0,http/src/main/java/retrofit/http/UiCallback.java,2012-03-26 15:38:09,False,1,158


## Do a random forest classification

In [238]:
from sklearn.ensemble import RandomForestClassifier

In [239]:
# Random forest of 100 shallow trees (depth 2); the fixed random_state keeps runs reproducible.
forest_params = {'n_estimators': 100, 'max_depth': 2, 'random_state': 0}
clf = RandomForestClassifier(**forest_params)

In [250]:
# Model fitting is left disabled in this notebook.
# NOTE(review): counts.INS is a single column (1-D); scikit-learn estimators presumably expect a
# 2-D feature matrix such as counts[['INS']] — confirm before enabling.
#clf.fit(counts.INS, counts.buggy)