In [1]:
# Notebook-wide configuration shared by the cells below.
# MongoDB connection URI; stays None until initialize() has connected once.
uri = None
# NOTE(review): MAX_MONTHS is not referenced anywhere in the visible cells - confirm it is still needed.
MAX_MONTHS = 10
# Size of the thread pool used for downloading and for evaluation.
NTHREADS = 20
# Upper bound on the number of projects processed by the evaluation cell.
NPROJECTS = 1
# Projects whose data is downloaded (and cached under csv/<project>/) by the download cell.
project_names = ['accumulo', 'ant-ivy', 'archiva', 'aurora', 'calcite', 'cayenne', 'commons-bcel', 'commons-beanutils', 'commons-codec', 'commons-collections', 'commons-compress', 'commons-configuration', 'commons-dbcp', 'commons-digester', 'commons-imaging', 'commons-io', 'commons-jcs', 'commons-jexl', 'commons-lang', 'commons-math', 'commons-net', 'commons-rdf', 'commons-scxml', 'commons-validator', 'commons-vfs', 'deltaspike', 'falcon', 'flume', 'giraph', 'kafka', 'knox', 'kylin', 'lens', 'mahout', 'nifi', 'nutch', 'opennlp', 'parquet-mr', 'pdfbox', 'pig', 'storm', 'struts', 'systemml', 'tez', 'tika', 'wss4j']

In [2]:
import time
import sys
import pandas as pd
import os.path
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from dateutil.rrule import rrule, MONTHLY
from shutil import rmtree
from sklearn.model_selection import train_test_split

try:
    from mongoengine import connect
    from pycoshark.mongomodels import Commit, IssueComment, Issue, Message, People, VCSSystem, Project, MailingList, IssueSystem
    from pycoshark.utils import create_mongodb_uri_string
except ImportError:
    get_ipython().system('{sys.executable} -m pip install pycoshark')
    from mongoengine import connect
    from pycoshark.mongomodels import Commit, IssueComment, Issue, Message, People, VCSSystem, Project, MailingList, IssueSystem
    from pycoshark.utils import create_mongodb_uri_string

def undersample(df, random_state=None):
    """Balance rows with and without commits by randomly dropping majority rows.

    A row is "positive" when its ``comQ`` value is greater than zero and
    "negative" otherwise. All rows of the smaller class are kept and an equally
    sized random sample (without replacement) of the larger class is added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``comQ`` column.
    random_state : int, optional
        Seed for a private ``numpy.random.RandomState``. When omitted the
        global numpy random state is used, as before.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when the classes are already balanced, otherwise a new
        frame with the minority rows first, followed by the sampled majority
        rows.
    """
    is_positive = np.asarray(df['comQ'] > 0)
    positive = int(is_positive.sum())
    negative = len(df) - positive

    if positive == negative:
        return df

    print('Class imbalance:',positive,negative)

    # Work with row positions instead of index labels so that the selection
    # stays correct even when the index contains duplicate labels.
    positive_pos = np.flatnonzero(is_positive)
    negative_pos = np.flatnonzero(~is_positive)
    if positive > negative:
        minority, majority = negative_pos, positive_pos
    else:
        minority, majority = positive_pos, negative_pos

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    sampled = rng.choice(majority, len(minority), replace=False)
    keep = np.concatenate([minority, sampled]).astype(np.intp)
    return df.iloc[keep]
    


def initialize(project_name):
    """Prepare the on-disk layout and database connection for one project.

    Creates ``csv/<project_name>/`` with the ``raw/com``, ``raw/acom``,
    ``raw/msg``, ``train`` and ``test_sets`` sub-directories (including any
    missing parents), connects to MongoDB on the first call, and returns the
    distinct committer ids of the project's commits in first-seen order.

    Parameters
    ----------
    project_name : str
        Name of the project as stored in the ``Project`` collection.

    Returns
    -------
    list
        Unique ``committer_id`` values of the project's commits.
    """
    global uri
    # Create project folders. makedirs also creates the top-level 'csv'
    # directory and tolerates folders that already exist.
    base = 'csv/'+str(project_name)
    for sub in ('raw/com', 'raw/acom', 'raw/msg', 'train', 'test_sets'):
        os.makedirs(base+'/'+sub, exist_ok=True)

    if uri is None:
        # Database credentials. They can be overridden through environment
        # variables; the literals are only kept as backward-compatible defaults.
        # NOTE(review): credentials should not live in the notebook - consider
        # removing the fallbacks once the environment variables are in use.
        user = os.environ.get('SMARTSHARK_USER', 'datascience2018')
        password = os.environ.get('SMARTSHARK_PASSWORD', 'qFztn73TwV')
        host = os.environ.get('SMARTSHARK_HOST', '134.76.81.151')
        port = os.environ.get('SMARTSHARK_PORT', '27017')
        authentication_db = 'smartshark_test'
        database = "smartshark_test"
        ssl_enabled = None

        # Establish connection
        uri = create_mongodb_uri_string(
            user, password, host, port, authentication_db, ssl_enabled)
        connect(database, host=uri)

    # Fetch project id and version control system id for the project
    project = Project.objects(name=project_name).only('id').get()
    vcs_system = VCSSystem.objects(project_id=project.id).only('id').get()

    # Fetch all unique committer IDs for the project, keeping first-seen order.
    # The set makes the membership test O(1) instead of scanning the list.
    seen = set()
    committers = []
    for c in Commit.objects(vcs_system_id=vcs_system.id).only('committer_id'):
        if c.committer_id not in seen:
            seen.add(c.committer_id)
            committers.append(c.committer_id)

    return committers

# That's it
def getQuarter(month):
    """Return the one-hot season vector and quarter number for a month.

    Parameters
    ----------
    month : int
        Month number; 1-12 map to quarters 1-4.

    Returns
    -------
    tuple
        ``(season, quarter)`` where ``season`` is a four element 0/1 list with
        a single 1 at the quarter's position, or an empty list when the
        computed quarter is outside 1-4.
    """
    quarter = (month + 2) // 3
    if quarter in (1, 2, 3, 4):
        season = [int(slot == quarter) for slot in (1, 2, 3, 4)]
    else:
        season = []
    return season, quarter

# Generate empty rows
def getFiller(row, total_month_offset):
    """Build zero-activity rows for the months that follow ``row``.

    One row is produced for each month offset ``1 .. total_month_offset - 1``
    after the month of ``row``. In every generated row the current-period
    values are zero and the lagged values are the values of ``row`` shifted by
    the number of months (at most 5) or quarters (at most 3) that have passed.

    Parameters
    ----------
    row : pandas.Series
        Source row. It is read by label for ``'year'``, ``'quarter'`` and
        ``'month'`` and by position for the value groups: monthly values at
        positions 7-11, 15-19 and 23-27, quarterly values at 12-14, 20-22 and
        28-30.
    total_month_offset : int
        Exclusive upper bound of the month offsets to generate.

    Returns
    -------
    list of list
        Rows laid out as ``[year, quarter, month, *season, ...]`` followed by
        the six shifted value groups in the same order as in ``row``.
    """
    def shifted(start, width, lag):
        # `lag` leading zeros followed by the first (width - lag) source values.
        return [*np.zeros(lag)]+[*row.iloc[start:(start+width-lag)]]

    rows = []
    prevQ = row['quarter']
    quarter_offset = 0
    for month_offset in range(1,total_month_offset):
        # Zero-based month count from January of the source year. Using
        # divmod-style arithmetic keeps December in its own year (the year is
        # only advanced when the month actually wraps to January).
        months_from_january = (row['month']-1)+month_offset
        year = row['year']+months_from_january // 12
        month = months_from_january % 12 + 1

        season, quarter = getQuarter(month)

        if (quarter > prevQ) or (quarter == 1 and prevQ == 4):
            quarter_offset += 1
            prevQ = quarter

        # Only three quarterly and five monthly slots exist, so cap the shifts.
        if quarter_offset > 3:
            quarter_offset = 3
        month_lag = min(month_offset, 5)

        shiftedComM = shifted(7, 5, month_lag)
        shiftedAComM = shifted(15, 5, month_lag)
        shiftedMsgM = shifted(23, 5, month_lag)

        shiftedComQ = shifted(12, 3, quarter_offset)
        shiftedAComQ = shifted(20, 3, quarter_offset)
        shiftedMsgQ = shifted(28, 3, quarter_offset)

        r = [year, quarter, month,*season,*shiftedComM,*shiftedComQ,*shiftedAComM,*shiftedAComQ,*shiftedMsgM,*shiftedMsgQ]
        rows.append(r)

    return rows

class Counter:
    """Builds per-person activity tables (commits, authored commits, mails).

    For one project the class downloads the raw events of a person, aggregates
    them into one row per month with lagged monthly and quarterly counts,
    caches the result under ``csv/<project>/raw/`` and finally writes the
    hole-filled train and test CSV files.
    """

    # Columns shared by every activity table.
    _KEY_COLUMNS = ['year','quarter','month','S1','S2','S3','S4']

    # Initialize some project specific stuff
    def __init__(self, project_name):
        """Look up the project's ids, reference date and mailing lists."""
        self.project_name = project_name
        self.project_id = Project.objects(name=project_name).only('id').get().id
        self.vcs_system_id = VCSSystem.objects(project_id=self.project_id).only('id').get().id
        rawdate = VCSSystem.objects(project_id=self.project_id).only('last_updated').get().last_updated
        # Snap the last update to the first month of its quarter (1, 4, 7, 10).
        quarter_start = rawdate.month - ((rawdate.month - 1) % 3)
        self.today = datetime.date(rawdate.year, quarter_start, 1)
        print('Project date set to:',self.today)

        self.mailing_list_ids = [
            mid.id for mid in MailingList.objects(project_id=self.project_id).only('id')
        ]

    @staticmethod
    def _columns(prefix):
        """Return the 15 column names of an activity table for ``prefix``."""
        monthly = [prefix+'M'] + [prefix+'M'+str(lag) for lag in range(1, 5)]
        quarterly = [prefix+'Q', prefix+'Q1', prefix+'Q2']
        return Counter._KEY_COLUMNS + monthly + quarterly

    @staticmethod
    def _count_per_day(timestamps):
        """Count events per calendar day.

        ``timestamps`` is an iterable of datetime objects. The result is a
        one-column (``'count'``) frame indexed by day and sorted by date. Each
        event is credited to its own day regardless of the iteration order.
        """
        per_day = {}
        for stamp in timestamps:
            day = stamp.date()
            per_day[day] = per_day.get(day, 0) + 1
        frame = pd.DataFrame(list(per_day.values()),
                             index=pd.to_datetime(list(per_day.keys())),
                             columns=['count'])
        return frame.sort_index()

    def _cached_counts(self, subdir, prefix, pid, announcement, fetch_timestamps):
        """Load an activity table from its CSV cache or build and cache it.

        ``fetch_timestamps`` is only called on a cache miss and must return an
        iterable with the datetime of every event of ``pid``.
        """
        fname = 'csv/'+self.project_name+'/raw/'+subdir+'/'+str(pid)+'.csv'
        if os.path.isfile(fname):
            # The cache was written with the frame index as first column.
            return pd.read_csv(fname).drop('Unnamed: 0', axis = 1)

        print(announcement,pid,'...')
        df = self.formatDF(self._count_per_day(fetch_timestamps()), self._columns(prefix))
        df.to_csv(fname)
        return df

    def fillHoles(self,df,pid):
        """Fill missing months up to the project date and write train/test CSVs.

        ``df`` holds the merged monthly rows of ``pid`` sorted by date. Missing
        months between rows and between the last row and ``self.today`` are
        filled with rows from ``getFiller``; rows dated after ``self.today``
        are discarded. The last row goes to ``test_sets/<pid>.csv`` and, when
        more than one row exists, the preceding rows go to ``train/<pid>.csv``.
        Nothing is returned.
        """
        project_date = self.today
        first_row = df.iloc[0]
        first_date = datetime.date(first_row['year'],first_row['month'],1)
        pieces = [pd.DataFrame([first_row],columns=df.columns)]

        def add_filler(source_row, total_month_offset):
            # Skip empty fillers so pd.concat never sees an all-empty frame.
            filler = getFiller(source_row, total_month_offset)
            if filler:
                pieces.append(pd.DataFrame(filler, columns=df.columns))

        if first_date > project_date:
            print('First commit from the future for',pid)
        elif df.shape[0] == 1:
            if first_date != project_date:
                # Keep the only real row and extend it up to the project date.
                total_month_offset = rrule(MONTHLY, dtstart=first_date, until=project_date).count()
                add_filler(first_row, total_month_offset)
        else:
            last_pos = len(df) - 1
            prev_row = first_row
            #They're sorted already. Skip the 1st row
            for pos in range(1, len(df)):
                row = df.iloc[pos]
                prev_date = datetime.date(prev_row['year'],prev_row['month'],1)
                now_date = datetime.date(row['year'],row['month'],1)

                #Check if there's a hole in the future
                if now_date > project_date:
                    print('Discarding data from',now_date,'for id',pid)
                    continue

                total_month_offset = rrule(MONTHLY, dtstart=prev_date, until=now_date).count()
                #Check if it's the next one
                if total_month_offset == 2:
                    pieces.append(pd.DataFrame([row],columns=df.columns))
                    prev_row = row
                #Check if there's a hole in the past
                elif total_month_offset > 2:
                    print('Hole between',prev_date,'and',now_date,'. Filling',total_month_offset,'rows')
                    # The missing months lie strictly between the two rows, so
                    # they are derived from the earlier row and stop one month
                    # before the current one.
                    add_filler(prev_row, total_month_offset-1)
                    pieces.append(pd.DataFrame([row],columns=df.columns))
                    prev_row = row

                #Check if it's the last row but still not filled
                if pos == last_pos and now_date < project_date:
                    total_month_offset = rrule(MONTHLY, dtstart=now_date, until=project_date).count()
                    print('Hole between',now_date,'and',project_date,'. Filling',total_month_offset,'rows to fill end')
                    add_filler(row, total_month_offset)

        # DataFrame.append no longer exists in pandas 2; concatenate once.
        ret = pd.concat(pieces, ignore_index=True)
        #Should already be sorted tho
        ret = ret.sort_values(by=['year','month'])

        test_path = 'csv/'+str(self.project_name)+'/test_sets/'+str(pid)+'.csv'
        train_path = 'csv/'+str(self.project_name)+'/train/'+str(pid)+'.csv'
        # NOTE(review): a trailing December row is treated as out of place and
        # the row before it becomes the test row; presumably this compensates
        # for filler rows whose year label sorts them after the project date -
        # confirm before changing.
        if ret['month'].iloc[-1] == 12:
            ret.tail(2).head(1).to_csv(test_path)
            if ret.shape[0] > 1:
                ret[:-2].to_csv(train_path)
        else:
            ret.tail(1).to_csv(test_path)
            if ret.shape[0] > 1:
                ret[:-1].to_csv(train_path)

    # Count commits
    def commits(self, pid):
        """Return the monthly table of commits committed by ``pid``."""
        def fetch():
            query = Commit.objects(vcs_system_id=self.vcs_system_id, committer_id=pid).only('committer_date')
            return (c.committer_date for c in query)
        return self._cached_counts('com', 'com', pid, 'Downloading commits for', fetch)

    # Count authored commits
    def authored_commits(self, pid):
        """Return the monthly table of commits authored by ``pid``."""
        def fetch():
            query = Commit.objects(vcs_system_id=self.vcs_system_id, author_id=pid).only('author_date')
            return (c.author_date for c in query)
        return self._cached_counts('acom', 'acom', pid, 'Downloading authored commits for', fetch)

    # Count messages sent, received and cc'd in
    def messages(self, pid):
        """Return the monthly table of mails sent by, sent to or cc'ing ``pid``.

        A mail that matches more than one of the three roles is counted once
        per matching role.
        """
        def fetch():
            for mid in self.mailing_list_ids:
                queries = (
                    Message.objects(mailing_list_id=mid, from_id=pid),
                    Message.objects(mailing_list_id=mid, to_ids__in=[pid]),
                    Message.objects(mailing_list_id=mid, cc_ids__in=[pid]),
                )
                for query in queries:
                    for c in query.only('date'):
                        yield c.date
        return self._cached_counts('msg', 'msg', pid, 'Downloading messages for ID', fetch)

    # Put the right values in the right boxes
    def formatDF(self, df, col):
        """Turn daily counts into one row per month with lagged features.

        Parameters
        ----------
        df : pandas.DataFrame
            Single-column frame of daily counts with a datetime index.
        col : list of str
            The 15 column names of the result.

        Returns
        -------
        pandas.DataFrame
            One row per month: year, quarter, month, four season flags, the
            month's count and the four previous monthly counts, then the
            quarter's count and the two previous quarterly counts. Lags that
            reach before the first month/quarter are 0.
        """
        # NOTE(review): recent pandas versions warn about the 'Q'/'M' aliases
        # (renamed to 'QE'/'ME'); they are kept for older installations.
        dfq = df.resample('Q').sum()
        dfm = df.resample('M').sum()

        raw = []
        Qcount = 0
        previousQ = None
        # The resampled index is consecutive, so the enumeration position is
        # also the row position inside dfm.
        for Mcount, m in enumerate(dfm.itertuples()):
            stamp = m[0]
            season, quarter = getQuarter(stamp.month)
            row = [stamp.year, quarter, stamp.month, *season]

            # Current month followed by up to four previous months.
            row += [dfm.iloc[Mcount-lag, 0] if Mcount >= lag else 0 for lag in range(5)]

            # Months are consecutive, so any change of quarter means that the
            # next row of dfq has been reached.
            if previousQ is not None and quarter != previousQ:
                Qcount += 1
            previousQ = quarter

            # Current quarter followed by up to two previous quarters.
            row += [dfq.iloc[Qcount-lag, 0] if Qcount >= lag else 0 for lag in range(3)]
            raw.append(row)

        return pd.DataFrame(raw, columns=col)

# Just a wrapper for Counter
class Downloader:
    """Thin wrapper that produces the training frame of one person."""

    def __init__(self, project_name):
        """Resolve the project id and create the project's ``Counter``."""
        self.project_id = Project.objects(name=project_name).only('id').get().id
        self.project_name = project_name
        self.counter = Counter(self.project_name)

    def _read_train(self, path):
        """Read a cached training CSV and drop the stored index column."""
        return pd.read_csv(path).drop('Unnamed: 0', axis = 1)

    def download_data(self, pid):
        """Return the training rows of ``pid``, downloading them if needed.

        When ``csv/<project>/train/<pid>.csv`` exists it is returned directly.
        Otherwise messages, commits and authored commits are fetched, merged
        on the date/season columns (commits drive the join), written to
        ``raw/<pid>.csv`` and passed to ``Counter.fillHoles``. The training
        file written by that step is then returned, so a fresh download yields
        the same frame as a later cached call. If no training file was
        produced, the merged raw frame is returned.
        """
        raw_dir = 'csv/'+str(self.project_name)+'/raw/'
        train_dir = 'csv/'+str(self.project_name)+'/train/'
        fname = str(pid)+'.csv'
        keys = ['year','quarter','month','S1','S2','S3','S4']

        if os.path.isfile(train_dir+fname):
            return self._read_train(train_dir+fname)

        msg = self.counter.messages(pid)
        com = self.counter.commits(pid)
        acom = self.counter.authored_commits(pid)
        df = pd.merge(com, acom, how='left',on=keys)
        df = df.fillna(0)
        df = pd.merge(df, msg, how='left',on=keys)
        df = df.fillna(0)
        df = df.sort_values(by=['year','month'])
        df = df.astype('int64')
        df.to_csv(raw_dir+fname)
        self.counter.fillHoles(df,pid)
        print('Finished downloading data for ID:',pid)

        # fillHoles only writes a training file when more than one row exists.
        if os.path.isfile(train_dir+fname):
            df = self._read_train(train_dir+fname)

        return df

# Wrapper for the random forest
class Model:
    """Random forest regressor that predicts ``comQ`` from the lagged columns.

    Constructing the object splits the data 75/25, fits the forest on the
    training part, predicts the held-out part and prints/plots the evaluation.
    """

    def __init__(self,features):
        targets = np.array(features['comQ'])

        # The current-period columns are removed so that only lags remain.
        predictors = features.drop(
            ['comM', 'comQ', 'acomM', 'acomQ', 'msgM', 'msgQ'], axis = 1)
        split = train_test_split(predictors, targets, test_size = 0.25, random_state = 42)
        train_features, test_features, train_labels, test_labels = split
        self.train(train_features,train_labels)
        self.predict(test_features)
        self.evaluate(test_features,test_labels)

    def predict(self, features):
        """Predict with the fitted forest and remember the result."""
        self.predictions = self.model.predict(features)
        return self.predictions

    def evaluate(self, features, labels):
        """Print MAE and MASE of the stored predictions and plot importances."""
        abs_errors = abs(self.predictions - labels)
        print('Mean Absolute Error:', round(np.mean(abs_errors), 2), 'commits.')

        # NOTE(review): the scale is taken from differences across the feature
        # columns rather than from the label series - confirm this is the
        # intended MASE denominator.
        naive_scale = np.abs(np.diff(features)).sum()/(features.shape[0]-1)
        mase = abs_errors.mean()/naive_scale
        print('MASE:', mase)

        importances = self.model.feature_importances_
        order = np.argsort(importances)
        positions = range(len(order))
        plt.barh(positions, importances[order], color='b', align='center')
        plt.yticks(positions, [self.feature_list[i] for i in order])
        plt.xlabel('Relative Importance')
        plt.show()

    def train(self, features, labels):
        """Fit a 3000-tree random forest and keep it in ``self.model``."""
        from sklearn.ensemble import RandomForestRegressor

        self.feature_list = list(features.columns)
        matrix = np.array(features)

        forest = RandomForestRegressor(n_estimators = 3000, random_state = 42)
        forest.fit(matrix, labels)
        self.model = forest

In [3]:
import sys
from multiprocessing.dummy import Pool as ThreadPool 

# Shared pool of NTHREADS worker threads (multiprocessing.dummy is thread based);
# it is used by pool.map in the download and evaluation cells.
pool = ThreadPool(NTHREADS) 

def run(pid):
    """Predict the next-quarter commits of one person.

    Reads ``csv/<project_name>/test_sets/<pid>.csv`` (``project_name`` and
    ``model`` are notebook globals), removes the current-period columns the
    model was not trained on and predicts from the first row.

    Returns
    -------
    list
        ``[commits, willCommit]``: the rounded prediction and 1 when it is at
        least one commit, otherwise 0.
    """
    fname = 'csv/'+project_name+'/test_sets/'+str(pid)+'.csv'
    dfi = pd.read_csv(fname)
    # 'Unnamed: 0' is the index column written by to_csv; the other columns
    # are the current-period values dropped before training as well.
    dfi = dfi.drop(['Unnamed: 0', 'comM', 'comQ', 'acomM', 'acomQ', 'msgM', 'msgQ'], axis = 1)
    pred = np.round(model.predict(dfi))
    return [pred[0], 1 if pred[0] >= 1 else 0]

In [4]:
# Download (or load from the CSV cache) the training rows of every committer
# of every configured project, using the shared thread pool.
down_t0 = time.time()
for project_name in project_names:
    committers = initialize(project_name)
    downloader = Downloader(project_name)

    results = pool.map(downloader.download_data, committers)
    # DataFrame.append no longer exists in pandas 2; concatenate in one step.
    # NOTE(review): df is rebuilt for every project, so after the loop it only
    # holds the rows of the last project - confirm whether the model should be
    # trained on all projects instead.
    df = pd.concat(results) if results else None
    print(project_name,'finished loading')

down_t1 = time.time()
print('\n Downloading phase took:',np.round(down_t1-down_t0),'seconds')

In [None]:
# Give the concatenated per-committer rows a fresh 0..n-1 index.
df = df.reset_index(drop=True)
# Balance rows with and without commits (comQ) before training.
df = undersample(df)
train_t0 = time.time()
# Model.__init__ splits the data, fits the forest and prints its evaluation.
model = Model(df)
train_t1 = time.time()
print('Training phase took:',np.round(train_t1-train_t0),'seconds')

In [None]:
# Predict next-quarter activity for every committer of the evaluated projects.
# NOTE(review): print_rows and r are never filled in the visible cells.
print_rows = []
for project_name,i in zip(['pig'],range(NPROJECTS)):
    r = []
    committers = initialize(project_name)
    # Collect the per-committer test rows; DataFrame.append no longer exists
    # in pandas 2, so the frames are concatenated in one step.
    test_frames = [
        pd.read_csv('csv/'+project_name+'/test_sets/'+str(pid)+'.csv')
        for pid in committers
    ]
    testing_df = pd.concat(test_frames)
    testing_df = testing_df.drop('Unnamed: 0', axis = 1)

    eval_t0 = time.time()
    results = pool.map(run, committers)
    eval_t1 = time.time()
    res_df = pd.DataFrame(results, index=committers, columns=['commits','willCommit'])
    res_df.loc['Total',:]= res_df.sum(axis=0)
    print('Eval phase took:',np.round(eval_t1-eval_t0),'seconds')
    # Use the reference date of the project being evaluated; the global
    # `downloader` still belongs to the last project of the download cell.
    project_counter = Counter(project_name)
    s,q = getQuarter(project_counter.today.month)
    print('PROJECT_NAME:',project_name)
    print('NEXT QUARTER:',q,'\n')
    print('How many different committers are expected in the next quarter:',res_df.loc['Total','willCommit'])
    print('How many commits are expected in the next quarter:',res_df.loc['Total','commits'])
    print('Which developers will still be commiting source code in the next quarter:\n',res_df.loc[res_df['willCommit'] == 1].index)
