In [1]:
import numpy as np
import pandas as pd
import math

## **DATA SELECTION**

First, we select the tables that contains the variables of interest

In [3]:
path1 = "../data/raw/TDD/GIT_COMMITS.csv"
path2 = "../data/raw/TDD/SONAR_ANALYSIS.csv"
path3 = "../data/raw/TDD/SONAR_MEASURES.csv"
db_commits = pd.read_csv(path1,lineterminator='\n')
db_analysis = pd.read_csv(path2)
db_measures = pd.read_csv(path3)

Secondly, for each table, we select those variables:

    · GIT_COMMITS: project id, commit hash, commit message, author and committer date
    · SONAR_ANALYSIS: revision and anlysis key
    · SONAR_MESURES: analysis key, complexity, violations and development cost 

In [4]:
db_commits = db_commits[["PROJECT_ID", "COMMIT_HASH", "COMMIT_MESSAGE", "AUTHOR",  "COMMITTER_DATE"]]
db_commits.head()

Unnamed: 0,PROJECT_ID,COMMIT_HASH,COMMIT_MESSAGE,AUTHOR,COMMITTER_DATE
0,org.apache:archiva,94fe3a7fc056638c90cbe4a6319c3cb658f395a5,create template structure git-svn-id: https:...,Brett Porter,2005-11-23 23:54:12+00:00
1,org.apache:archiva,2b6be811c1d4f5e81339616208530d486608e42b,repository manager - root POM git-svn-id: ht...,Brett Porter,2005-11-28 03:15:35+00:00
2,org.apache:archiva,af0ddbcfc1e8c2528decbd458f42e16cbcae5051,update reports git-svn-id: https://svn.apach...,Brett Porter,2005-11-28 04:50:31+00:00
3,org.apache:archiva,ea640e1803a8535ebfba2ade4ea9272b240067e8,move discovery to a new component PR: MRM-9 ...,Brett Porter,2005-11-29 01:34:22+00:00
4,org.apache:archiva,4b1232d671e7c0aa44927ce2d4574d8cb751497d,add discovery module PR: MRM-9 git-svn-id: h...,Brett Porter,2005-11-29 01:43:41+00:00


In [5]:
db_analysis = db_analysis[["REVISION", "ANALYSIS_KEY"]]
db_analysis.head()

Unnamed: 0,REVISION,ANALYSIS_KEY
0,b9988a83e364b9b470873dff8996dcf401d08dc4,AWedEXD3C4KKKThcCqHV
1,5d351ea375f2b8d4ca0ae40b887392915983121a,AWedCu_QC4KKKThcCqGv
2,2bb54cc863c9c599e316ae5da7ba1a77f42e9402,AWedBDbIC4KKKThcCqGA
3,68b851d2adbbeee6014d1c37b9c07fdb4fc6820f,AWec_b8nC4KKKThcCqFg
4,e6c057bd6a04ec286d4ac4da5d2a82696868b937,AWec904qC4KKKThcCqE3


In [6]:
db_measures = db_measures[["analysis_key", "complexity", "violations", "development_cost"]]
db_measures.head()

Unnamed: 0,analysis_key,complexity,violations,development_cost
0,AWedEXD3C4KKKThcCqHV,30703.0,16006,6207000
1,AWedCu_QC4KKKThcCqGv,30703.0,16006,6207000
2,AWedBDbIC4KKKThcCqGA,30703.0,16004,6207330
3,AWec_b8nC4KKKThcCqFg,30699.0,16000,6206280
4,AWec904qC4KKKThcCqE3,30670.0,15991,6200220


## **DATA CLEANING**

We proceed to clean the tables by deleting the NA only in the sonar_measures table as we want to keep the time consistency when computing the invervals between the selected variables.

In [7]:
db_measures = db_measures[db_measures["complexity"].notna()]

In [8]:
db_sonar0 = db_analysis.merge(db_measures, left_on="ANALYSIS_KEY", right_on="analysis_key")
db_sonar = db_sonar0.rename(columns = {'REVISION' : 'COMMIT_HASH'})
db_sonar = db_sonar[["COMMIT_HASH", "complexity", "violations", "development_cost"]]
db_sonar.head()

Unnamed: 0,COMMIT_HASH,complexity,violations,development_cost
0,b9988a83e364b9b470873dff8996dcf401d08dc4,30703.0,16006,6207000
1,5d351ea375f2b8d4ca0ae40b887392915983121a,30703.0,16006,6207000
2,2bb54cc863c9c599e316ae5da7ba1a77f42e9402,30703.0,16004,6207330
3,68b851d2adbbeee6014d1c37b9c07fdb4fc6820f,30699.0,16000,6206280
4,e6c057bd6a04ec286d4ac4da5d2a82696868b937,30670.0,15991,6200220


In [9]:
db_commits

Unnamed: 0,PROJECT_ID,COMMIT_HASH,COMMIT_MESSAGE,AUTHOR,COMMITTER_DATE
0,org.apache:archiva,94fe3a7fc056638c90cbe4a6319c3cb658f395a5,create template structure git-svn-id: https:...,Brett Porter,2005-11-23 23:54:12+00:00
1,org.apache:archiva,2b6be811c1d4f5e81339616208530d486608e42b,repository manager - root POM git-svn-id: ht...,Brett Porter,2005-11-28 03:15:35+00:00
2,org.apache:archiva,af0ddbcfc1e8c2528decbd458f42e16cbcae5051,update reports git-svn-id: https://svn.apach...,Brett Porter,2005-11-28 04:50:31+00:00
3,org.apache:archiva,ea640e1803a8535ebfba2ade4ea9272b240067e8,move discovery to a new component PR: MRM-9 ...,Brett Porter,2005-11-29 01:34:22+00:00
4,org.apache:archiva,4b1232d671e7c0aa44927ce2d4574d8cb751497d,add discovery module PR: MRM-9 git-svn-id: h...,Brett Porter,2005-11-29 01:43:41+00:00
...,...,...,...,...,...
153989,org.apache:santuario,3bbe9d92b996fed10440ba772e1be711c183f781,Minor optimisation\n\ngit-svn-id: https://svn....,Colm O hEigeartaigh,2019-06-25T13:59:59Z
153990,org.apache:santuario,0114371d12701f4358746d6e39d54efc48a2a332,Minor improvement to XMLUtils\n\ngit-svn-id: h...,Colm O hEigeartaigh,2019-07-11T14:50:10Z
153991,org.apache:santuario,c05467b72c02227ce64ca7000a670b6f1eb5bdf1,Updating some dependencies\n\ngit-svn-id: http...,Colm O hEigeartaigh,2019-07-12T08:12:52Z
153992,org.apache:santuario,5aa25ac67d4e4efae605220f84935145827247e9,Removing XMLSignatureInput fallback\n\ngit-svn...,Colm O hEigeartaigh,2019-07-12T11:00:28Z


In [10]:
db_sonar

Unnamed: 0,COMMIT_HASH,complexity,violations,development_cost
0,b9988a83e364b9b470873dff8996dcf401d08dc4,30703.0,16006,6207000
1,5d351ea375f2b8d4ca0ae40b887392915983121a,30703.0,16006,6207000
2,2bb54cc863c9c599e316ae5da7ba1a77f42e9402,30703.0,16004,6207330
3,68b851d2adbbeee6014d1c37b9c07fdb4fc6820f,30699.0,16000,6206280
4,e6c057bd6a04ec286d4ac4da5d2a82696868b937,30670.0,15991,6200220
...,...,...,...,...
66678,877a731524e103ca69900c9d4d63350956947e2a,9031.0,4639,1840800
66679,693f81bd103d14c80c38609bbc6cddb356310b3d,9031.0,4639,1840800
66680,33b652ebef8af747cc1fae326a1e58f728e62adc,9029.0,4628,1839000
66681,4fcc59ff15bfd7e456ae342581eff330923edac5,9029.0,4628,1839000


In [11]:
db_merged = pd.merge(db_commits, db_sonar, how = 'left', on = 'COMMIT_HASH', indicator = True)
db_merged.head()

Unnamed: 0,PROJECT_ID,COMMIT_HASH,COMMIT_MESSAGE,AUTHOR,COMMITTER_DATE,complexity,violations,development_cost,_merge
0,org.apache:archiva,94fe3a7fc056638c90cbe4a6319c3cb658f395a5,create template structure git-svn-id: https:...,Brett Porter,2005-11-23 23:54:12+00:00,,,,left_only
1,org.apache:archiva,2b6be811c1d4f5e81339616208530d486608e42b,repository manager - root POM git-svn-id: ht...,Brett Porter,2005-11-28 03:15:35+00:00,,,,left_only
2,org.apache:archiva,af0ddbcfc1e8c2528decbd458f42e16cbcae5051,update reports git-svn-id: https://svn.apach...,Brett Porter,2005-11-28 04:50:31+00:00,,,,left_only
3,org.apache:archiva,ea640e1803a8535ebfba2ade4ea9272b240067e8,move discovery to a new component PR: MRM-9 ...,Brett Porter,2005-11-29 01:34:22+00:00,47.0,23.0,18540.0,both
4,org.apache:archiva,4b1232d671e7c0aa44927ce2d4574d8cb751497d,add discovery module PR: MRM-9 git-svn-id: h...,Brett Porter,2005-11-29 01:43:41+00:00,47.0,23.0,18540.0,both


In [12]:
db_merged = db_merged.sort_values(by=["PROJECT_ID","COMMITTER_DATE"]).reset_index()

db_merged['inc_complexity'] = float("Nan")
db_merged['inc_violations'] = float("Nan")
db_merged['inc_development_cost'] = float("Nan")

db_merged.append
for i in range(1,db_merged.shape[0]):

    #first we make sure that both entries are from the same project (if not leave with the Nan value in the increment variable)
    if (db_merged['PROJECT_ID'][i] == db_merged['PROJECT_ID'][i-1]):

        for inc_variable in [["complexity",'inc_complexity'],["violations", "inc_violations"],["development_cost", "inc_development_cost"]]:
            variable_act = db_merged[inc_variable[0]][i] #value for the variable in the row i
            variable_past =  db_merged[inc_variable[0]][i-1] #value for the variable in the row before i
            
            if pd.notna(variable_act) and pd.notna(variable_past): #both entries available
                db_merged[inc_variable[1]][i] = variable_act - variable_past
            else:
                break

db_increases = db_merged[db_merged['inc_complexity'].notna() & db_merged['inc_violations'].notna() & db_merged['inc_development_cost'].notna()]

    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [13]:
clean_db_merged = db_merged.dropna() #we delete al the NAs in the table

final_db = clean_db_merged[["PROJECT_ID", "COMMIT_HASH", "COMMIT_MESSAGE", "AUTHOR", "COMMITTER_DATE", "inc_complexity", "inc_violations", "inc_development_cost"]]
final_db.head()

Unnamed: 0,PROJECT_ID,COMMIT_HASH,COMMIT_MESSAGE,AUTHOR,COMMITTER_DATE,inc_complexity,inc_violations,inc_development_cost
4,org.apache:archiva,4b1232d671e7c0aa44927ce2d4574d8cb751497d,add discovery module PR: MRM-9 git-svn-id: h...,Brett Porter,2005-11-29 01:43:41+00:00,0.0,0.0,0.0
5,org.apache:archiva,41c7634b39dadb0b6b2e6dbfee3f8828aace653c,add discovery module git-svn-id: https://svn...,Brett Porter,2005-11-29 01:45:24+00:00,0.0,0.0,180.0
6,org.apache:archiva,7f0762951b33206c1a7e0da8de9506d6bca96133,add test PR: MRM-9 git-svn-id: https://svn.a...,Brett Porter,2005-11-29 02:25:03+00:00,5.0,1.0,1530.0
7,org.apache:archiva,8e757bd2a0faec0732b512ea1d4df1e082aea6ff,add some more tests PR: MRM-9 git-svn-id: htt...,Brett Porter,2005-11-29 03:11:23+00:00,7.0,5.0,990.0
8,org.apache:archiva,9c694d371a9a9d3635dedb88b3ad90e8d450544e,complete default discoverer tests PR: MRM-9 g...,Brett Porter,2005-11-29 04:01:17+00:00,4.0,2.0,510.0


In [21]:
print(np.mean(final_db["inc_complexity"]),np.var(final_db["inc_complexity"]),max(final_db["inc_complexity"]),min(final_db["inc_complexity"]))
print(np.mean(final_db["inc_violations"]),np.var(final_db["inc_violations"]),max(final_db["inc_violations"]),min(final_db["inc_violations"]))
print(np.mean(final_db["inc_development_cost"]),np.var(final_db["inc_development_cost"]),max(final_db["inc_development_cost"]),min(final_db["inc_development_cost"]))

4.265460845240555 258242.8064954299 15921.0 -15919.0
1.9401115755678116 31177.24647297895 11079.0 -6437.0
838.8017546926903 6004734069.834295 2978520.0 -2978520.0


In [None]:
final_db.to_csv('../data/processed/predictionDB.csv', index='False') #export!
print(final_db.shape)