In [40]:
import numpy as np
import pandas as pd
import math

## **DATA SELECTION**

First, we select the tables that contains the variables of interest

In [41]:
path1 = "GIT_COMMITS.csv"
path2 = "SONAR_ANALYSIS.csv"
path3 = "SONAR_MEASURES.csv"
db_commits = pd.read_csv(path1)
db_analysis = pd.read_csv(path2)
db_measures = pd.read_csv(path3)

Secondly, for each table, we select those variables:

    · GIT_COMMITS: project id, commit hash, commit message, author and committer date
    · SONAR_ANALYSIS: revision and anlysis key
    · SONAR_MESURES: analysis key, complexity, violations and development cost 

In [42]:
db_commits = db_commits[["PROJECT_ID", "COMMIT_HASH", "COMMIT_MESSAGE", "AUTHOR",  "COMMITER_DATE"]]
db_commits.head()

Unnamed: 0,PROJECT_ID,COMMIT_HASH,COMMIT_MESSAGE,AUTHOR,COMMITER_DATE
0,org.apache:batik,52fc76012c5f969145c39d3fb398a7c2c094474f,New repository initialized by cvs2svn.\n\ngit-...,No Author,2000-10-01T07:37:01Z
1,org.apache:batik,b1ff4af6abfec32fc710d77795bb20a612a82126,Initial revision\n\n\ngit-svn-id: https://svn....,James Duncan Davidson,2000-10-01T07:37:01Z
2,org.apache:batik,c8d7a13470987f892f7466d55c10e3cee34de31d,Update\nPR:\n\n\ngit-svn-id: https://svn.apach...,James Duncan Davidson,2000-10-01T07:40:39Z
3,org.apache:batik,93a16402b48ae1cf70ea4bd030479170749bd10a,Added question line (more a test of list/forwa...,James Duncan Davidson,2000-10-01T08:15:04Z
4,org.apache:batik,fcaecb541edc03f36b6ec7a792e97dfeabf26117,testing commit\n\n\ngit-svn-id: https://svn.ap...,Dean Jackson,2000-10-02T13:33:11Z


In [43]:
db_analysis = db_analysis[["REVISION", "ANALYSIS_KEY"]]
db_analysis.head()

Unnamed: 0,REVISION,ANALYSIS_KEY
0,b9988a83e364b9b470873dff8996dcf401d08dc4,AWedEXD3C4KKKThcCqHV
1,5d351ea375f2b8d4ca0ae40b887392915983121a,AWedCu_QC4KKKThcCqGv
2,2bb54cc863c9c599e316ae5da7ba1a77f42e9402,AWedBDbIC4KKKThcCqGA
3,68b851d2adbbeee6014d1c37b9c07fdb4fc6820f,AWec_b8nC4KKKThcCqFg
4,e6c057bd6a04ec286d4ac4da5d2a82696868b937,AWec904qC4KKKThcCqE3


In [44]:
db_measures = db_measures[["analysis_key", "complexity", "violations", "development_cost"]]
db_measures.head()

Unnamed: 0,analysis_key,complexity,violations,development_cost
0,AWedEXD3C4KKKThcCqHV,30703.0,16006,6207000
1,AWedCu_QC4KKKThcCqGv,30703.0,16006,6207000
2,AWedBDbIC4KKKThcCqGA,30703.0,16004,6207330
3,AWec_b8nC4KKKThcCqFg,30699.0,16000,6206280
4,AWec904qC4KKKThcCqE3,30670.0,15991,6200220


## **DATA CLEANING**

We proceed to clean the tables by deleting the NA only in the sonar_measures table as we want to keep the time consistency when computing the invervals between the selected variables.

In [45]:
db_measures = db_measures[db_measures["complexity"].notna()]

In [46]:
db_sonar0 = db_analysis.merge(db_measures, left_on="ANALYSIS_KEY", right_on="analysis_key")
db_sonar = db_sonar0.rename(columns = {'REVISION' : 'COMMIT_HASH'})
db_sonar = db_sonar[["COMMIT_HASH", "complexity", "violations", "development_cost"]]
db_sonar.head()

Unnamed: 0,COMMIT_HASH,complexity,violations,development_cost
0,b9988a83e364b9b470873dff8996dcf401d08dc4,30703.0,16006,6207000
1,5d351ea375f2b8d4ca0ae40b887392915983121a,30703.0,16006,6207000
2,2bb54cc863c9c599e316ae5da7ba1a77f42e9402,30703.0,16004,6207330
3,68b851d2adbbeee6014d1c37b9c07fdb4fc6820f,30699.0,16000,6206280
4,e6c057bd6a04ec286d4ac4da5d2a82696868b937,30670.0,15991,6200220


In [47]:
db_commits

Unnamed: 0,PROJECT_ID,COMMIT_HASH,COMMIT_MESSAGE,AUTHOR,COMMITER_DATE
0,org.apache:batik,52fc76012c5f969145c39d3fb398a7c2c094474f,New repository initialized by cvs2svn.\n\ngit-...,No Author,2000-10-01T07:37:01Z
1,org.apache:batik,b1ff4af6abfec32fc710d77795bb20a612a82126,Initial revision\n\n\ngit-svn-id: https://svn....,James Duncan Davidson,2000-10-01T07:37:01Z
2,org.apache:batik,c8d7a13470987f892f7466d55c10e3cee34de31d,Update\nPR:\n\n\ngit-svn-id: https://svn.apach...,James Duncan Davidson,2000-10-01T07:40:39Z
3,org.apache:batik,93a16402b48ae1cf70ea4bd030479170749bd10a,Added question line (more a test of list/forwa...,James Duncan Davidson,2000-10-01T08:15:04Z
4,org.apache:batik,fcaecb541edc03f36b6ec7a792e97dfeabf26117,testing commit\n\n\ngit-svn-id: https://svn.ap...,Dean Jackson,2000-10-02T13:33:11Z
...,...,...,...,...,...
81067,org.apache:thrift,53d9c0c20bd5af65676928b9b7a73dcb2cad3d78,Merging EOFException changes from Ben Maurer ...,Mark Slee,2007-11-26 21:15:40+00:00
81068,org.apache:thrift,5ab570558f55d73472fbf6c0e66e6e165093c7d8,Fix writeContainerEnd call being inside loop i...,Mark Slee,2007-11-27 08:38:16+00:00
81069,org.apache:thrift,844ac12489600d7647f01ab4f9b99d9e1b81e69e,TJSONProtocol writing support in Java Summary...,Mark Slee,2007-11-27 08:38:52+00:00
81070,org.apache:thrift,256bdc444866b90bbdccfb5343e9c9ea8c22603c,IPv6 tweaks for Thrift Summary: Need to pass ...,Mark Slee,2007-11-27 08:42:19+00:00


In [48]:
db_sonar

Unnamed: 0,COMMIT_HASH,complexity,violations,development_cost
0,b9988a83e364b9b470873dff8996dcf401d08dc4,30703.0,16006,6207000
1,5d351ea375f2b8d4ca0ae40b887392915983121a,30703.0,16006,6207000
2,2bb54cc863c9c599e316ae5da7ba1a77f42e9402,30703.0,16004,6207330
3,68b851d2adbbeee6014d1c37b9c07fdb4fc6820f,30699.0,16000,6206280
4,e6c057bd6a04ec286d4ac4da5d2a82696868b937,30670.0,15991,6200220
...,...,...,...,...
66678,877a731524e103ca69900c9d4d63350956947e2a,9031.0,4639,1840800
66679,693f81bd103d14c80c38609bbc6cddb356310b3d,9031.0,4639,1840800
66680,33b652ebef8af747cc1fae326a1e58f728e62adc,9029.0,4628,1839000
66681,4fcc59ff15bfd7e456ae342581eff330923edac5,9029.0,4628,1839000


In [49]:
db_merged = pd.merge(db_commits, db_sonar, how = 'left', on = 'COMMIT_HASH', indicator = True)
db_merged.head()

Unnamed: 0,PROJECT_ID,COMMIT_HASH,COMMIT_MESSAGE,AUTHOR,COMMITER_DATE,complexity,violations,development_cost,_merge
0,org.apache:batik,52fc76012c5f969145c39d3fb398a7c2c094474f,New repository initialized by cvs2svn.\n\ngit-...,No Author,2000-10-01T07:37:01Z,,,,left_only
1,org.apache:batik,b1ff4af6abfec32fc710d77795bb20a612a82126,Initial revision\n\n\ngit-svn-id: https://svn....,James Duncan Davidson,2000-10-01T07:37:01Z,,,,left_only
2,org.apache:batik,c8d7a13470987f892f7466d55c10e3cee34de31d,Update\nPR:\n\n\ngit-svn-id: https://svn.apach...,James Duncan Davidson,2000-10-01T07:40:39Z,,,,left_only
3,org.apache:batik,93a16402b48ae1cf70ea4bd030479170749bd10a,Added question line (more a test of list/forwa...,James Duncan Davidson,2000-10-01T08:15:04Z,,,,left_only
4,org.apache:batik,fcaecb541edc03f36b6ec7a792e97dfeabf26117,testing commit\n\n\ngit-svn-id: https://svn.ap...,Dean Jackson,2000-10-02T13:33:11Z,,,,left_only


In [51]:
db_merged = db_merged.sort_values(by=["PROJECT_ID","COMMITER_DATE"]).reset_index()

db_merged['inc_complexity'] = float("Nan")
db_merged['inc_violations'] = float("Nan")
db_merged['inc_development_cost'] = float("Nan")

db_merged.append
for i in range(1,db_merged.shape[0]):

    #first we make sure that both entries are from the same project (if not leave with the Nan value in the increment variable)
    if (db_merged['PROJECT_ID'][i] == db_merged['PROJECT_ID'][i-1]):

        for inc_variable in [["complexity",'inc_complexity'],["violations", "inc_violations"],["development_cost", "inc_development_cost"]]:
            variable_act = db_merged[inc_variable[0]][i] #value for the variable in the row i
            variable_past =  db_merged[inc_variable[0]][i-1] #value for the variable in the row before i
            
            if pd.notna(variable_act) and pd.notna(variable_past): #both entries available
                db_merged[inc_variable[1]][i] = variable_act - variable_past
            else:
                break

db_increases = db_merged[db_merged['inc_complexity'].notna() and db_merged['inc_violations'].notna() and db_merged['inc_development_cost'].notna()]

    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [67]:
clean_db_merged = db_merged.dropna() #we delete al the NAs in the table

final_db = clean_db_merged[["PROJECT_ID", "COMMIT_HASH", "COMMIT_MESSAGE", "AUTHOR", "COMMITER_DATE", "inc_complexity", "inc_violations", "inc_development_cost"]]
final_db.head()

Unnamed: 0,PROJECT_ID,COMMIT_HASH,COMMIT_MESSAGE,AUTHOR,COMMITER_DATE,inc_complexity,inc_violations,inc_development_cost
4,org.apache:archiva,4b1232d671e7c0aa44927ce2d4574d8cb751497d,add discovery module PR: MRM-9 git-svn-id: h...,Brett Porter,2005-11-29 01:43:41+00:00,0.0,0.0,0.0
5,org.apache:archiva,41c7634b39dadb0b6b2e6dbfee3f8828aace653c,add discovery module git-svn-id: https://svn...,Brett Porter,2005-11-29 01:45:24+00:00,0.0,0.0,180.0
6,org.apache:archiva,7f0762951b33206c1a7e0da8de9506d6bca96133,add test PR: MRM-9 git-svn-id: https://svn.a...,Brett Porter,2005-11-29 02:25:03+00:00,5.0,1.0,1530.0
7,org.apache:archiva,8e757bd2a0faec0732b512ea1d4df1e082aea6ff,add some more tests PR: MRM-9 git-svn-id: htt...,Brett Porter,2005-11-29 03:11:23+00:00,7.0,5.0,990.0
8,org.apache:archiva,9c694d371a9a9d3635dedb88b3ad90e8d450544e,complete default discoverer tests PR: MRM-9 g...,Brett Porter,2005-11-29 04:01:17+00:00,4.0,2.0,510.0


In [68]:
final_db.to_csv('predictionDB.csv', index='False') #export!