In [124]:
from pydriller import Repository
from pydriller.metrics.process.code_churn import CodeChurn
import datetime
import pandas as pd
import os

# Funções

In [125]:
def process_modified_files(my_repository, my_from_commit, my_to_commit):
    my_list = []
    a = datetime.datetime.now()
    print('Starting process of modified files of {} from {} to {}'.format(my_repository, my_from_commit, my_to_commit))
    for commit in Repository(my_repository, from_commit=my_from_commit, to_commit=my_to_commit).traverse_commits():
        for m in commit.modified_files:
            print(
                "Author {}".format(commit.author.name),
                " modified {}".format(m.filename),
                " path {}".format(m._old_path),
                " path {}".format(m._new_path),
                " with a change type of {}".format(m.change_type.name),
                " and the complexity is {}".format(m.complexity)
            )
            my_list.append((m.filename, m._old_path, m._new_path, m.complexity, m.change_type.name))
    b = datetime.datetime.now()
    delta = b-a
    print('Process complete in : {}'.format(delta))
    return my_list
    
def list_of_commits(my_repository):
    my_list = []
    i = 1
    for commit in Repository(my_repository).traverse_commits():
        my_list.append((i, commit.hash))
        i = i + 1
    return my_list

def boundary_commits(list_of_commits, from_index_commit, to_index_commit, my_version, my_sequence):
    from_commit, to_commit = list_of_commits[from_index_commit][1], list_of_commits[to_index_commit][1]
    print('v{}: {}, from commit: {}, to commit: {}'.format(my_sequence, my_version, from_commit, to_commit))
    return from_commit, to_commit

def process_code_churn(my_repository, my_from_commit, my_to_commit):
    a = datetime.datetime.now()
    print('Processing code churn from {} from {} to {} ...'.format(my_repository, my_from_commit, my_to_commit))
    my_metrics = CodeChurn(path_to_repo=my_repository,
                    from_commit=my_from_commit,
                    to_commit=my_to_commit)
    b = datetime.datetime.now()
    delta = b - a
    print('Complete in {}!'.format(delta))
    return my_metrics

def process_files_count_max_avg(my_metrics):
    files_count = my_metrics.count()                   
    files_max = my_metrics.max()
    files_avg = my_metrics.avg()
    print('Total code churn for each file: {}'.format(files_count))
    print('Maximum code churn for each file: {}'.format(files_max))
    print('Average code churn for each file: {}'.format(files_avg))
    return files_count, files_max, files_avg

def first_last_commit(list_of_commits, my_version, my_sequence):
    first_commit, last_commit = list_of_commits[0][1], list_of_commits[-1][1]
    print('v{}: {}, first commit: {}, last commit: {}'.format(my_sequence, my_version, first_commit, last_commit))
    return first_commit, last_commit

In [153]:
def createEmptyFile(path):
    with open(path, 'a') as file:
        os.utime(path, None)
    return file

def create_new_file_with_complexity(my_directory, my_new_file, my_list):
    a = datetime.datetime.now()
    print('Processing {}'.format(my_new_file))
    file_csv = my_directory + '/' + my_new_file + '.csv'
    if os.path.exists(file_csv):
        os.remove(file_csv)
    f_csv = createEmptyFile(file_csv)
    head = 'file,complexity' + '\n'
    f_csv = open(file_csv,'w')
    f_csv.write(head)
    f_csv.close()
    for each in my_list:
        my_path = None
        #m.filename, m._old_path, m._new_path, m.complexity, m.change_type.name
        if (each[1] is None) and (each[2] is None):
            continue
        if (each[1] is not None) and (each[2] is None):
            my_path = each[1]
        if (each[1] is None) and (each[2] is not None):
            my_path = each[2]
        if (my_path is not None): 
            my_complexity = each[3]
            with open(file_csv, 'a', encoding='utf-8') as f_csv:
                my_line = str(my_path) + ',' + str(my_complexity) + '\n'
                print(my_line)
                f_csv.write(my_line)
        print('File {}.csv updated in {} with success!'.format(my_new_file, my_directory))
    b = datetime.datetime.now()
    delta = b-a
    print('Complete as: {}'.format(delta))

def filter_only_java_file_with_complexity(my_directory, my_file, my_new_file):
    a = datetime.datetime.now()
    print('Processing {}'.format(my_file))
    new_file_csv = my_directory + '/' + my_new_file + '.csv'
    if os.path.exists(new_file_csv):
        os.remove(new_file_csv)
    f_csv = createEmptyFile(new_file_csv)
    head = 'file,complexity' + '\n'
    f_csv = open(new_file_csv,'w')
    f_csv.write(head)
    f_csv.close()
    my_file = my_directory + '/' + my_file + '.csv'
    with open(my_file, 'r', encoding='utf-8') as f_my_file:
        for line in f_my_file:
            line = str(line)
            if ('.java' in line):
                with open(new_file_csv, 'a', encoding='utf-8') as f_csv:
                    my_new_line = line
                    print(my_new_line)
                    print('File {}.csv updated in {} with success!'.format(my_file, my_directory))
                    f_csv.write(my_new_line)
    b = datetime.datetime.now()
    delta = b-a
    print('Complete as: {}'.format(delta))

def create_new_file_with_code_churn(my_directory, my_new_file, my_files_count):
    a = datetime.datetime.now()
    print('Processing {}'.format(my_new_file))
    file_csv = my_directory + '/' + my_new_file + '.csv'
    if os.path.exists(file_csv):
        os.remove(file_csv)
    f_csv = createEmptyFile(file_csv)
    head = 'file,codechurn' + '\n'
    f_csv = open(file_csv,'w')
    f_csv.write(head)
    f_csv.close()
    for key, value in my_files_count.items():
        if (key is not None) and (value is not None):
            temp = str(key)
            temp2 = str(value)
            with open(file_csv, 'a', encoding='utf-8') as f_csv:
                my_line = key + ',' + str(value) + '\n'
                print(my_line)
                f_csv.write(my_line)
        print('File {}.csv updated in {} with success!'.format(my_new_file, my_directory))
    b = datetime.datetime.now()
    delta = b-a
    print('Complete as: {}'.format(delta))

# 1. Dados dos repositórios analisados

In [136]:
hadoop_dir = '/Users/armandosoaressousa/git/hadoop'
hadoop_dir_testes_pydriller = '/Users/armandosoaressousa/git/hadoop' + '/' + 'testes' + '/' + 'pydriller' 

v1 = 'hadoop-branch-2.0.5'
v2 = 'hadoop-branch-3.0.0'
v3 = 'hadoop-branch-3.1'
v4 = 'hadoop-branch-3.2'

my_repository_v1 = hadoop_dir + '/' + v1
my_repository_v2 = hadoop_dir + '/' + v2
my_repository_v3 = hadoop_dir + '/' + v3
my_repository_v4 = hadoop_dir + '/' + v4
 
print('my_repository_v1: {}'.format(my_repository_v1))
print('my_repository_v2: {}'.format(my_repository_v2))
print('my_repository_v3: {}'.format(my_repository_v3))
print('my_repository_v4: {}'.format(my_repository_v4))

my_repository_v1: /Users/armandosoaressousa/git/hadoop/hadoop-branch-2.0.5
my_repository_v2: /Users/armandosoaressousa/git/hadoop/hadoop-branch-3.0.0
my_repository_v3: /Users/armandosoaressousa/git/hadoop/hadoop-branch-3.1
my_repository_v4: /Users/armandosoaressousa/git/hadoop/hadoop-branch-3.2


# 2. Lista dos commits de cada versão analisada

In [128]:
list_of_commits_v1 = list_of_commits(my_repository_v1)
list_of_commits_v2 = list_of_commits(my_repository_v2)
list_of_commits_v3 = list_of_commits(my_repository_v3)
list_of_commits_v4 = list_of_commits(my_repository_v4)

In [129]:
first_commit_v1, last_commit_v1 = first_last_commit(list_of_commits_v1, v1, 1)
first_commit_v2, last_commit_v2 = first_last_commit(list_of_commits_v2, v2, 2)
first_commit_v3, last_commit_v3 = first_last_commit(list_of_commits_v3, v3, 3)
first_commit_v4, last_commit_v4 = first_last_commit(list_of_commits_v4, v4, 4)

v1: hadoop-branch-2.0.5, first commit: 5128a9a453d64bfe1ed978cf9ffed27985eeef36, last commit: a882b627e3e70d8f62b23bfb5517f0aad832fb46
v2: hadoop-branch-3.0.0, first commit: 5128a9a453d64bfe1ed978cf9ffed27985eeef36, last commit: df2ae271ccfbd562f9132d5ba795e638d38cb04a
v3: hadoop-branch-3.1, first commit: 5128a9a453d64bfe1ed978cf9ffed27985eeef36, last commit: e0b84c45d10823e16c1d0d2f4fe108a1f7af87cf
v4: hadoop-branch-3.2, first commit: 5128a9a453d64bfe1ed978cf9ffed27985eeef36, last commit: bc97dd0d267dd7b970e18c577ffdb749b819e15d


In [138]:
print('Total of commits and diff of commits')
print('{}, {}, {}, {}'.format(len(list_of_commits_v1), len(list_of_commits_v2), len(list_of_commits_v3), len(list_of_commits_v4)))
print('{}, {}, {}'.format( (len(list_of_commits_v2) - len(list_of_commits_v1)) , (len(list_of_commits_v3) - len(list_of_commits_v2)), (len(list_of_commits_v4) - len(list_of_commits_v3)) ))

Total of commits and diff of commits
3847, 17119, 19896, 21586
13272, 2777, 1690


In [131]:
list_of_commits = list_of_commits_v1
from_index_commit = 0
to_index_commit = len(list_of_commits_v1)-1
my_version = v1
my_sequence = 1
from_commit_v1, to_commit_v1 = boundary_commits(list_of_commits, from_index_commit, to_index_commit, my_version, my_sequence)

list_of_commits = list_of_commits_v2
from_index_commit = len(list_of_commits_v1)
to_index_commit = len(list_of_commits_v2)-1
my_version = v2
my_sequence = 2
from_commit_v2, to_commit_v2 = boundary_commits(list_of_commits, from_index_commit, to_index_commit, my_version, my_sequence)

list_of_commits = list_of_commits_v3
from_index_commit = len(list_of_commits_v2)
to_index_commit = len(list_of_commits_v3)-1
my_version = v3
my_sequence = 3
from_commit_v3, to_commit_v3 = boundary_commits(list_of_commits, from_index_commit, to_index_commit, my_version, my_sequence)

list_of_commits = list_of_commits_v4
from_index_commit = len(list_of_commits_v3)
to_index_commit = len(list_of_commits_v4)-1
my_version = v4
my_sequence = 4
from_commit_v4, to_commit_v4 = boundary_commits(list_of_commits, from_index_commit, to_index_commit, my_version, my_sequence)

v1: hadoop-branch-2.0.5, from commit: 5128a9a453d64bfe1ed978cf9ffed27985eeef36, to commit: a882b627e3e70d8f62b23bfb5517f0aad832fb46
v2: hadoop-branch-3.0.0, from commit: 73e9366510d9778f8aa829d72adc14cca669dc5e, to commit: df2ae271ccfbd562f9132d5ba795e638d38cb04a
v3: hadoop-branch-3.1, from commit: d00b6f7c1ff2d7569ae9efdc6823ebcfb86ef2d4, to commit: e0b84c45d10823e16c1d0d2f4fe108a1f7af87cf
v4: hadoop-branch-3.2, from commit: eb0b5a844f960017f6f48d746174d0f5826f0e5f, to commit: bc97dd0d267dd7b970e18c577ffdb749b819e15d


# 3. Análise da versão v1 (hadoop-branch-2.0.5)

## 3.1 Lista dos arquivos modificados e respectiva complexidade para os arquivos de v1

In [96]:
list_of_modified_files_v1 = process_modified_files(my_repository_v1, list_of_commits_v1[0][1], list_of_commits_v1[-1][1])

he/hadoop/hdfs/NameNodeProxies.java  with a change type of MODIFY  and the complexity is 39
Author Konstantin Boudnik  modified CHANGES.txt  path hadoop-common-project/hadoop-common/CHANGES.txt  path hadoop-common-project/hadoop-common/CHANGES.txt  with a change type of MODIFY  and the complexity is None
Author Konstantin Boudnik  modified hadoop-daemon.sh  path hadoop-common-project/hadoop-common/src/main/bin/hadoop-daemon.sh  path hadoop-common-project/hadoop-common/src/main/bin/hadoop-daemon.sh  with a change type of MODIFY  and the complexity is None
Author Konstantin Boudnik  modified yarn-daemon.sh  path hadoop-yarn-project/hadoop-yarn/bin/yarn-daemon.sh  path hadoop-yarn-project/hadoop-yarn/bin/yarn-daemon.sh  with a change type of MODIFY  and the complexity is None
Author Vinod Kumar Vavilapalli  modified CHANGES.txt  path hadoop-mapreduce-project/CHANGES.txt  path hadoop-mapreduce-project/CHANGES.txt  with a change type of MODIFY  and the complexity is None
Author Vinod Kumar 

## 3.2 Criar arquivo csv para complexidade dos arquivos de v1

In [139]:
create_new_file_with_complexity(hadoop_dir_testes_pydriller, 'hadoop-branch-2.0.5-ccn', list_of_modified_files_v1)

pdated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/io/compress/bzip2/org_apache_hadoop_io_compress_bzip2.h,None

File hadoop-hdfs-2.0.5-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/io/compress/bzip2/org_apache_hadoop_io_compress_bzip2.h,None

File hadoop-hdfs-2.0.5-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/io/compress/bzip2/org_apache_hadoop_io_compress_bzip2.h,None

File hadoop-hdfs-2.0.5-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/io/compress/bzip2/org_apache_hadoop_io_compress_bzip2.h,None

File hadoop-hdfs-2.0.5-ccn.csv updated in /Users

## 3.3 Filtrar apenas os arquivos .java de v1

In [143]:
filter_only_java_file_with_complexity(hadoop_dir_testes_pydriller, 'hadoop-branch-2.0.5-ccn', 'hadoop-branch-2.0.5-ccn-java')

l/TestClassUtil.java,None

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-hdfs-2.0.5-ccn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestClassUtil.java,8

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-hdfs-2.0.5-ccn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/webapp/view/TestInfoBlock.java,6

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-hdfs-2.0.5-ccn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/webapp/view/TestInfoBlock.java,None

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-hdfs-2.0.5-ccn.csv.csv updated in /Users/armandosoaressousa/g

## 3.4 Lista de métricas de code churn para os arquivos de v1

In [97]:
my_metrics_v1 = process_code_churn(my_repository_v1, list_of_commits_v1[0][1], list_of_commits_v1[-1][1])

Processing code churn from /Users/armandosoaressousa/git/hadoop/hadoop-branch-2.0.5 from 5128a9a453d64bfe1ed978cf9ffed27985eeef36 to a882b627e3e70d8f62b23bfb5517f0aad832fb46 ...
Complete in 0:03:29.862813!


In [144]:
files_count_v1, files_max_v1, files_avg_v1 = process_files_count_max_avg(my_metrics_v1)

rc/test/hdfs-with-mr/org/apache/hadoop/hdfs/NNBench.java': 964, 'src/test/hdfs-with-mr/org/apache/hadoop/hdfs/NNBenchWithoutMR.java': 344, 'src/test/hdfs-with-mr/org/apache/hadoop/io/FileBench.java': 603, 'src/test/hdfs-with-mr/org/apache/hadoop/io/TestSequenceFileMergeProgress.java': 98, 'src/test/hdfs-with-mr/org/apache/hadoop/ipc/TestSocketFactory.java': 197, 'src/test/hdfs-with-mr/org/apache/hadoop/security/authorize/TestServiceLevelAuthorization.java': 152, 'src/test/hdfs-with-mr/org/apache/hadoop/test/AllTestDriver.java': 46, 'src/test/hdfs-with-mr/org/apache/hadoop/test/HdfsWithMRTestDriver.java': 75, 'src/test/hdfs-with-mr/org/apache/hadoop/tools/TestDistCh.java': 221, 'src/test/mapred-site.xml': 18, 'src/native/INSTALL': 236, 'src/native/config/config.guess': 1477, 'src/native/config/config.sub': 1566, 'src/native/config/depcomp': 530, 'src/native/config/install-sh': 323, 'src/native/config/ltmain.sh': 6971, 'src/native/config/missing': 360, 'src/native/lib/Makefile.in': 424, 

## 3.5 Total code churn for each file in v1

In [145]:
for key, value in files_count_v1.items():
    print(key, value)

.templates/.project 27
.eclipse.templates/README.txt 6
src/contrib/eclipse-plugin/build.xml 80
src/java/org/apache/hadoop/filecache/DistributedCache.java 862
src/java/org/apache/hadoop/util/ProcessTree.java 313
src/java/org/apache/hadoop/util/ProcfsBasedProcessTree.java 526
src/test/core/org/apache/hadoop/util/TestProcfsBasedProcessTree.java 468
src/webapps/static/hadoop-logo.jpg 0
lib/commons-cli-2.0-SNAPSHOT.jar 0
lib/hsqldb-1.8.0.10.jar 0
lib/kfs-0.2.2.jar 0
src/test/core/org/apache/hadoop/security/TestAccessToken.java 89
src/test/lib/ftplet-api-1.0.0-SNAPSHOT.jar 0
src/test/lib/ftpserver-core-1.0.0-SNAPSHOT.jar 0
src/test/lib/ftpserver-server-1.0.0-SNAPSHOT.jar 0
src/test/lib/mina-core-2.0.0-M2-20080407.124109-12.jar 0
src/test/hdfs-site.xml 9
src/test/hdfs-with-mr/org/apache/hadoop/fs/AccumulatingReducer.java 103
src/test/hdfs-with-mr/org/apache/hadoop/fs/DFSCIOTest.java 551
src/test/hdfs-with-mr/org/apache/hadoop/fs/DistributedFSCheck.java 353
src/test/hdfs-with-mr/org/apache/had

## 3.6 Criar arquivo csv para code churn dos arquivos de v1

In [149]:
create_new_file_with_code_churn(hadoop_dir_testes_pydriller, 'hadoop-branch-2.0.5-churn', files_count_v1)

urn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
src/test/org/apache/hadoop/cli/util/CommandExecutor.java,111

File hadoop-hdfs-2.0.5-churn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
src/test/org/apache/hadoop/cli/util/ComparatorBase.java,39

File hadoop-hdfs-2.0.5-churn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
src/test/org/apache/hadoop/cli/util/ComparatorData.java,106

File hadoop-hdfs-2.0.5-churn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
src/test/org/apache/hadoop/cli/util/ExactComparator.java,34

File hadoop-hdfs-2.0.5-churn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
src/test/org/apache/hadoop/cli/util/RegexpAcrossOutputComparator.java,39

File hadoop-hdfs-2.0.5-churn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
src/test/org/apache/hadoop/cli/util

## 3.6.1 Filtrar apenas arquivos .java

In [150]:
filter_only_java_file_with_complexity(hadoop_dir_testes_pydriller, 'hadoop-branch-2.0.5-churn', 'hadoop-branch-2.0.5-churn-java')

/Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
src/test/org/apache/hadoop/fs/TestPath.java,152

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-hdfs-2.0.5-churn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
src/test/org/apache/hadoop/fs/TestTrash.java,313

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-hdfs-2.0.5-churn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
src/test/org/apache/hadoop/fs/TestTruncatedInputBug.java,109

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-hdfs-2.0.5-churn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
src/test/org/apache/hadoop/fs/kfs/KFSEmulationImpl.java,150

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-hdfs-2.0.5-churn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
src/test/org/apache/hadoop/fs/

# 4 Análise da versão v2 (hadoop-branch-3.0.0)

## 4.1 Lista dos arquivos modificados e respectiva complexidade para os arquivos de v2

In [151]:
list_of_modified_files_v2 = process_modified_files(my_repository_v2, from_commit_v2, to_commit_v2)

MODIFY  and the complexity is 31
Author Chris Douglas  modified GenericTestUtils.java  path hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/GenericTestUtils.java  path hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/GenericTestUtils.java  with a change type of MODIFY  and the complexity is 98
Author Chris Douglas  modified BlockManager.java  path hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java  path hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java  with a change type of MODIFY  and the complexity is 776
Author Chris Douglas  modified PendingRecoveryBlocks.java  path hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingRecoveryBlocks.java  path None  with a change type of DELETE  and the complexity is None
Author Chris Douglas  modified FSNamesystem.java  path hadoop-hdfs

## 4.2 Criar arquivo csv para complexidade dos arquivos de v2

In [155]:
create_new_file_with_complexity(hadoop_dir_testes_pydriller, 'hadoop-branch-3.0.0-ccn', list_of_modified_files_v2)

cn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
File hadoop-branch-3.0.0-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
File hadoop-branch-3.0.0-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
File hadoop-branch-3.0.0-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
File hadoop-branch-3.0.0-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
File hadoop-branch-3.0.0-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
File hadoop-branch-3.0.0-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
File hadoop-branch-3.0.0-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
File hadoop-branch-3.0.0-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
File hadoop-branch

## 4.3 Filtrar apenas os arquivos .java de v2

In [156]:
filter_only_java_file_with_complexity(hadoop_dir_testes_pydriller, 'hadoop-branch-3.0.0-ccn', 'hadoop-branch-3.0.0-ccn-java')

s/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.0.0-ccn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-tools/hadoop-resourceestimator/src/test/java/org/apache/hadoop/resourceestimator/service/TestResourceEstimatorService.java,13

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.0.0-ccn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-tools/hadoop-resourceestimator/src/test/java/org/apache/hadoop/resourceestimator/skylinestore/impl/TestInMemoryStore.java,1

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.0.0-ccn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-tools/hadoop-resourceestimator/src/test/java/org/apache/hadoop/resourceestimator/skylinestore/impl/TestSkylineStore.java,35

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.0.0-ccn.csv.csv updated in 

## 4.4 Lista de métricas de code churn para os arquivos de v2

In [157]:
my_metrics_v2 = process_code_churn(my_repository_v2, from_commit_v2, to_commit_v2)

Processing code churn from /Users/armandosoaressousa/git/hadoop/hadoop-branch-3.0.0 from 73e9366510d9778f8aa829d72adc14cca669dc5e to df2ae271ccfbd562f9132d5ba795e638d38cb04a ...
Complete in 1:06:28.693052!


In [158]:
files_count_v2, files_max_v2, files_avg_v2 = process_files_count_max_avg(my_metrics_v2)

c/main/resources/webapps/static/jquery/themes-1.8.16/ui-lightness/images/ui-icons_ffffff_256x240.png': 0, 'hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/webapps/static/jquery/themes-1.8.16/vader/images/ui-bg_flat_0_aaaaaa_40x100.png': 0, 'hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/webapps/static/jquery/themes-1.8.16/vader/images/ui-bg_glass_95_fef1ec_1x400.png': 0, 'hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/webapps/static/jquery/themes-1.8.16/vader/images/ui-bg_gloss-wave_16_121212_500x100.png': 0, 'hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/webapps/static/jquery/themes-1.8.16/vader/images/ui-bg_highlight-hard_15_888888_1x100.png': 0, 'hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/webapps/static/jquery/themes-1.8.16/vader/images/ui-bg_highlight-hard_55_555555_1x100.png': 0, 'hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/webapps/static

## 4.5 Total code churn for each file in v2

In [159]:
for key, value in files_count_v2.items():
    print(key, value)

6/ui-lightness/images/ui-bg_highlight-soft_75_ffe45c_1x100.png 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/webapps/static/jquery/themes-1.8.16/ui-lightness/images/ui-icons_222222_256x240.png 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/webapps/static/jquery/themes-1.8.16/ui-lightness/images/ui-icons_228ef1_256x240.png 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/webapps/static/jquery/themes-1.8.16/ui-lightness/images/ui-icons_ef8c08_256x240.png 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/webapps/static/jquery/themes-1.8.16/ui-lightness/images/ui-icons_ffd27a_256x240.png 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/webapps/static/jquery/themes-1.8.16/ui-lightness/images/ui-icons_ffffff_256x240.png 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/webapps/static/jquery/themes-1.8.16/vader/images/ui-bg_flat_0_aaaaaa_40x100.png 0
had

# 4.6 Criar arquivo csv para code churn dos arquivos de v2

In [160]:
create_new_file_with_code_churn(hadoop_dir_testes_pydriller, 'hadoop-branch-3.0.0-churn', files_count_v2)

n /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-mapreduce-project/src/contrib/eclipse-plugin/resources/map16x16.gif,0

File hadoop-branch-3.0.0-churn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-mapreduce-project/src/contrib/eclipse-plugin/resources/mapper16.png,0

File hadoop-branch-3.0.0-churn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-mapreduce-project/src/contrib/eclipse-plugin/resources/mapwiz.png,0

File hadoop-branch-3.0.0-churn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-mapreduce-project/src/contrib/eclipse-plugin/resources/new-folder.png,0

File hadoop-branch-3.0.0-churn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-mapreduce-project/src/contrib/eclipse-plugin/resources/projwiz.png,0

File hadoop-branch-3.0.0-churn.csv updated in /Users/armandosoaressousa/git/hadoop/te

## 4.6.1 Filtrar apenas arquivos .java

In [161]:
filter_only_java_file_with_complexity(hadoop_dir_testes_pydriller, 'hadoop-branch-3.0.0-churn', 'hadoop-branch-3.0.0-churn-java')

ith success!
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/util/TestProcfsBasedProcessTree.java,0

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.0.0-churn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/partition/RehashPartitioner.java,48

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.0.0-churn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/partition/TestRehashPartitioner.java,71

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.0.0-churn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydr

# 5.1 Lista dos arquivos modifica dos e respectiva complexidade para os arquivos de v3

In [162]:
list_of_modified_files_v3 = process_modified_files(my_repository_v3, from_commit_v3, to_commit_v3)

hadoop/yarn/server/resourcemanager/webapp/MetricsOverviewTable.java  with a change type of MODIFY  and the complexity is 6
Author Eric Badger  modified ClusterMetricsInfo.java  path hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/ClusterMetricsInfo.java  path hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/ClusterMetricsInfo.java  with a change type of MODIFY  and the complexity is 71
Author Eric Badger  modified TestNodesPage.java  path hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestNodesPage.java  path hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestNodes

# 5.2 Criar arquivo csv para complexidade dos arquivos de v3

In [163]:
create_new_file_with_complexity(hadoop_dir_testes_pydriller, 'hadoop-branch-3.1-ccn', list_of_modified_files_v3)


File hadoop-branch-3.1-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
File hadoop-branch-3.1-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
File hadoop-branch-3.1-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-common-project/hadoop-common/src/site/markdown/release/2.9.2/CHANGELOG.2.9.2.md,None

File hadoop-branch-3.1-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-common-project/hadoop-common/src/site/markdown/release/2.9.2/RELEASENOTES.2.9.2.md,None

File hadoop-branch-3.1-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
File hadoop-branch-3.1-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
File hadoop-branch-3.1-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
File hadoop-branch-3.1-ccn.csv updated i

# 5.3 Filtrar apenas os arquivos .java de v3

In [164]:
# 5.3 Filtrar apenas os arquivos .java de v3
filter_only_java_file_with_complexity(hadoop_dir_testes_pydriller, 'hadoop-branch-3.1-ccn', 'hadoop-branch-3.1-ccn-java')

1-ccn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockPlacementPolicyDebugLoggingBuilder.java,3

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.1-ccn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/DynamoDBLocalClientFactory.java,None

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.1-ccn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestDynamoDBMetadataStore.java,None

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.1-ccn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-tools/hadoop-aws/src/test/java/org/

# 5.4 Lista de métricas de code churn para os arquivos de v3

In [165]:
my_metrics_v3 = process_code_churn(my_repository_v3, from_commit_v3, to_commit_v3)

Processing code churn from /Users/armandosoaressousa/git/hadoop/hadoop-branch-3.1 from d00b6f7c1ff2d7569ae9efdc6823ebcfb86ef2d4 to e0b84c45d10823e16c1d0d2f4fe108a1f7af87cf ...
Complete in 0:06:19.331503!


In [166]:
files_count_v3, files_max_v3, files_avg_v3 = process_files_count_max_avg(my_metrics_v3)

der/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/web/HttpCacheHeaders.java': 35, 'hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/web/SliderAMController.java': 69, 'hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/web/SliderDefaultWrapperServlet.java': 48, 'hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/web/layout/AppLayout.java': 32, 'hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/web/layout/ClusterSpecificationView.java': 32, 'hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/se

# 5.5 Total code churn for each file in v3

In [167]:
for key, value in files_count_v3.items():
    print(key, value)

java 75
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/state/SimpleReleaseSelector.java 33
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/web/AgentService.java 37
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/web/HttpCacheHeaders.java 35
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/web/SliderAMController.java 69
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/web/SliderDefaultWrapperServlet.java 48
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yar

# 5.6 Criar arquivo csv para code churn dos arquivos de v3

In [168]:
create_new_file_with_code_churn(hadoop_dir_testes_pydriller, 'hadoop-branch-3.1-churn', files_count_v3)

d in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/web/rest/application/resources/LiveStatisticsRefresher.java,39

File hadoop-branch-3.1-churn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/web/rest/application/resources/ResourceRefresher.java,31

File hadoop-branch-3.1-churn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/web/rest/application/resources/ResourceSnapshotRefresher.java,40

File hadoop-branch-3.1-churn.csv updated in /Users/armandosoaressous

## 5.6.1 Filtrar apenas arquivos .java v3


In [169]:
filter_only_java_file_with_complexity(hadoop_dir_testes_pydriller, 'hadoop-branch-3.1-churn', 'hadoop-branch-3.1-churn-java')

resources/ContentCache.java,67

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.1-churn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/web/rest/application/resources/LiveComponentsRefresher.java,39

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.1-churn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/web/rest/application/resources/LiveContainersRefresher.java,52

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.1-churn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-yarn-project/hadoop-yarn/

# 6.1 Lista dos arquivos modifica dos e respectiva complexidade para os arquivos de v4

In [170]:
list_of_modified_files_v4 = process_modified_files(my_repository_v4, from_commit_v4, to_commit_v4)

java  with a change type of ADD  and the complexity is 1
Author Wei-Chiu Chuang  modified Metrics.md  path hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md  path hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md  with a change type of MODIFY  and the complexity is None
Author Wei-Chiu Chuang  modified TestCoderBase.java  path hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/erasurecode/TestCoderBase.java  path hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/erasurecode/TestCoderBase.java  with a change type of MODIFY  and the complexity is 69
Author Wei-Chiu Chuang  modified TestDecodingValidator.java  path None  path hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/erasurecode/rawcoder/TestDecodingValidator.java  with a change type of ADD  and the complexity is 19
Author Wei-Chiu Chuang  modified TestRawCoderBase.java  path hadoop-common-project/hadoop-common/src/test/java/org/apache/hado

# 6.2 Criar arquivo csv para complexidade dos arquivos de v4

In [171]:
create_new_file_with_complexity(hadoop_dir_testes_pydriller, 'hadoop-branch-3.2-ccn', list_of_modified_files_v4)

essousa/git/hadoop/testes/pydriller with success!
File hadoop-branch-3.2-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
File hadoop-branch-3.2-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
File hadoop-branch-3.2-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
File hadoop-branch-3.2-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
File hadoop-branch-3.2-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
File hadoop-branch-3.2-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
File hadoop-branch-3.2-ccn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/security/IntermediateEncryptedStream.java,9

File hadoop-bra

# 6.3 Filtrar apenas os arquivos .java de v4

In [172]:
# 6.3 Filtrar apenas os arquivos .java de v4
filter_only_java_file_with_complexity(hadoop_dir_testes_pydriller, 'hadoop-branch-3.2-ccn', 'hadoop-branch-3.2-ccn-java')

oaressousa/git/hadoop/testes/pydriller with success!
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/viewfs/MountTableConfigLoader.java,0

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.2-ccn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/viewfs/package-info.java,0

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.2-ccn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/viewfs/TestHCFSMountTableConfigLoader.java,8

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.2-ccn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/viewfs/TestViewFSOverloadSchemeCentralMountTabl

# 6.4 Lista de métricas de code churn para os arquivos de v4

In [173]:
my_metrics_v4 = process_code_churn(my_repository_v4, from_commit_v4, to_commit_v4)

Processing code churn from /Users/armandosoaressousa/git/hadoop/hadoop-branch-3.2 from eb0b5a844f960017f6f48d746174d0f5826f0e5f to bc97dd0d267dd7b970e18c577ffdb749b819e15d ...
Complete in 0:03:38.949990!


In [174]:
files_count_v4, files_max_v4, files_avg_v4 = process_files_count_max_avg(my_metrics_v4)

p-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/store/driver/TestStateStoreZK.java': 53, 'hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/PeerCache.java': 0, 'hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/client/impl/BlockReaderLocalLegacy.java': 0, 'hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/shortcircuit/ShortCircuitReplica.java': 0, 'hadoop-hdfs-project/hadoop-hdfs-nfs/src/main/java/org/apache/hadoop/hdfs/nfs/nfs3/OpenFileCtx.java': 0, 'hadoop-hdfs-project/hadoop-hdfs-nfs/src/main/java/org/apache/hadoop/hdfs/nfs/nfs3/OpenFileCtxCache.java': 0, 'hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/HdfsDtFetcher.java': 1, 'hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/SWebHdfsDtFetcher.java': 1, 'hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/WebHdfsDtFetcher.java': 1, 'hadoop-hdfs-project/hadoop-hdfs/src/main/java/org

# 6.5 Total code churn for each file in v4

In [175]:
for key, value in files_count_v4.items():
    print(key, value)

dds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/HddsTestUtils.java 85
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/LocalMetadataStore.java -14
hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/io/OzoneInputStream.java 5
hadoop-ozone/client/src/main/java/org/apache/hadoop/ozone/client/rpc/RpcClient.java -1
hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestKeyDeletingService.java 37
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/curator/ZKCuratorManager.java 6
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/curator/TestZKCuratorManager.java 23
hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/store/driver/TestStateStoreZK.java 53
hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/PeerCache.java 0
hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/client/impl/BlockReaderLocalLega

# 6.6 Criar arquivo csv para code churn dos arquivos de v4

In [176]:
create_new_file_with_code_churn(hadoop_dir_testes_pydriller, 'hadoop-branch-3.2-churn', files_count_v4)

,1

File hadoop-branch-3.2-churn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestParallelReadUtil.java,0

File hadoop-branch-3.2-churn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestPipelines.java,0

File hadoop-branch-3.2-churn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestReadStripedFileWithMissingBlocks.java,0

File hadoop-branch-3.2-churn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestReplaceDatanodeFailureReplication.java,0

File hadoop-branch-3.2-churn.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-hdfs-projec

# 6.6.1 Filtrar apenas arquivos .java v4

In [177]:
filter_only_java_file_with_complexity(hadoop_dir_testes_pydriller, 'hadoop-branch-3.2-churn', 'hadoop-branch-3.2-churn-java')

tes/pydriller/hadoop-branch-3.2-churn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestCorruptReplicaInfo.java,0

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.2-churn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestSequentialBlockGroupId.java,0

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.2-churn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with success!
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestSequentialBlockId.java,1

File /Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.2-churn.csv.csv updated in /Users/armandosoaressousa/git/hadoop/testes/pydriller with su

# Variaveis auxiliares

In [186]:
hadoop_dir_testes_pydriller = '/Users/armandosoaressousa/git/hadoop' + '/' + 'testes' + '/' + 'pydriller' 

v1 = 'hadoop-branch-2.0.5'
v2 = 'hadoop-branch-3.0.0'
v3 = 'hadoop-branch-3.1'
v4 = 'hadoop-branch-3.2'

ccn_java = '-ccn-java'
churn_java = '-churn-java'
csv = '.csv'

ccn_java_files_v1 = hadoop_dir_testes_pydriller + '/' + v1 + ccn_java + csv
ccn_java_files_v2 = hadoop_dir_testes_pydriller + '/' + v2  + ccn_java + csv
ccn_java_files_v3 = hadoop_dir_testes_pydriller + '/' + v3  + ccn_java + csv
ccn_java_files_v4 = hadoop_dir_testes_pydriller + '/' + v4  + ccn_java + csv

churn_java_files_v1 = hadoop_dir_testes_pydriller + '/' + v1  + churn_java + csv
churn_java_files_v2 = hadoop_dir_testes_pydriller + '/' + v2  + churn_java + csv
churn_java_files_v3 = hadoop_dir_testes_pydriller + '/' + v3  + churn_java + csv
churn_java_files_v4 = hadoop_dir_testes_pydriller + '/' + v4  + churn_java + csv

print(ccn_java_files_v1)
print(churn_java_files_v1)
print(ccn_java_files_v2)
print(churn_java_files_v2)
print(ccn_java_files_v3)
print(churn_java_files_v3)
print(ccn_java_files_v4)
print(churn_java_files_v4)

/Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-2.0.5-ccn-java.csv
/Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-2.0.5-churn-java.csv
/Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.0.0-ccn-java.csv
/Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.0.0-churn-java.csv
/Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.1-ccn-java.csv
/Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.1-churn-java.csv
/Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.2-ccn-java.csv
/Users/armandosoaressousa/git/hadoop/testes/pydriller/hadoop-branch-3.2-churn-java.csv


# Dataframes with Cyclomatic Complexity

In [190]:
pd.set_option('display.max_colwidth',255)

df_hadoop_ccn_java_v1 = pd.read_csv(ccn_java_files_v1)
df_hadoop_ccn_java_v2 = pd.read_csv(ccn_java_files_v2)
df_hadoop_ccn_java_v3 = pd.read_csv(ccn_java_files_v3)
df_hadoop_ccn_java_v4 = pd.read_csv(ccn_java_files_v4)

# Dataframes with Code Churn

In [193]:
df_hadoop_churn_java_v1 = pd.read_csv(churn_java_files_v1)
df_hadoop_churn_java_v2 = pd.read_csv(churn_java_files_v2)
df_hadoop_churn_java_v3 = pd.read_csv(churn_java_files_v3)
df_hadoop_churn_java_v4 = pd.read_csv(churn_java_files_v4)

# Dataframes with Cyclomatic Complexity and Code Churn and Complexity X Churn 

In [196]:
df_hadoop_ccn_java_v1['codechurn'] = 0
df_hadoop_ccn_java_v2['codechurn'] = 0
df_hadoop_ccn_java_v3['codechurn'] = 0
df_hadoop_ccn_java_v4['codechurn'] = 0

# Daframe v1

In [202]:
df_hadoop_ccn_java_v1.head(3)

Unnamed: 0,file,complexity,codechurn
0,src/java/org/apache/hadoop/HadoopVersionAnnotation.java,0,0
1,src/java/org/apache/hadoop/conf/Configurable.java,0,0
2,src/java/org/apache/hadoop/conf/Configuration.java,185,0


In [203]:
print('processando dataframes ccn e churn ...')
a = datetime.datetime.now()
for i in range(0, df_hadoop_ccn_java_v1.shape[0]):
    valor_item_df1 = df_hadoop_ccn_java_v1.iloc[i,0]
    for j in range(0, df_hadoop_churn_java_v1.shape[0]):
        valor_item_df2 = df_hadoop_churn_java_v1.iloc[j, 0]
        valor_codechurn_df2 = df_hadoop_churn_java_v1.iloc[j, 1]
        if (valor_item_df1 == valor_item_df2):
            print('{} {} {}'.format(i, valor_item_df1, valor_codechurn_df2))
            df_hadoop_ccn_java_v1.iloc[i,2] = valor_codechurn_df2
b = datetime.datetime.now()
delta = b-a
print('Tempo de processamento: {}'.format(delta))

c/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/UpdatedContainerInfo.java 45
30174 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/UpdatedContainerInfo.java 45
30175 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/UpdatedContainerInfo.java 45
30176 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/UpdatedContainerInfo.java 45
30177 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/UpdatedContainerInfo.java 45
30178 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rm

In [216]:
df_hadoop_ccn_java_v1.replace(to_replace=['None'], value=0, inplace=True)

In [218]:
df_hadoop_ccn_java_v1['complexity'] = pd.to_numeric(df_hadoop_ccn_java_v1['complexity'])

In [220]:
df_hadoop_ccn_java_v1['ccXchurn'] = 0

In [221]:
df_hadoop_ccn_java_v1['ccXchurn'] = df_hadoop_ccn_java_v1['complexity'] * df_hadoop_ccn_java_v1['codechurn']

In [225]:
df_hadoop_ccn_java_v1.sort_index(by=['ccXchurn'], ascending=False, inplace=True)
df_hadoop_ccn_java_v1.head(10)



Unnamed: 0,file,complexity,codechurn,ccXchurn
5853,mapreduce/src/java/org/apache/hadoop/mapred/TaskTracker.java,570,4225,2408250
7783,mapreduce/src/java/org/apache/hadoop/mapred/JobInProgress.java.orig,571,3729,2129259
7781,mapreduce/src/java/org/apache/hadoop/mapred/JobInProgress.java.orig,261,3729,973269
7780,mapreduce/src/java/org/apache/hadoop/mapred/JobInProgress.java.orig,207,3729,771903
19303,hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/protobuf/HadoopRpcProtos.java,349,1963,685087
19424,hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/protobuf/HadoopRpcProtos.java,327,1963,641901
15602,hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java,468,1086,508248
19430,hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/protobuf/HadoopRpcProtos.java,252,1963,494676
19305,hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/protobuf/TestProtos.java,292,1525,445300
16310,hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TestTaskImpl.java,573,764,437772


In [321]:
df_hadoop_ccn_java_v1.to_csv('data/hadoop-branch-2.0.5-ccxchurn.csv', index=False)

# Dataframe V2

In [226]:
df_hadoop_ccn_java_v2.head(3)

Unnamed: 0,file,complexity,codechurn
0,hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/protocol/datatransfer/TestPacketReceiver.java,4,0
1,hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestCompletedTask.java,1,0
2,dev-support/cmake-maven-ng-plugin/src/main/java/org/apache/hadoop/cmake/maven/ng/CompileMojo.java,25,0


In [229]:
print('processando dataframes ccn e churn ...')
a = datetime.datetime.now()
for i in range(0, df_hadoop_ccn_java_v2.shape[0]):
    valor_item_df1 = df_hadoop_ccn_java_v2.iloc[i,0]
    for j in range(0, df_hadoop_churn_java_v2.shape[0]):
        valor_item_df2 = df_hadoop_churn_java_v2.iloc[j, 0]
        valor_codechurn_df2 = df_hadoop_churn_java_v2.iloc[j, 1]
        if (valor_item_df1 == valor_item_df2):
            print('{} {} {}'.format(i, valor_item_df1, valor_codechurn_df2))
            df_hadoop_ccn_java_v2.iloc[i,2] = valor_codechurn_df2
b = datetime.datetime.now()
delta = b-a
print('Tempo de processamento: {}'.format(delta))

e/hadoop/yarn/server/resourcemanager/scheduler/capacity/conf/LeveldbConfigurationStore.java 322
6309 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/conf/ZKConfigurationStore.java 236
6310 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/conf/ConfigurationStoreBaseTest.java 96
6311 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/conf/TestInMemoryConfigurationStore.java 30
6312 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/conf/TestYarnConfigurationStore.java 71
6313 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-ya

In [231]:
df_hadoop_ccn_java_v2.replace(to_replace=['None'], value=0, inplace=True)

In [232]:
df_hadoop_ccn_java_v2['complexity'] = pd.to_numeric(df_hadoop_ccn_java_v2['complexity'])

In [233]:
df_hadoop_ccn_java_v2['ccXchurn'] = 0

In [234]:
df_hadoop_ccn_java_v2['ccXchurn'] = df_hadoop_ccn_java_v2['complexity'] * df_hadoop_ccn_java_v2['codechurn']

In [235]:
df_hadoop_ccn_java_v2.sort_index(by=['ccXchurn'], ascending=False, inplace=True)
df_hadoop_ccn_java_v2.head(10)



Unnamed: 0,file,complexity,codechurn,ccXchurn
5695,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/hamlet2/Hamlet.java,5589,30557,170783073
2100,hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azure/AzureNativeFileSystemStore.java,244,2941,717604
2105,hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azure/NativeAzureFileSystem.java,212,3285,696420
997,hadoop-hdfs-project/hadoop-hdfs-nfs/src/main/java/org/apache/hadoop/hdfs/nfs/nfs3/RpcProgramNfs3.java,251,2320,582320
2619,hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java,155,2941,455855
4132,hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/OfflineImageReconstructor.java,245,1848,452760
3377,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-applicationhistoryservice/src/main/java/org/apache/hadoop/yarn/server/timeline/RollingLevelDBTimelineStore.java,227,1828,414956
1247,hadoop-tools/hadoop-openstack/src/main/java/org/apache/hadoop/fs/swift/http/SwiftRestClient.java,167,1878,313626
4067,hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/FoldedTreeSet.java,244,1285,313540
5188,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase-tests/src/test/java/org/apache/hadoop/yarn/server/timelineservice/storage/TestHBaseTimelineStorageApps.java,138,1941,267858


In [313]:
df_hadoop_ccn_java_v2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6451 entries, 5695 to 3251
Data columns (total 4 columns):
file          6451 non-null object
complexity    6451 non-null int64
codechurn     6451 non-null int64
ccXchurn      6451 non-null int64
dtypes: int64(3), object(1)
memory usage: 252.0+ KB


In [314]:
df_hadoop_ccn_java_v2.describe()

Unnamed: 0,complexity,codechurn,ccXchurn
count,6451.0,6451.0,6451.0
mean,11.437762,163.535421,30931.88
std,71.838091,454.407824,2126402.0
min,0.0,-208.0,-9956.0
25%,0.0,34.0,0.0
50%,4.0,82.0,196.0
75%,13.0,183.0,1850.0
max,5589.0,30557.0,170783100.0


In [322]:
df_hadoop_ccn_java_v2.to_csv('data/hadoop-branch-3.0.0-ccxchurn.csv', index=False)

# Dataframe V3

In [236]:
df_hadoop_ccn_java_v3.head(3)

Unnamed: 0,file,complexity,codechurn
0,hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocol/ErasureCodingPolicyInfo.java,14.0,0
1,hadoop-hdfs-project/hadoop-hdfs-client/src/test/java/org/apache/hadoop/hdfs/protocol/TestErasureCodingPolicyInfo.java,4.0,0
2,hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocol/HdfsLocatedFileStatus.java,,0


In [239]:
print('processando dataframes ccn e churn ...')
a = datetime.datetime.now()
for i in range(0, df_hadoop_ccn_java_v3.shape[0]):
    valor_item_df1 = df_hadoop_ccn_java_v3.iloc[i,0]
    for j in range(0, df_hadoop_churn_java_v3.shape[0]):
        valor_item_df2 = df_hadoop_churn_java_v3.iloc[j, 0]
        valor_codechurn_df2 = df_hadoop_churn_java_v3.iloc[j, 1]
        if (valor_item_df1 == valor_item_df2):
            print('{} {} {}'.format(i, valor_item_df1, valor_codechurn_df2))
            df_hadoop_ccn_java_v3.iloc[i,2] = valor_codechurn_df2
b = datetime.datetime.now()
delta = b-a
print('Tempo de processamento: {}'.format(delta))

emanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/preprocessor/package-info.java 28
2094 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/preprocessor/TestContextProcessor.java 63
2095 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/preprocessor/TestNodeLabelProcessor.java 45
2096 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/preprocessor/TestQueueProcessor.java 43
2097 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/preprocessor/TestTagAddProcessor.java 47
2098 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yar

In [240]:
df_hadoop_ccn_java_v3.head()

Unnamed: 0,file,complexity,codechurn
0,hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocol/ErasureCodingPolicyInfo.java,14.0,106
1,hadoop-hdfs-project/hadoop-hdfs-client/src/test/java/org/apache/hadoop/hdfs/protocol/TestErasureCodingPolicyInfo.java,4.0,72
2,hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocol/HdfsLocatedFileStatus.java,,220
3,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/hadoop/security/KerberosDiags.java,62.0,0
4,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/Slider.java,1.0,52


In [241]:
df_hadoop_ccn_java_v3.replace(to_replace=['None'], value=0, inplace=True)

In [242]:
df_hadoop_ccn_java_v3['complexity'] = pd.to_numeric(df_hadoop_ccn_java_v3['complexity'])

In [243]:
df_hadoop_ccn_java_v3['ccXchurn'] = 0

In [244]:
df_hadoop_ccn_java_v3['ccXchurn'] = df_hadoop_ccn_java_v3['complexity'] * df_hadoop_ccn_java_v3['codechurn']

In [245]:
df_hadoop_ccn_java_v3.sort_index(by=['ccXchurn'], ascending=False, inplace=True)
df_hadoop_ccn_java_v3.head(10)



Unnamed: 0,file,complexity,codechurn,ccXchurn
17,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/api/proto/Messages.java,5855,34473,201839415
31,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/client/SliderClient.java,553,2783,1538999
197,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/providers/agent/AgentProviderService.java,470,2850,1339500
301,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java,247,2120,523640
19,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/api/proto/SliderClusterAPI.java,204,2293,467772
243,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java,185,2138,395530
1982,hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterClientProtocol.java,208,1816,377728
598,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-registry/src/main/java/org/apache/hadoop/registry/server/dns/RegistryDNS.java,172,1800,309600
757,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/test/java/org/apache/slider/utils/SliderTestUtils.java,135,1065,143775
316,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java,125,1123,140375


In [323]:
df_hadoop_ccn_java_v3.to_csv('data/hadoop-branch-3.1-ccxchurn.csv', index=False)

# Dataframe V4

In [246]:
df_hadoop_ccn_java_v4.head(3)

Unnamed: 0,file,complexity,codechurn
0,hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipelines/PipelineActionEventHandler.java,4,0
1,hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipelines/PipelineCloseHandler.java,2,0
2,hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/pipeline/TestNodeFailure.java,4,0


In [249]:
print('processando dataframes ccn e churn ...')
a = datetime.datetime.now()
for i in range(0, df_hadoop_ccn_java_v4.shape[0]):
    valor_item_df1 = df_hadoop_ccn_java_v4.iloc[i,0]
    for j in range(0, df_hadoop_churn_java_v4.shape[0]):
        valor_item_df2 = df_hadoop_churn_java_v4.iloc[j, 0]
        valor_codechurn_df2 = df_hadoop_churn_java_v4.iloc[j, 1]
        if (valor_item_df1 == valor_item_df2):
            print('{} {} {}'.format(i, valor_item_df1, valor_codechurn_df2))
            df_hadoop_ccn_java_v4.iloc[i,2] = valor_codechurn_df2
b = datetime.datetime.now()
delta = b-a
print('Tempo de processamento: {}'.format(delta))

tusDefault.java 57
1378 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/ProxyCombiner.java 151
1379 hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/BalancerProtocols.java 30
1380 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/WorkflowPriorityMappingsManager.java 230
1381 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerWorkflowPriorityMapping.java 151
1382 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/ConfigVersionInfo.java 44
1383 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager

In [324]:
df_hadoop_ccn_java_v4.head()

Unnamed: 0,file,complexity,codechurn,ccXchurn
12,hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/freon/RandomKeyGenerator.java,110,1039,114290
122,hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/AzureBlobFileSystem.java,81,1221,98901
1405,hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/v2/TestSpeculativeExecOnCluster.java,89,935,83215
289,hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/ITestAzureBlobFilesystemAcl.java,53,1410,74730
46,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/nodelabels/NodeAttributesManagerImpl.java,70,752,52640


In [251]:
df_hadoop_ccn_java_v4.replace(to_replace=['None'], value=0, inplace=True)

In [252]:
df_hadoop_ccn_java_v4['complexity'] = pd.to_numeric(df_hadoop_ccn_java_v4['complexity'])

In [253]:
df_hadoop_ccn_java_v4['ccXchurn'] = 0

In [254]:
df_hadoop_ccn_java_v4['ccXchurn'] = df_hadoop_ccn_java_v4['complexity'] * df_hadoop_ccn_java_v4['codechurn']

In [255]:
df_hadoop_ccn_java_v4.sort_index(by=['ccXchurn'], ascending=False, inplace=True)
df_hadoop_ccn_java_v4.head(10)



Unnamed: 0,file,complexity,codechurn,ccXchurn
12,hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/freon/RandomKeyGenerator.java,110,1039,114290
122,hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/AzureBlobFileSystem.java,81,1221,98901
1405,hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/v2/TestSpeculativeExecOnCluster.java,89,935,83215
289,hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/ITestAzureBlobFilesystemAcl.java,53,1410,74730
46,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/nodelabels/NodeAttributesManagerImpl.java,70,752,52640
31,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/cli/NodeAttributesCLI.java,54,718,38772
1433,hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/webapp/TestHsWebServicesLogs.java,49,780,38220
1469,hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestDirectoryMarkerListing.java,49,774,37926
183,hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/services/SharedKeyCredentials.java,64,510,32640
1250,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/timeline/TimelineEntityV2Converter.java,71,449,31879


In [325]:
df_hadoop_ccn_java_v4.to_csv('data/hadoop-branch-3.2-ccxchurn.csv', index=False)

# Limpar diretórios

In [303]:
df_hadoop_ccn_java_v1.head(3)

Unnamed: 0,file,complexity,codechurn,ccXchurn
5853,mapreduce/src/java/org/apache/hadoop/mapred/TaskTracker.java,570,4225,2408250
7783,mapreduce/src/java/org/apache/hadoop/mapred/JobInProgress.java.orig,571,3729,2129259
7781,mapreduce/src/java/org/apache/hadoop/mapred/JobInProgress.java.orig,261,3729,973269


In [312]:
df_hadoop_ccn_java_v1.tail(10)

Unnamed: 0,file,complexity,codechurn,ccXchurn
7220,common/src/java/org/apache/hadoop/fs/TrashPolicyDefault.java,49,0,0
7215,common/src/java/org/apache/hadoop/fs/TrashPolicyDefault.java,26,0,0
7216,common/src/java/org/apache/hadoop/fs/TrashPolicyDefault.java,0,0,0
14966,hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/main/java/org/apache/hadoop/mapred/NotRunningJob.java,0,241,0
14965,hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/main/java/org/apache/hadoop/mapred/NotRunningJob.java,0,241,0
7217,common/src/java/org/apache/hadoop/fs/TrashPolicyDefault.java,209,0,0
14963,hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/main/java/org/apache/hadoop/mapred/NotRunningJob.java,0,241,0
7218,common/src/java/org/apache/hadoop/fs/TrashPolicyDefault.java,2,0,0
7219,common/src/java/org/apache/hadoop/fs/TrashPolicyDefault.java,0,0,0
0,src/java/org/apache/hadoop/HadoopVersionAnnotation.java,0,0,0


In [304]:
list_files_v1 = list(df_hadoop_ccn_java_v1.file.unique())
df_hadoop_ccn_java_v1_unique_files = pd.DataFrame(list_files_v1, columns=['file'])
df_hadoop_ccn_java_v1_unique_files['complexity'] = 0 
df_hadoop_ccn_java_v1_unique_files['codechurn'] = 0
df_hadoop_ccn_java_v1_unique_files['ccXchurn'] = 0
df_hadoop_ccn_java_v1_unique_files.head(3)

Unnamed: 0,file,complexity,codechurn,ccXchurn
0,mapreduce/src/java/org/apache/hadoop/mapred/TaskTracker.java,0,0,0
1,mapreduce/src/java/org/apache/hadoop/mapred/JobInProgress.java.orig,0,0,0
2,hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/protobuf/HadoopRpcProtos.java,0,0,0


In [306]:
# A performance desse algoritmo esta ruim (na ordem de 3 horas para concluir a varredura de 5000 x 10000) ...
def tenta_remover_linhas_repetidas_v1():
    print('atualizado df_hadoop_ccn_java_v1_unique_files ccn e churn ...')
    a = datetime.datetime.now()
    for i in range(0, df_hadoop_ccn_java_v1_unique_files.shape[0]):
        valor_item_df1 = df_hadoop_ccn_java_v1_unique_files.iloc[i,0]
        lista_temp = []
        for j in range(0, df_hadoop_ccn_java_v1.shape[0]):
            valor_item_df2 = df_hadoop_ccn_java_v1.iloc[j, 0]
            valor_complexity_df2 = df_hadoop_ccn_java_v1.iloc[j, 1]
            valor_codechurn_df2 = df_hadoop_ccn_java_v1.iloc[j, 2]        
            valor_ccXchurn_df2 = df_hadoop_ccn_java_v1.iloc[j, 3]
            if (valor_item_df1 == valor_item_df2) and (valor_item_df1 not in lista_temp):
                lista_temp.append(valor_item_df1)
                print('i: {}, j: {}, {} {} {} {}'.format(i, j, valor_item_df1, valor_complexity_df2, valor_codechurn_df2, valor_ccXchurn_df2))
                df_hadoop_ccn_java_v1_unique_files.iloc[i,1] = valor_complexity_df2
                df_hadoop_ccn_java_v1_unique_files.iloc[i,2] = valor_codechurn_df2
                df_hadoop_ccn_java_v1_unique_files.iloc[i,3] = valor_ccXchurn_df2
            else:
                continue
    b = datetime.datetime.now()
    delta = b-a
    print('Tempo de processamento: {}'.format(delta))

In [326]:
# E preciso testar antes de usar em producao
# A performance ainda esta ruim.... pois usa um for forca bruta...
def atualizalinhas_df_cloc(df_principal, df_secundario):
    print('atualiza o valor do df_secundario com os valores de complexidade e churn do df_principal...')
    a = datetime.datetime.now()
    for i in range(0, df_principal.shape[0]):
        valor_arquivo_df1 = df_principal.iloc[i,0]
        valor_arquivo_df1 = str(valor_arquivo_df1)
        for j in range(0, df_secundario.shape[0]):
            valor_arquivo_df2 = df_secundario.iloc[j, 0]
            valor_arquivo_df2 = str(valor_arquivo_df2)
            if (valor_arquivo_df1 in valor_arquivo_df2):
                valor_complexity_df2 = df_secundario.iloc[j, 1]
                valor_codechurn_df2 = df_secundario.iloc[j, 2]  
                valor_ccXchurn_df2 = df_secundario.iloc[j, 3]      
                print('i: {}, j: {}, {} {} {} {}'.format(i, j, valor_arquivo_df1, valor_complexity_df2, valor_codechurn_df2, valor_ccXchurn_df2))
                print('---')
    b = datetime.datetime.now()
    delta = b-a
    print('Tempo de processamento: {}'.format(delta))

# Dataframes com complexidade x code churn atualizados

In [328]:
df_hadoop_ccn_java_v1.head(10)

Unnamed: 0,file,complexity,codechurn,ccXchurn
5853,mapreduce/src/java/org/apache/hadoop/mapred/TaskTracker.java,570,4225,2408250
7783,mapreduce/src/java/org/apache/hadoop/mapred/JobInProgress.java.orig,571,3729,2129259
7781,mapreduce/src/java/org/apache/hadoop/mapred/JobInProgress.java.orig,261,3729,973269
7780,mapreduce/src/java/org/apache/hadoop/mapred/JobInProgress.java.orig,207,3729,771903
19303,hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/protobuf/HadoopRpcProtos.java,349,1963,685087
19424,hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/protobuf/HadoopRpcProtos.java,327,1963,641901
15602,hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java,468,1086,508248
19430,hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/protobuf/HadoopRpcProtos.java,252,1963,494676
19305,hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/protobuf/TestProtos.java,292,1525,445300
16310,hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TestTaskImpl.java,573,764,437772


In [329]:
df_hadoop_ccn_java_v2.head(10)

Unnamed: 0,file,complexity,codechurn,ccXchurn
5695,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/hamlet2/Hamlet.java,5589,30557,170783073
2100,hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azure/AzureNativeFileSystemStore.java,244,2941,717604
2105,hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azure/NativeAzureFileSystem.java,212,3285,696420
997,hadoop-hdfs-project/hadoop-hdfs-nfs/src/main/java/org/apache/hadoop/hdfs/nfs/nfs3/RpcProgramNfs3.java,251,2320,582320
2619,hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java,155,2941,455855
4132,hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/OfflineImageReconstructor.java,245,1848,452760
3377,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-applicationhistoryservice/src/main/java/org/apache/hadoop/yarn/server/timeline/RollingLevelDBTimelineStore.java,227,1828,414956
1247,hadoop-tools/hadoop-openstack/src/main/java/org/apache/hadoop/fs/swift/http/SwiftRestClient.java,167,1878,313626
4067,hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/FoldedTreeSet.java,244,1285,313540
5188,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase-tests/src/test/java/org/apache/hadoop/yarn/server/timelineservice/storage/TestHBaseTimelineStorageApps.java,138,1941,267858


In [330]:
df_hadoop_ccn_java_v3.head(10)

Unnamed: 0,file,complexity,codechurn,ccXchurn
17,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/api/proto/Messages.java,5855,34473,201839415
31,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/client/SliderClient.java,553,2783,1538999
197,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/providers/agent/AgentProviderService.java,470,2850,1339500
301,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java,247,2120,523640
19,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/api/proto/SliderClusterAPI.java,204,2293,467772
243,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java,185,2138,395530
1982,hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterClientProtocol.java,208,1816,377728
598,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-registry/src/main/java/org/apache/hadoop/registry/server/dns/RegistryDNS.java,172,1800,309600
757,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/test/java/org/apache/slider/utils/SliderTestUtils.java,135,1065,143775
316,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-slider/hadoop-yarn-slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java,125,1123,140375


In [331]:
df_hadoop_ccn_java_v4.head(10)

Unnamed: 0,file,complexity,codechurn,ccXchurn
12,hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/freon/RandomKeyGenerator.java,110,1039,114290
122,hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/AzureBlobFileSystem.java,81,1221,98901
1405,hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/v2/TestSpeculativeExecOnCluster.java,89,935,83215
289,hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/ITestAzureBlobFilesystemAcl.java,53,1410,74730
46,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/nodelabels/NodeAttributesManagerImpl.java,70,752,52640
31,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/cli/NodeAttributesCLI.java,54,718,38772
1433,hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/webapp/TestHsWebServicesLogs.java,49,780,38220
1469,hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestDirectoryMarkerListing.java,49,774,37926
183,hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/services/SharedKeyCredentials.java,64,510,32640
1250,hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/timeline/TimelineEntityV2Converter.java,71,449,31879
