In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
from pathlib import Path
from typing import *

print('Python %s on %s' % (sys.version, sys.platform))

cwd = os.getcwd()

# The notebook is expected to run from a directory that sits directly below the
# project root (the same assumption test_dir relies on), so the root is derived
# from the working directory instead of being hard-coded as an absolute,
# machine-specific path.
project_root = Path(cwd).parent

# Make the `duplication` package importable. The membership check keeps the
# cell idempotent: re-running it does not append the same entry again.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

test_dir = project_root/"duplication"/"test_data"

Python 3.8.5 (default, Jan 27 2021, 15:41:15) 
[GCC 9.3.0] on linux


In [2]:
import os
import functools
from enum import Enum
from dataclasses import dataclass

# Base algorithm

In [4]:
from duplication.run import *
from duplication.similarity_metrics import jaccard
from duplication.detectors import NaiveDetector

In [None]:
@dataclass
class DetectionResult:
    """
    Class for storing results of detection.

    Attributes:
        clones: one ``(entity, candidate, similarity)`` triple per detected
            clone pair.
    """
    # EntityData is declared further down in this cell, so it is referenced by
    # name (forward reference); an unquoted reference is evaluated while this
    # class is being created and fails with NameError on a fresh kernel.
    clones: List[Tuple["EntityData", "EntityData", float]]


class EntityTypes(Enum):
    """Kinds of code entities that can be stored in an EntityData."""
    Function = 0  # entity extracted from a function
    Class = 1  # entity extracted from a class


@dataclass
class EntityData:
    """
    A single code entity prepared for clone comparison.
    """
    # Kind of the entity (function or class).
    entity_type: EntityTypes
    # Path of the file the entity was extracted from.
    path: str
    # Language of the entity; detectors only compare entities of the same language.
    lang: str
    # Identifier tokens of the entity, used as the input of the similarity metric.
    bag_of_tokens: List[str]



# A source file to analyse, described as a (path, language) pair.
File = Tuple[str, str]


class Detector:
    """
    Base class for clone detectors.

    Collects the files of a directory, extracts their objects and converts
    every object into an ``EntityData``. Subclasses implement ``detect``.

    Project types are quoted in the signatures so that the class can be
    defined regardless of which cell or star import provides those names.
    """

    @staticmethod
    def extract_entity_from_object_data(obj: "ObjectData", path, lang) -> "EntityData":
        """
        Convert one extracted object into an entity.

        :param obj: object whose ``object_type`` and ``content`` are read.
        :param path: path of the file the object belongs to.
        :param lang: language passed on to the identifier extraction.
        :return: entity holding the type, path, language and identifier tokens.
        :raises ValueError: if the object is neither a function nor a class.
        """
        # EntityData.entity_type is declared as EntityTypes, so the enum member
        # is stored rather than a free-form string.
        if obj.object_type == ObjectTypes.FUNCTION:
            entity_type = EntityTypes.Function
        elif obj.object_type == ObjectTypes.CLASS:
            entity_type = EntityTypes.Class
        else:
            # NOTE(review): assumes functions and classes are the only object
            # types that should become entities - confirm against ObjectTypes.
            raise ValueError("Unsupported object type: %r" % (obj.object_type,))

        entity_tokens = get_identifiers_sequence_from_code(obj.content, lang)
        return EntityData(entity_type, path, lang, entity_tokens)

    @staticmethod
    def extract_entity_from_file_data(file: "FileData") -> List["EntityData"]:
        """
        Convert every object of one file into an entity.

        :param file: file data whose ``path``, ``lang`` and ``objects`` are read.
        :return: one entity per object, in the order of ``file.objects``.
        """
        # Read from the parameter itself; relying on a global with a similar
        # name would make the result depend on unrelated notebook state.
        entity_path = file.path
        entity_lang = file.lang
        return [
            Detector.extract_entity_from_object_data(obj, entity_path, entity_lang)
            for obj in file.objects
        ]

    @staticmethod
    def extract_entities_from_files_data(files: List["FileData"]) -> List["EntityData"]:
        """
        Collect the entities of several files into one flat list.

        :param files: file data items to process.
        :return: entities of all files, in file order; empty for no files.
        """
        return [
            entity
            for file_data in files
            for entity in Detector.extract_entity_from_file_data(file_data)
        ]

    @staticmethod
    def extract_data_from_files(files: List["File"]) -> List["FileData"]:
        """
        Run ``get_data_from_file`` on every ``(path, language)`` pair.

        :param files: pairs of file path and language.
        :return: one result per input pair, in input order.
        """
        # NOTE(review): the meaning of the two positional booleans is defined by
        # get_data_from_file, which is not visible here - confirm before changing.
        return [get_data_from_file(file, lang, True, False) for file, lang in files]

    def fill_fields(self, directory: str, granularity: str):
        """
        Populate ``self.files``, ``self.files_data`` and ``self.entities``.

        :param directory: directory handed to ``recognize_languages_dir`` and
            used to build full file paths.
        :param granularity: value forwarded to ``transform_files_list``.
        """
        lang2files = recognize_languages_dir(directory)
        files = transform_files_list(lang2files, granularity, None)
        self.files = [(get_full_path(file, directory), lang) for file, lang in files]
        self.files_data = self.extract_data_from_files(self.files)
        self.entities = self.extract_entities_from_files_data(self.files_data)

    def detect(self, directory: str, granularity: str) -> "DetectionResult":
        """
        Detect clones in ``directory``; must be implemented by subclasses.

        :raises NotImplementedError: always, in this base class.
        """
        # NotImplemented is a comparison sentinel, not an exception; raising it
        # produces a TypeError instead of the intended error.
        raise NotImplementedError
        
        
class NaiveDetector(Detector):
    """
    Detector that compares every pair of entities with the jaccard metric.
    """

    def detect(self, directory: str, threshold: float, granularity: str) -> DetectionResult:
        """
        Find clone pairs among the entities of ``directory``.

        :param directory: directory whose files are analysed.
        :param threshold: a pair is reported only if its similarity is
            strictly greater than this value.
        :param granularity: value forwarded to ``fill_fields``.
        :return: result holding ``(entity, candidate, similarity)`` triples.
        """
        self.fill_fields(directory, granularity)

        found = []
        for position, entity in enumerate(self.entities):
            # Only look at entities that come later, so each unordered pair is
            # visited exactly once.
            for candidate in self.entities[position + 1:]:
                identical = entity == candidate
                if identical or entity.lang != candidate.lang:
                    continue

                score = jaccard(entity.bag_of_tokens, candidate.bag_of_tokens)
                if score > threshold:
                    found.append((entity, candidate, score))

        return DetectionResult(found)
                        

In [5]:
# Run the naive detector on the parent directory of the test data and keep the
# pairs whose similarity is above 0.65.
# NOTE(review): NaiveDetector is both imported from duplication.detectors and
# defined in a notebook cell; whichever was executed last is the one used here.
naive_detector = NaiveDetector()
clones = naive_detector.detect(test_dir.parent, 0.65, "functions").clones

In [6]:
# Display the detected (entity, candidate, similarity) triples.
clones

[(EntityData(entity_type=<EntityTypes.Class: 1>, path='/mnt/c/Users/Yuriy Rogachev/PycharmProjects/code duplication detection/duplication/test_data/test_file.java', lang='Java', bag_of_tokens=['HelloWorld', 'main', 'String', 'args', 'System', 'out', 'println', 'SomeFunction', 'a', 'System', 'out', 'println', 'a']),
  EntityData(entity_type=<EntityTypes.Function: 0>, path='/mnt/c/Users/Yuriy Rogachev/PycharmProjects/code duplication detection/duplication/test_data/test_file.java', lang='Java', bag_of_tokens=['main', 'String', 'args', 'System', 'out', 'println']),
  0.6923076923076923),
 (EntityData(entity_type=<EntityTypes.Class: 1>, path='/mnt/c/Users/Yuriy Rogachev/PycharmProjects/code duplication detection/duplication/test_data/test_file.java', lang='Java', bag_of_tokens=['HelloWorld', 'main', 'String', 'args', 'System', 'out', 'println', 'SomeFunction', 'a', 'System', 'out', 'println', 'a']),
  EntityData(entity_type=<EntityTypes.Function: 0>, path='/mnt/c/Users/Yuriy Rogachev/Pycha

In [None]:
# Call get_data_from_file on the Java test file and keep the result for inspection.
# NOTE(review): the meaning of the two trailing booleans is defined by
# get_data_from_file, which is not visible in this notebook - confirm.
file_data = get_data_from_file(test_dir/"test_file.java", "Java", True, True)

In [None]:
# Exploratory: list the attribute names available on file_data.
dir(file_data)

In [None]:
# Display the extracted file data.
file_data

In [None]:
# Keep only the objects of the file whose type is ObjectTypes.FUNCTION.
[obj for obj in file_data.objects if obj.object_type == ObjectTypes.FUNCTION]