In [1]:
import os
import shutil
import hashlib
import re

## De-duplicate
#### Find Duplicate Assignments

In [2]:
RAW_PATH = "data/raw"

def generate_hashes(start_path):
    hashes = {}

    for root, _, files in os.walk(start_path):
        for file in files:
            if not file.endswith((".jar", ".zip")):
                continue

            with open(os.path.join(root, file), "rb") as f:
                digest = hashlib.file_digest(f, "sha256")

            dhash = digest.hexdigest()

            if dhash in hashes.keys():
                hashes[dhash].append(os.path.join(root, file))
            else:
                hashes[dhash] = [os.path.join(root, file)]

    return hashes



In [3]:
results = generate_hashes(RAW_PATH)

#### Stats after grouping duplicates

In [4]:
def stats(hashes):
    print("Number of submission: ", sum([len(p) for p in hashes.values()]))
    print("Number of unique solutions:", len(hashes))
    print("Number of duplicates: ", sum([len(p) for p in hashes.values()]) - len(hashes))
    print("Max number of duplicates on a specific solution", max([len(p) for p in hashes.values()]))

In [5]:
stats(results)

Number of submission:  1390
Number of unique solutions: 927
Number of duplicates:  463
Max number of duplicates on a specific solution 3


#### Copy jars to processed, de-duplicate, and rename

In [6]:
PROCESSED_PATH = "data/proc"
DATE_REGEX = r'[0-9]+~[0-9]+'

if not os.path.exists(PROCESSED_PATH):
    os.mkdir(PROCESSED_PATH)

for counter, paths in enumerate(results.values()):

    regex_lst = re.findall(DATE_REGEX, paths[0])

    if len(regex_lst) == 1:
        date = regex_lst[0]
    else:
        date = "21~22"

    if not os.path.exists(os.path.join(PROCESSED_PATH, date)):
        os.mkdir(os.path.join(PROCESSED_PATH, date))

    name = date + "_Submission_" + str(counter)

    proc_path = os.path.join(PROCESSED_PATH, date, name)

    if not os.path.exists(proc_path):
        os.mkdir(proc_path)

        file_ext = paths[0].split(".")

        if len(file_ext) == 1:
            ext = ".jar"
        else:
            ext = "." + file_ext[1]

        new_path = name + ext

        shutil.copyfile(paths[0], os.path.join(proc_path, new_path))

## File Processing
### Extract files for jar

In [7]:
for root, _, files in os.walk(PROCESSED_PATH):
    for file in files:
        if file.endswith(".jar") or file.endswith("*.zip"):
            path = os.path.join(root, file).replace(" ", '\ ')
            os.system('unzip ' + path + " -d ./" + root.replace(" ", "\ "))

            os.remove(path)

Archive:  data/proc/20~21/20~21_Submission_855/20~21_Submission_855.jar
  inflating: ./data/proc/20~21/20~21_Submission_855/META-INF/MANIFEST.MF  
  inflating: ./data/proc/20~21/20~21_Submission_855/Animal.class  
  inflating: ./data/proc/20~21/20~21_Submission_855/Animal.ctxt  
  inflating: ./data/proc/20~21/20~21_Submission_855/Animal.java  
  inflating: ./data/proc/20~21/20~21_Submission_855/Counter.class  
  inflating: ./data/proc/20~21/20~21_Submission_855/Counter.ctxt  
  inflating: ./data/proc/20~21/20~21_Submission_855/Counter.java  
  inflating: ./data/proc/20~21/20~21_Submission_855/Deer.class  
  inflating: ./data/proc/20~21/20~21_Submission_855/Deer.ctxt  
  inflating: ./data/proc/20~21/20~21_Submission_855/Deer.java  
  inflating: ./data/proc/20~21/20~21_Submission_855/Disease.class  
  inflating: ./data/proc/20~21/20~21_Submission_855/Disease.ctxt  
  inflating: ./data/proc/20~21/20~21_Submission_855/Disease.java  
  inflating: ./data/proc/20~21/20~21_Submission_855/Field

### Remove unnecessary files

In [8]:
for root, subdirs, files in os.walk(PROCESSED_PATH):

    for subdir in subdirs:
        if subdir.startswith((".", "_")) or subdir == "doc" or subdir == 'META-INF' or subdir == 'out':
            shutil.rmtree(os.path.join(root, subdir))

    for file in files:
        if file.lower().endswith((".class", ".ctxt", "~", ".", ".pdf")):
            os.remove(os.path.abspath(os.path.join("./", root, file)))

## Anonymisation
### Remove @author lines from all files

In [9]:
AUTHOR_TAG_REGEX = '\s*\*\s*@author.+'

for root, _, files in os.walk(PROCESSED_PATH):
    for file in files:
        if file.endswith(".java"):
            with open(os.path.join(root, file), "r", encoding="ISO-8859-1") as f:
                source_code = f.read()

            source_code = re.sub(AUTHOR_TAG_REGEX, "", source_code)

            with open(os.path.join(root, file), "w", encoding='utf-8') as f:
                f.write(source_code)


## De-Duplication Round 2
### Remove comments and minify

In [18]:
DUPLICATION_PATH = 'data/dup'
WHITESPACE_COMMENT_REGEX = '([\/|\*]\s*.+)|\n|\s'

if not os.path.exists(DUPLICATION_PATH):
    shutil.copytree(PROCESSED_PATH, DUPLICATION_PATH)


for root, _, files in os.walk(DUPLICATION_PATH):
    for file in files:
        if file.endswith(".java"):
            with open(os.path.join(root, file), "r") as f:
                source_code = f.read()

            source_code = re.sub(WHITESPACE_COMMENT_REGEX, "", source_code)

            with open(os.path.join(root, file), "w") as f:
                f.write(source_code)

### Zip directories

In [20]:
for item_base in os.listdir(DUPLICATION_PATH):
    path = os.path.join(DUPLICATION_PATH, item_base)
    if os.path.isdir(path):
        for subdir in os.listdir(path):
            sub_path = os.path.join(path, subdir)
            if os.path.isdir(sub_path):
                os.system('zip -r ' + sub_path + ".zip " + sub_path)

  adding: data/dup/20~21/20~21_Submission_855/ (stored 0%)
  adding: data/dup/20~21/20~21_Submission_855/Rabbit.java (deflated 59%)
  adding: data/dup/20~21/20~21_Submission_855/Mouse.java (deflated 63%)
  adding: data/dup/20~21/20~21_Submission_855/Disease.java (deflated 27%)
  adding: data/dup/20~21/20~21_Submission_855/Randomizer.java (deflated 48%)
  adding: data/dup/20~21/20~21_Submission_855/Weather.java (deflated 60%)
  adding: data/dup/20~21/20~21_Submission_855/Time.java (deflated 38%)
  adding: data/dup/20~21/20~21_Submission_855/Plant.java (deflated 58%)
  adding: data/dup/20~21/20~21_Submission_855/Counter.java (deflated 44%)
  adding: data/dup/20~21/20~21_Submission_855/Deer.java (deflated 64%)
  adding: data/dup/20~21/20~21_Submission_855/SimulatorView.java (deflated 63%)
  adding: data/dup/20~21/20~21_Submission_855/Simulator.java (deflated 68%)
  adding: data/dup/20~21/20~21_Submission_855/FieldStats.java (deflated 62%)
  adding: data/dup/20~21/20~21_Submission_855/Loca

### Rerun de-duplication with new hashes

In [21]:
results = generate_hashes(DUPLICATION_PATH)
stats(results)

Number of submission:  984
Number of unique solutions: 981
Number of duplicates:  3
Max number of duplicates on a specific solution 2


In [22]:
results

{'7f6161d8641e75e2ae880ffc12c2d6528b4d294921f221515e2bec3831fa5115': ['data/dup/20~21/20~21_Submission_921.zip'],
 '0f2555c84d5554950c378cc5db870f1a0f2d3484a305529bbbbc6d37be3c4080': ['data/dup/20~21/20~21_Submission_909.zip'],
 'cdf3a95b4320faa908c0a18cab88c5a4691de344ba30e6be0b91ba5e7f75ab9c': ['data/dup/20~21/20~21_Submission_712.zip'],
 'b212ebe67d2298ce295e00605a7b3b914066bf3d4ce476423ed556bfd33e3c5b': ['data/dup/20~21/20~21_Submission_706.zip'],
 '73f5bc32b1c89d78163b2b956ea20050f6c24fdd4178ece42214bdd33088460d': ['data/dup/20~21/20~21_Submission_841.zip'],
 '03b3b3c0971d66f25deb6767edb970efd5e9ec9b385a14066ada3f69c6ca54e7': ['data/dup/20~21/20~21_Submission_699.zip'],
 '2e9851417d2e1edf10cdeccff3983c12cd968566547cd194ee216ecc5152586b': ['data/dup/20~21/20~21_Submission_855.zip'],
 'd9da5585c15d500a460f1aea130c0ac8f19d7e45b66f20e4a0b2a285ff98d180': ['data/dup/20~21/20~21_Submission_869.zip'],
 '295c29265b14591ce2d70c9fb63f2614b9155ea9ab2a28b04741281785e6cc91': ['data/dup/20~21/20

### Remove duplication directory

In [23]:
shutil.rmtree(DUPLICATION_PATH)