In [85]:
import hashlib
import os
import re
import shutil
import subprocess

## De-duplicate
#### Find Duplicate Assignments

In [86]:
RAW_PATH = "data/raw"

# Maps a SHA-256 hex digest to the list of file paths whose contents produce it.
hashes = {}

for dirpath, _, filenames in os.walk(RAW_PATH):
    for filename in filenames:
        # Hidden files (names starting with a dot) are not hashed.
        if filename.startswith("."):
            continue

        full_path = os.path.join(dirpath, filename)

        with open(full_path, "rb") as handle:
            dhash = hashlib.file_digest(handle, "sha256").hexdigest()

        # Files with identical contents end up in the same list.
        hashes.setdefault(dhash, []).append(full_path)



#### Stats after grouping duplicates

In [93]:
# Number of files sharing each distinct content hash.
group_sizes = [len(paths) for paths in hashes.values()]
total_submissions = sum(group_sizes)

print("Number of submission: ", total_submissions)
print("Number of unique solutions:", len(hashes))
print("Number of duplicates: ", total_submissions - len(hashes))
# max() raises ValueError on an empty sequence, so report 0 when nothing was hashed.
print("Max number of duplicates on a specific solution", max(group_sizes) if group_sizes else 0)

Number of submission:  1404
Number of unique solutions: 939
Number of duplicates:  465
Max number of duplicates on a specific solution 3


#### Copy jars to processed, de-duplicate, and rename

In [105]:
PROCESSED_PATH = "data/proc"
DATE_REGEX = r'[0-9]+~[0-9]+'

# Create the output root together with any missing parent directories;
# an already existing directory is not an error.
os.makedirs(PROCESSED_PATH, exist_ok=True)

for counter, paths in enumerate(hashes.values()):
    # Only the first path of each duplicate group is copied.
    source_path = paths[0]

    # Use the date token when the path contains exactly one match of DATE_REGEX;
    # with zero or several matches fall back to "21~22".
    regex_lst = re.findall(DATE_REGEX, source_path)
    date = regex_lst[0] if len(regex_lst) == 1 else "21~22"

    name = date + "_Submission_" + str(counter)
    proc_path = os.path.join(PROCESSED_PATH, date, name)

    # A submission directory that already exists is left untouched.
    if os.path.exists(proc_path):
        continue

    # Creates the date directory as well when it is missing.
    os.makedirs(proc_path)

    # splitext only looks at the last dot of the final path component, so dots
    # elsewhere in the file name or in directory names do not corrupt the extension.
    ext = os.path.splitext(source_path)[1]
    if not ext:
        # Files without an extension are stored as jars.
        ext = ".jar"

    shutil.copyfile(source_path, os.path.join(proc_path, name + ext))

## File Processing
### Extract files from jar

In [109]:

# Maximum number of jars to extract per run. 1 stops after the first jar found;
# set to None to extract every jar under PROCESSED_PATH.
EXTRACT_LIMIT = 1

# Collect the jars before extracting so the tree is not modified while it is walked.
jar_locations = [
    (root, os.path.join(root, file))
    for root, _, files in os.walk(PROCESSED_PATH)
    for file in files
    if file.endswith(".jar")
]

for root, path in jar_locations[:EXTRACT_LIMIT]:
    # Arguments are passed as a list (no shell involved), so paths containing
    # spaces or shell metacharacters need no escaping.
    result = subprocess.run(["unzip", path, "-d", "./" + root])

    if result.returncode != 0:
        # Keep the archive so a failed extraction does not lose the processed copy.
        print("unzip exited with status", result.returncode, "for", path, "- archive kept")
        continue

    # The contents now live next to the archive; the jar itself is no longer needed.
    os.remove(path)

Archive:  data/proc/20~21/20~21_Submission_855/20~21_Submission_855.jar
  inflating: ./data/proc/20~21/20~21_Submission_855/META-INF/MANIFEST.MF  
  inflating: ./data/proc/20~21/20~21_Submission_855/.DS_Store  
  inflating: ./data/proc/20~21/20~21_Submission_855/Actor.class  
  inflating: ./data/proc/20~21/20~21_Submission_855/Actor.ctxt  
  inflating: ./data/proc/20~21/20~21_Submission_855/Actor.java  
  inflating: ./data/proc/20~21/20~21_Submission_855/Animal.class  
  inflating: ./data/proc/20~21/20~21_Submission_855/Animal.ctxt  
  inflating: ./data/proc/20~21/20~21_Submission_855/Animal.java  
  inflating: ./data/proc/20~21/20~21_Submission_855/Cheetah.class  
  inflating: ./data/proc/20~21/20~21_Submission_855/Cheetah.ctxt  
  inflating: ./data/proc/20~21/20~21_Submission_855/Cheetah.java  
  inflating: ./data/proc/20~21/20~21_Submission_855/Counter.class  
  inflating: ./data/proc/20~21/20~21_Submission_855/Counter.ctxt  
  inflating: ./data/proc/20~21/20~21_Submission_855/Count

### Remove unnecessary files

In [111]:
for root, subdirs, files in os.walk(PROCESSED_PATH):

    # Delete hidden directories and directories named "doc". The survivors are
    # written back into `subdirs` in place so that os.walk does not try to
    # descend into directories that were just removed.
    kept_subdirs = []
    for subdir in subdirs:
        if subdir.startswith(".") or subdir == "doc":
            shutil.rmtree(os.path.join(root, subdir))
        else:
            kept_subdirs.append(subdir)
    subdirs[:] = kept_subdirs

    # Delete hidden files and files ending in ".class", ".ctxt" or "~".
    for file in files:
        if file.startswith(".") or file.endswith((".class", ".ctxt", "~")):
            os.remove(os.path.join(root, file))

## Anonymisation
### Remove @author lines from all files

In [133]:
# Matches a comment line of the form " * @author <text>", including the whitespace
# (and line break) in front of it. Raw string so the backslashes reach the regex
# engine without triggering invalid-escape warnings.
AUTHOR_TAG_REGEX = r'\s*\*\s*@author.+'

for root, _, files in os.walk(PROCESSED_PATH):
    for file in files:
        if not file.endswith(".java"):
            continue

        java_path = os.path.join(root, file)

        # surrogateescape lets bytes that are not valid UTF-8 survive the
        # read/write round trip unchanged instead of raising UnicodeDecodeError
        # and aborting the whole pass.
        with open(java_path, "r", encoding="utf-8", errors="surrogateescape") as f:
            source_code = f.read()

        source_code = re.sub(AUTHOR_TAG_REGEX, "", source_code)

        with open(java_path, "w", encoding="utf-8", errors="surrogateescape") as f:
            f.write(source_code)
