In [None]:
import hashlib
import os
import re
import shutil
import subprocess

# Assignment Processor - Anonymization and Deduplication
## Deduplication
### Find duplicates in reports

In [None]:
RAW_PATH = "data/reports"
NAME_REGEX = r'[0-9]+\s-\s[A-Za-z\s\-]+'

def generate_hashes(start_path):
    hashes = {}

    for root, _, files in os.walk(start_path):
        for file in files:
            if not file.endswith((".pdf", ".docx")):
                continue

            with open(os.path.join(root, file), "rb") as f:
                digest = hashlib.file_digest(f, "sha256")

            dhash = digest.hexdigest()

            # Potentially convert tuple to DAO
            path = os.path.join(root, file)
            year = root[len(root) - 5:].replace('_', '~')
            student_name = re.findall(NAME_REGEX, file)[0].split(' - ')[1].strip().replace('-', ' ')
            data = (path, year, student_name)

            if dhash in hashes.keys():
                hashes[dhash].append(data)
            else:
                hashes[dhash] = [data]

    return hashes



In [None]:
# Group every report under RAW_PATH by the SHA-256 digest of its contents.
# NOTE: RAW_PATH is reassigned to "data/raw" in a later cell, so re-running this
# cell after that one scans a different directory.
results = generate_hashes(RAW_PATH)

#### Generate summary stats on duplicates

In [None]:
def stats(hashes):
    """Print summary statistics about duplicated submissions.

    Args:
        hashes: Mapping of content digest -> list of entries sharing that
            digest (the structure returned by ``generate_hashes``).

    Prints the total number of entries, the number of distinct digests, the
    number of entries beyond the first per digest, and the size of the largest
    group. An empty mapping prints zeros instead of raising.
    """
    group_sizes = [len(group) for group in hashes.values()]
    total = sum(group_sizes)

    print("Number of submission: ", total)
    print("Number of unique solutions:", len(hashes))
    print("Number of duplicates: ", total - len(hashes))
    # default=0 keeps max() from raising ValueError when there are no groups.
    print("Max number of duplicates on a specific solution", max(group_sizes, default=0))

# Print the duplicate summary for the report hashes collected in `results`.
stats(results)

### Find duplicate paths in raw

In [None]:
RAW_PATH = "data/raw"
DATE_REGEX = r'[0-9]+~[0-9]+'

# year label -> {student name -> path of an archive found for that student}
path_map = {}

for root, _, files in os.walk(RAW_PATH):
    archives = [name for name in files if name.endswith((".jar", ".zip"))]
    if not archives:
        continue

    # Use the year label embedded in the directory path when it is unambiguous,
    # otherwise fall back to the default label.
    found_dates = re.findall(DATE_REGEX, root)
    date = found_dates[0] if len(found_dates) == 1 else "21~22"

    # The year entry is registered even when the directory is too shallow to
    # carry a student name.
    per_year = path_map.setdefault(date, {})

    segments = root.split('/')
    if len(segments) < 4:
        continue

    # The fourth path segment starts with the student name, up to the first "_".
    student_name = segments[3].split('_')[0].replace('-', ' ')

    # When a directory holds several archives the last one listed wins.
    for archive in archives:
        per_year[student_name] = os.path.join(root, archive)

### Remove duplicates, move to proc and rename

In [None]:
PROCESSED_PATH = "data/proc"

# (year, student_name) pairs that have a report but no archive in path_map.
missing_submissions = []

# makedirs also creates missing parent directories and tolerates re-runs.
os.makedirs(PROCESSED_PATH, exist_ok=True)

for counter, duplicate in enumerate(results.values()):
    # The first entry of each duplicate group stands in for the whole group.
    first_entry = duplicate[0]
    year = first_entry[1]
    student_name = first_entry[2]

    # A report year without any raw directory is recorded as missing
    # instead of raising KeyError.
    path = path_map.get(year, {}).get(student_name)
    if path is None:
        missing_submissions.append((year, student_name))
        continue

    # The counter replaces the student name in the output, e.g. "21~22_Submission_7".
    file_name = year + "_Submission_" + str(counter)
    proc_path = os.path.join(PROCESSED_PATH, year, file_name)

    # An existing target directory means this submission was already copied.
    if os.path.exists(proc_path):
        continue

    # Creates the per-year directory as well when it does not exist yet.
    os.makedirs(proc_path)

    # splitext only inspects the file name, so dots in directory names are
    # harmless; archives without an extension are treated as jars.
    ext = os.path.splitext(path)[1] or ".jar"

    shutil.copyfile(path, os.path.join(proc_path, file_name + ext))

# Not found in automated processing, due to differences in names, can be manually extracted
len(missing_submissions)

## File Processing

In [None]:
def extract_files():
    """Unpack every ``.jar``/``.zip`` archive under ``PROCESSED_PATH`` in place.

    Each archive is extracted into the directory that contains it using the
    external ``unzip`` tool and is deleted afterwards. An archive is only
    deleted when ``unzip`` reports success (exit status 0, or 1 for
    "completed with warnings"); otherwise it is kept and a message is printed.

    Directories created during a pass are not visited in that same pass, so
    nested archives need another call.
    """
    for root, _, files in os.walk(PROCESSED_PATH):
        for file in files:
            if not file.endswith((".jar", ".zip")):
                continue

            path = os.path.join(root, file)

            try:
                # Argument list instead of a shell string: file names coming out
                # of submitted archives need no quoting and cannot inject commands.
                completed = subprocess.run(
                    ["unzip", path, "-d", "./" + root],
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    check=False,
                )
            except FileNotFoundError:
                # Raised when the unzip executable itself is not installed.
                print("Failed to unzip: ", path)
                continue

            if completed.returncode not in (0, 1):
                print("Failed to unzip: ", path)
                continue

            # Remove the archive only after a successful extraction.
            os.remove(path)


In [None]:
# First pass: unpack every .jar/.zip archive found under PROCESSED_PATH.
extract_files()

In [None]:
extract_files()  # Second pass: extract archives unpacked by the first pass, e.g. jar files that were zipped

### Remove unnecessary files

In [None]:
# Strip build output, tool metadata and leftover archives from the processed tree.
for root, subdirs, files in os.walk(PROCESSED_PATH):

    # Iterate over a copy so removed directories can be pruned from `subdirs`;
    # pruning stops os.walk from trying to descend into a tree that is gone.
    for subdir in list(subdirs):
        if subdir.startswith((".", "_")) or subdir in ("doc", "META-INF", "out"):
            shutil.rmtree(os.path.join(root, subdir))
            subdirs.remove(subdir)

    for file in files:
        lowered = file.lower()
        file_path = os.path.abspath(os.path.join("./", root, file))

        if lowered.endswith((".class", ".ctxt", "~", ".", ".pdf")):
            os.remove(file_path)
        # Archives are kept only when the directory path contains "lib"
        # (substring match, presumably meant for bundled libraries - TODO confirm).
        elif lowered.endswith((".zip", ".jar")) and 'lib' not in root:
            os.remove(file_path)

## Anonymisation
### Remove @author lines from all files

In [None]:
# Matches a Javadoc "* @author ..." line together with the whitespace before it.
# Raw string: "\s" and "\*" are regex escapes, not Python string escapes.
AUTHOR_TAG_REGEX = r'\s*\*\s*@author.+'

for root, _, files in os.walk(PROCESSED_PATH):
    for file in files:
        if not file.endswith(".java"):
            continue

        java_path = os.path.join(root, file)

        # Try UTF-8 first so files that are already UTF-8 (including ones
        # rewritten by an earlier run of this cell) are not double-encoded.
        # ISO-8859-1 accepts any byte sequence, so it is a safe fallback.
        try:
            with open(java_path, "r", encoding="utf-8") as f:
                source_code = f.read()
        except UnicodeDecodeError:
            with open(java_path, "r", encoding="ISO-8859-1") as f:
                source_code = f.read()

        source_code = re.sub(AUTHOR_TAG_REGEX, "", source_code)

        # Every file is written back as UTF-8 regardless of its original encoding.
        with open(java_path, "w", encoding='utf-8') as f:
            f.write(source_code)
