# MD5 Validator

Notebook to validate submitted pdf files against submitted md5 checksums. Student id must be checked manually by a tutor.  

The first stage of this notebook takes as input I1: 
- raw HTML files with submitted md5 from students
- raw pdf files submitted by students
- csv file with student information (Matrikel, Name, Mail, SS (Startsemester), PO (Prüfungsordnung))

In the first stage we generate the following output s(I1)=O1: 
- data frame and CSV file with md5 of pdf files
- data frame and CSV file with submitted md5 from students
- data frame and CSV file with valid submissions
- data frame and CSV file with invalid submissions

In the first stage, we also rename the pdf files according to the valid submissions data frame. After the first stage, the valid pdf files are graded and stored in the korrigiert directory. 

The second stage takes as input I2: 
- graded pdf files 

In the second stage we generate the following output s(I2)=O2: 
- encrypted graded pdf files

After the graded pdf files are encrypted, they are ready to be uploaded publicly. The encrypted files have an owner key that can open all files. 

In [None]:
# CHANGE BEFORE RUNNING SCRIPT #
# directory of the current run: the cells below read ./{path}/pdf and ./{path}/md5
# and write their CSV exports into ./{path}
path = 't5'

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import glob
import re
import os

from bs4 import BeautifulSoup
import requests

import hashlib
import unicodedata

# Stage 1: Validation

In this stage we: 
- Generate MD5 from uploaded PDF files
- Store MD5 from PDF files as data frame A
- Scrape MD5 from HTML files
- Extract name from HTML files
- Store scraped MD5 and extracted names as data frame B
- Inner join A and B to determine valid submissions; store the resulting data frame C
- Read student id data and store it as data frame D
- Inner join C and D to get additional information for the valid submissions; store the resulting data frame E

Don't forget to normalize strings and strip surrounding whitespace!

In [None]:
def list_md5(path):
    '''
    Input: directory with the HTML files of the students' MD5 submissions
    Output: list with one string per HTML file (the text of its body)
    Comment: every *.html file in the directory is parsed; its body text is
             NFD-normalized, stripped of surrounding whitespace and lower-cased
    '''
    submitted = []
    for html_path in glob.glob(path + '/*.html'):
        with open(html_path) as html_file:
            # parse the page and keep only the text inside <body>
            body_text = BeautifulSoup(html_file, 'lxml').body.text
        # same clean-up chain for every submission: normalize -> strip -> lower
        cleaned = unicodedata.normalize('NFD', body_text).strip().lower()
        submitted.append(cleaned)
    # one entry per HTML file, in the order glob listed the files
    return submitted

In [None]:
def list_names(path):
    '''
    Input: directory with HTML files of MD5 student submissions
    Output: list of student names, one per HTML file
    Comment: the student name is the part of the file name in front of the
             first '_'; a file name without '_' is returned unchanged
             (including its extension). Names are NFD-normalized.
    '''
    names = []
    for html_path in glob.glob(path + '/*.html'):
        # drop the directory part; basename works for relative paths with or
        # without a leading './' as well as for absolute paths
        file_name = os.path.basename(html_path)
        # cut off the first '_' and everything behind it
        student = re.sub(r'_.+', '', file_name)
        # normalize so the name compares equal to other NFD-normalized names
        names.append(unicodedata.normalize('NFD', student))
    # one entry per HTML file, in the order glob listed the files
    return names

In [None]:
def file_to_md5(path):
    '''
    Input: directory with PDF submissions
    Output: list of the distinct MD5 hex digests of the files in the directory
    Comment: every file matching '*.*' is hashed. When several files have
             identical content, only the first one in sorted path order is
             kept; the other copies are DELETED from the directory.
    '''
    md5_list = []
    # hashes we already met, used to detect duplicate files
    seen = set()
    # sorted: which copy of a duplicate survives must not depend on the
    # arbitrary order in which glob lists the directory
    for file in sorted(glob.glob(path + '/*.*')):
        # a sub-directory with a dot in its name also matches '*.*'
        if not os.path.isfile(file):
            continue
        # hash the raw bytes; hexdigest() is plain ASCII, so no Unicode
        # normalization is needed
        with open(file, 'rb') as rbf:
            h = hashlib.md5(rbf.read()).hexdigest()
        # the file is closed here: deleting a file that is still open fails
        # on Windows
        if h in seen:
            os.remove(file)
        else:
            seen.add(h)
            md5_list.append(h)
    # return the list of MD5s
    return md5_list

## Hashes of PDF Submissions

In [None]:
# make dataframe with hashes of submitted PDF files
df_pdf = pd.DataFrame({'MD5':file_to_md5(f'./{path}/pdf')})

In [None]:
df_pdf.head()

In [None]:
len(df_pdf)

In [None]:
# export hashes of submitted PDF files
df_pdf.to_csv(f'./{path}/pdf-hashes.csv',index=False)

## Hash Submissions with Student Names

In [None]:
# make dataframe of submitted hashes and student names
# NOTE(review): row i pairs the i-th entry of list_md5 with the i-th entry of
# list_names; both lists come from separate glob calls on the same directory, so
# this presumably relies on glob listing the files in the same order twice -- confirm
df_sub = pd.DataFrame({'MD5':list_md5(f'./{path}/md5'),'Name':list_names(f'./{path}/md5')})

In [None]:
df_sub.head()

In [None]:
len(df_sub)

In [None]:
# export submitted hashes and student names 
df_sub.to_csv(f'./{path}/md5-sub.csv',index=False)

## Valid Submissions

In [None]:
# determine valid submissions on hash
df_valid = pd.merge(df_pdf,df_sub,how='inner',on='MD5')

In [None]:
df_valid.head()

In [None]:
len(df_valid)

In [None]:
df_valid.head()

## Invalid Submissions

In [None]:
# df_valid only holds hashes that also occur in df_sub (result of the inner join),
# so after this outer join the rows with a missing value are the submitted hashes
# for which no uploaded file has the same MD5 (no file or a different file)
# NOTE(review): uploaded files without any submitted hash do not show up here,
# because df_pdf is not part of this join
df_sub_not_val = pd.merge(df_valid, df_sub, how='outer',on='MD5')

In [None]:
df_sub_not_val = df_sub_not_val[df_sub_not_val.isnull().any(axis=1)]

In [None]:
df_sub_not_val

In [None]:
# clean up dataframe
# we export later in the notebook
df_sub_not_val = df_sub_not_val.rename(columns={'Name_y':'Name'}).drop('Name_x',axis=1)

## Student Information

In [None]:
# read in student information and make dataframe
df_id = pd.read_csv('./id_clean_updated_nodups.csv')

In [None]:
df_id.head()

In [None]:
# normalize student names to NFD (the names taken from the submission file names
# are NFD-normalized as well); .str.normalize keeps missing names as NaN instead
# of raising on them
df_id['Name'] = df_id['Name'].str.normalize('NFD')

In [None]:
df_id.head()

In [None]:
len(df_id)

## Valid Submissions with Student Information

In [None]:
df_mnr = pd.merge(df_valid, df_id, how='inner',on='Name')

In [None]:
df_mnr.head()

In [None]:
len(df_mnr)

In [None]:
len(df_valid)

In [None]:
# should evaluate to True: every valid submission was matched to a student record
# if it is False, update the student information (student record missing)
# NOTE(review): a name that occurs more than once in df_id would add rows to
# df_mnr and could change the count as well -- confirm names are unique
len(df_valid) == len(df_mnr)

In [None]:
# export valid submission with student information
df_mnr.to_csv(f'./{path}/valid-sub.csv',index=False)

## Invalid Submissions with Student Information

In [None]:
# attach the student information to the invalid submissions (joined on 'Name')
# NOTE(review): how='inner' keeps only names that also occur in df_id, so invalid
# submissions without a student record are dropped here -- confirm this is intended
df_sub_not_val = pd.merge(df_sub_not_val,df_id,how='inner',on='Name')

In [None]:
df_sub_not_val.head()

In [None]:
len(df_sub_not_val)

In [None]:
# export invalid submission with student information
df_sub_not_val.to_csv(f'./{path}/invalid-sub.csv',index=False)

## Rename Files

In [None]:
# create dictionary of hashes: the 'MD5' column of the valid submissions (df_mnr)
# rename_file compares the values of this dictionary with the hash of each file
md5_dic = df_mnr.to_dict()['MD5']

In [None]:
# create dictionary of student numbers: the 'Matrikel' column of df_mnr
# rename_file looks this dictionary up with the keys of md5_dic
matrikel_dic = df_mnr.to_dict()['Matrikel']

In [None]:
def rename_file(path, md5_lookup=None, matrikel_lookup=None):
    '''
    Input: directory with submitted files (no dups);
           md5_lookup: optional dictionary key -> MD5 hex digest of a valid
                       submission (default: the notebook-level md5_dic);
           matrikel_lookup: optional dictionary key -> student number, using
                       the same keys (default: the notebook-level matrikel_dic)
    Output: None; the files in the directory are renamed in place and a
            summary is printed
    Comments: most files have a PDF extension but not all.
              A file whose hash is a known valid submission is renamed to
              '<student number><extension>', every other file to
              'no-hash-sub-<n><extension>'. '.jpeg'/'.jpg' (any case) becomes
              '.jpg'. An existing different file is never replaced.
    '''
    # fall back to the dictionaries defined at notebook level
    if md5_lookup is None:
        md5_lookup = md5_dic
    if matrikel_lookup is None:
        matrikel_lookup = matrikel_dic

    # invert the hash dictionary once (hash -> keys carrying that hash) instead
    # of scanning the whole dictionary for every file
    keys_by_hash = {}
    for key, val in md5_lookup.items():
        keys_by_hash.setdefault(val, []).append(key)

    def is_free(target, current):
        # a target name may be used if nothing exists there yet or if it is the
        # file itself (already renamed by an earlier run)
        return not os.path.exists(target) or os.path.samefile(target, current)

    # base names (path without extension) of the jpg/jpeg files that were found
    non_pdf = []

    # number of files whose hash is not a known valid submission
    count_pdf_no_hash_sub = 0
    # next number to try for a 'no-hash-sub-<n>' file name
    next_no_hash_index = 0

    # iterate over each file in the directory
    for file in glob.glob(path + '/*.*'):
        # a sub-directory with a dot in its name also matches '*.*'
        if not os.path.isfile(file):
            continue

        # get file extension
        f_name, f_extension = os.path.splitext(file)

        # turn jpeg -> jpg (case-insensitive, so '.JPG' is unified as well)
        if f_extension.lower() in ('.jpeg', '.jpg'):
            f_extension = '.jpg'
            non_pdf.append(f_name)

        # hash the content; the file is closed again before it is renamed,
        # because renaming an open file fails on Windows
        with open(file, 'rb') as rbf:
            h = hashlib.md5(rbf.read()).hexdigest()

        keys = keys_by_hash.get(h, [])
        if keys:
            # the same hash can be listed for several keys; the file can only
            # get one name, so the first key wins and the tutor is told
            if len(keys) > 1:
                print(f'Hash {h} belongs to several valid submissions; using the first one')
            target = os.path.join(path, f'{matrikel_lookup[keys[0]]}{f_extension}')
            if is_free(target, file):
                os.rename(file, target)
            else:
                # os.rename would silently replace the existing file on Unix
                print(f'Not renaming {file}: {target} already exists')
        else:
            # hash is not in the valid list: count it and give it the next
            # 'no-hash-sub-<n>' name that does not clash with another file
            count_pdf_no_hash_sub += 1
            while True:
                target = os.path.join(path, f'no-hash-sub-{next_no_hash_index}{f_extension}')
                next_no_hash_index += 1
                if is_free(target, file):
                    break
            os.rename(file, target)

    # print count of PDF submissions where hash was not submitted, i.e. PDF submission is not valid
    print(f'Number PDFs with no hash submission: {count_pdf_no_hash_sub}')
    print(non_pdf)

In [None]:
rename_file(f'./{path}/pdf')

In [None]:
import img2pdf

In [None]:
def turn_to_pdf(path):
    '''
    Input: directory with submitted files
    Output: None; every '*.jpg' file in the directory is replaced by a PDF
            with the same base name
    Comment: the base name of each converted file is printed. The jpg is only
             deleted after its PDF was written. An existing PDF with the same
             base name is overwritten.
    '''
    # iterate over all jpg files in the directory
    for jpg_file in glob.glob(path+'/*.jpg'):
        f_name, f_extension = os.path.splitext(jpg_file)
        print(f_name)
        # the PDF gets the same base name as the jpg
        pdf_file = f'{f_name}.pdf'
        # convert before opening the output: if the conversion fails, no empty
        # or truncated PDF is left behind and the jpg stays untouched
        pdf_bytes = img2pdf.convert(jpg_file)
        with open(pdf_file,'wb') as f:
            f.write(pdf_bytes)
        os.remove(jpg_file)

In [None]:
turn_to_pdf(f'./{path}/pdf')