# MD5 Validator

Notebook to validate submitted PDF files against the MD5 checksums submitted by the students. The student ID must be checked manually by a tutor.  

The first stage of this notebook takes as input I1: 
- raw HTML files with submitted md5 from students
- raw pdf files submitted by students
- csv file with student information (Matrikel, Name, Mail, SS (Startsemester), PO (Prüfungsordnung))

In the first stage we generate the following output s(I1)=O1: 
- data frame and CSV file with md5 of pdf files
- data frame and CSV file with submitted md5 from students
- data frame and CSV file with valid submissions
- data frame and CSV file with invalid submissions

In the first stage, we also rename the pdf files according to the valid submissions data frame. After the first stage, the valid pdf files are graded and stored in the korrigiert directory. 

The second stage takes as input I2: 
- graded pdf files 

In the second stage we generate the following output s(I2)=O2: 
- encrypted graded pdf files

After the graded pdf files are encrypted, they are ready to be uploaded publicly. Each encrypted file uses the MD5 of the student's submission as its user password, and all files share one owner password that can open every file. 

In [None]:
# MODIFY BEFORE RUNNING STAGE 1!
# Working directory of this run, relative to the notebook. The cells below read
# ./{path}/pdf, ./{path}/md5 and ./{path}/korrigiert and write their CSV tables
# and the encrypted files underneath ./{path}.
path = 'tx'

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import glob
import re
import os

from bs4 import BeautifulSoup
import requests

import hashlib
import unicodedata

# Stage 1

In this stage we: 
- Generate MD5 from uploaded PDF files
- Store MD5 from PDF files as data frame A
- Scrape MD5 from HTML files
- Extract name from HTML files
- Store scraped MD5 and extracted names as data frame B
- Inner join A and B to determine valid submissions; store the resulting data frame C
- Read student id data and store it as data frame D
- Inner join C and D to get additional information for the valid submissions; store the resulting data frame E

Don't forget to normalize the strings and strip surrounding whitespace!

In [None]:
def list_md5(path):
    '''
    Takes in a path, returns list of strings. Iterate all HTML files in the path and scrape the submitted MD5s.

    The result has exactly one entry per ``*.html`` file, in the order glob reports the files. list_names globs the
    same pattern, so both lists can be paired position by position. Each entry is the text of the file's <body>,
    NFD-normalized, stripped of surrounding whitespace and lower-cased; a file without a <body> yields ''.
    '''
    checksums = []

    for html_file in glob.glob(path + '/*.html'):
        # open as bytes so BeautifulSoup detects the encoding itself instead of
        # depending on the platform's default text encoding
        with open(html_file, 'rb') as f:
            soup = BeautifulSoup(f, 'lxml')

        # keep one entry per file even when there is no <body>, otherwise the
        # list would no longer line up with the list of names
        text = soup.body.text if soup.body is not None else ''

        # normalize before appending; hashlib produces lower-case hex digits,
        # so fold the case to let an upper-case submission match as well
        text = unicodedata.normalize('NFD', text).strip().lower()

        checksums.append(text)

    return checksums

In [None]:
def list_names(path):
    '''
    Takes in a path, returns list of strings. Iterate all HTML files in the path and extract student name from the file name.

    The name is the part of the file's base name before the first underscore, NFD-normalized. The result has one
    entry per ``*.html`` file, in the order glob reports the files (list_md5 globs the same pattern, so the two
    lists can be paired position by position).
    '''
    names = []

    for html_file in glob.glob(path + '/*.html'):
        # remove directory; os.path handles absolute paths, paths that do not
        # start with '.', and the platform's own separator
        base_name = os.path.basename(html_file)

        # remove everything from the first _ onwards
        student, _, _ = base_name.partition('_')

        # normalize before appending
        names.append(unicodedata.normalize('NFD', student))

    return names

In [None]:
def file_to_md5(path):
    '''
    Takes in a path, returns list of strings. Iterate all files in the path and generate MD5.

    Every entry matching ``*.*`` in the path is read in binary mode and hashed. The returned list contains each
    distinct MD5 once, in the order glob reports the files.

    Side effect: a file whose MD5 was already produced by an earlier file of the same run is deleted from disk, so
    afterwards only one copy of each distinct content remains in the directory.
    '''
    md5_list = []

    # MD5s produced so far, for O(1) duplicate detection
    seen = set()

    for file in glob.glob(path + '/*.*'):
        with open(file, 'rb') as rbf:
            h = hashlib.md5(rbf.read()).hexdigest()

        # normalize before appending
        h = unicodedata.normalize('NFD', h)

        if h in seen:
            # duplicate content: delete it only now that the handle is closed,
            # because removing a file that is still open fails on Windows
            os.remove(file)
            continue

        seen.add(h)
        md5_list.append(h)

    return md5_list

In [None]:
# Data frame A: one row per distinct uploaded file, holding its MD5.
# NOTE: file_to_md5 deletes files with duplicate content from ./{path}/pdf.
df_pdf = pd.DataFrame({'MD5':file_to_md5(f'./{path}/pdf')})

In [None]:
df_pdf.head()

In [None]:
len(df_pdf)

In [None]:
# Persist the MD5 table of the uploaded files (without the row index).
df_pdf.to_csv(f'./{path}/pdf-table.csv',index=False)

In [None]:
# Data frame B: submitted MD5 and student name per HTML file. The two lists are
# produced by separate globs over the same directory and are paired by position.
df_sub = pd.DataFrame({'MD5':list_md5(f'./{path}/md5'),'Name':list_names(f'./{path}/md5')})

In [None]:
df_sub.head()

In [None]:
len(df_sub)

In [None]:
# Persist the table of submitted MD5s and names (without the row index).
df_sub.to_csv(f'./{path}/sub-table.csv',index=False)

In [None]:
# Data frame C: valid submissions, i.e. MD5s that occur both among the uploaded
# files and among the submitted checksums (inner join on MD5).
df_valid = pd.merge(df_pdf,df_sub,how='inner',on='MD5')

In [None]:
df_valid.head()

In [None]:
len(df_valid)

In [None]:
df_valid.head()

In [None]:
# df_valid['Name'] = df_valid['Name'].str.encode('cp273')

In [None]:
# df_valid.head()

Submitted but not valid: 

In [None]:
# Outer join of the valid submissions with all submissions on MD5. Both frames
# carry a 'Name' column, so the result holds one name column per input frame;
# a submission without a valid counterpart has a missing value in the first one.
df_sub_not_val = pd.merge(df_valid, df_sub, how='outer',on='MD5')

In [None]:
# Keep only the rows that contain a missing value in any column: these are the
# submissions that did not make it into the valid table.
df_sub_not_val = df_sub_not_val[df_sub_not_val.isnull().any(axis=1)]

In [None]:
df_sub_not_val

In [None]:
# Persist the invalid submissions (without the row index).
df_sub_not_val.to_csv(f'./{path}/sub-not-val-table.csv',index=False)

In [None]:
# Data frame D: student information. The file is read from the notebook's own
# directory, not from ./{path}. The cells below use its 'Name' and 'Matrikel' columns.
df_id = pd.read_csv('./ids.csv')

In [None]:
df_id.head()

In [None]:
# df_id.to_csv('./test-id2.csv', index=False)

In [None]:
# NFD-normalize the names so that they compare equal to the names extracted from
# the HTML file names (those are NFD-normalized as well). The .str accessor
# leaves a missing name as NaN instead of raising TypeError on it.
df_id['Name'] = df_id['Name'].str.normalize('NFD')

In [None]:
df_id.head()

In [None]:
len(df_id)

In [None]:
# df_id[df_id['Name']==b'Clarissa Schu\xcc\x88tt']

In [None]:
# df_id['Name'] = df_id['Name'].apply(lambda name : name.encode())

In [None]:
# Data frame E: valid submissions enriched with the student information
# (inner join on the normalized Name; valid submissions without a matching
# name in the IDs table are dropped here).
df_mnr = pd.merge(df_valid, df_id, how='inner',on='Name')

In [None]:
df_mnr.head()

In [None]:
len(df_mnr)

## Check if the following statement is TRUE!

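If the following is not `True`, the name of at least one valid submission has no match (or more than one match) in the student IDs table; update that table and rerun the cells above. 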

In [None]:
len(df_valid) == len(df_mnr)

In [None]:
# Persist the valid submissions with student information (without the row index).
df_mnr.to_csv(f'./{path}/valid-table.csv',index=False)

## Rename PDF files

Rename the valid submitted PDF files before grading them. 

In [None]:
# Row index of df_mnr -> MD5 of that valid submission. Shares its keys with
# matrikel_dic, which is how rename_file and encrypt_file pair MD5 and Matrikel.
md5_dic = df_mnr.to_dict()['MD5']

In [None]:
# Row index of df_mnr -> Matrikel of that valid submission (same keys as md5_dic).
matrikel_dic = df_mnr.to_dict()['Matrikel']

In [None]:
def rename_file(path, md5_lookup=None, matrikel_lookup=None):
    '''
    Rename every file in the path that belongs to a valid submission to '<Matrikel>.pdf'.

    path: directory with the submitted files; every entry matching ``*.*`` is hashed.
    md5_lookup: optional dict mapping a row key to an MD5 string. Defaults to the notebook-level md5_dic.
    matrikel_lookup: optional dict mapping the same row keys to the Matrikel. Defaults to the notebook-level
        matrikel_dic.

    A file whose MD5 does not occur in md5_lookup is left untouched. Returns None.
    '''
    if md5_lookup is None:
        md5_lookup = md5_dic
    if matrikel_lookup is None:
        matrikel_lookup = matrikel_dic

    # invert once (MD5 -> row key) instead of scanning the dict for every file;
    # setdefault keeps the first row when an MD5 occurs in more than one row
    key_by_md5 = {}
    for key, val in md5_lookup.items():
        key_by_md5.setdefault(val, key)

    for file in glob.glob(path + '/*.*'):
        with open(file, 'rb') as rbf:
            h = hashlib.md5(rbf.read()).hexdigest()

        if h not in key_by_md5:
            continue

        # rename only after the handle is closed (renaming an open file fails on
        # Windows) and exactly once per file; the target is built from the
        # directory that was passed in, so absolute paths work too
        os.rename(file, os.path.join(path, f'{matrikel_lookup[key_by_md5[h]]}.pdf'))

In [None]:
# Rename the valid uploads in ./{path}/pdf to <Matrikel>.pdf.
rename_file(f'./{path}/pdf')

---
**END OF STAGE 1**

---

# Stage 2

In this stage we: 

- encrypt the graded PDF files against the data frame with valid submissions

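Rerun the whole notebook up to the end of Stage 1 before starting Stage 2, so that the data frame with valid submissions (and the `md5_dic` and `matrikel_dic` dictionaries built from it) is in memory.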

In [None]:
import pikepdf
from pikepdf import Pdf

In [None]:
# Load configuration without altering the environment
from dotenv import dotenv_values
config = dotenv_values(".env")

In [None]:
# Owner password for the encrypted PDFs. The single-target unpacking requires
# the .env file to define exactly one entry; any other count raises ValueError.
OWNER, = config.values()

In [None]:
def encrypt_file(path_):
    '''
    Encrypt every graded PDF in path_ that belongs to a valid submission.

    path_: directory with the graded files. The part of each file's base name before the first '.' is looked up
        as a Matrikel in the notebook-level matrikel_dic; files without a match are skipped.

    The encrypted copy is written to ./{path}/korrigiert-e/<Matrikel>-e.pdf, where path is the notebook-level
    working directory. The owner password is OWNER, the user password is the MD5 stored in md5_dic for the same
    row. Returns None.
    '''
    out_dir = f'./{path}/korrigiert-e'

    # exist_ok: running Stage 2 a second time must not fail on the existing directory
    os.makedirs(out_dir, exist_ok=True)

    # Matrikel as text -> row key. File names are strings, whereas the Matrikel
    # values may be numbers (presumably when the CSV column is purely numeric),
    # in which case a direct comparison would never match.
    key_by_matrikel = {str(val): key for key, val in matrikel_dic.items()}

    for file in glob.glob(path_ + '/*.*'):
        # base name up to the first dot, independent of the directory depth
        file_name = os.path.basename(file).split('.')[0]

        if file_name not in key_by_matrikel:
            continue
        key = key_by_matrikel[file_name]

        # the context manager closes the PDF even when saving fails
        with Pdf.open(file) as pdf:
            pdf.save(
                f'{out_dir}/{matrikel_dic[key]}-e.pdf',
                encryption=pikepdf.Encryption(owner=OWNER, user=f'{md5_dic[key]}', R=4),
            )
            # you can change the R from 4 to 6 for 256 aes encryption

In [None]:
# Encrypt the graded files found in ./{path}/korrigiert.
encrypt_file(f'./{path}/korrigiert')

---
**END OF STAGE 2**

---

**DONE!**