In [7]:
from pyunpack import Archive
import requests
import os

directory_path = './Dokaz'

def create_directory(directory_path):
    try:
        os.makedirs(directory_path, exist_ok=True)
        print(f"Directory '{directory_path}' created or already exists.")
    except Exception as e:
        print(f"Failed to create directory '{directory_path}': {e}")


def download_file_with_custom_headers(url, local_filename):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    
    with requests.get(url, headers=headers, stream=True) as r:
        r.raise_for_status()  # This will throw an exception for non-200 responses
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192): 
                f.write(chunk)
    print("File downloaded successfully.")


# Example usage
file_url = 'https://github.com/toperkov/RF-FESB-2023-24/raw/main/lab2/Download/Dokaz.zip'
output_dir = './Dokaz.zip'
download_file_with_custom_headers(file_url, output_dir)
create_directory(directory_path)

Archive('Dokaz.zip').extractall("Dokaz")


File downloaded successfully.
Directory './Dokaz' created or already exists.


In [27]:
import os
import pandas as pd
import hashlib
import magic
import mimetypes

# specify the directory path where the files are located
dir_path = './Dokaz/'

# create an empty list to store the file names
file_names = []
extensions = []
md5s = []
sha1s = []
sha256s = []
magic_numbers = []
extension_matches = []

f = magic.Magic()

check_hash = "c15e32d27635f248c1c8b66bb012850e5b342119"
real_file = ""


def file2hash(filename, hash_function):
    with open(filename, 'rb', buffering=0) as f:
        return hash_function(f.read()).hexdigest()


# iterate through all files in the directory
for file in os.listdir(dir_path):
    # check if the file is a regular file (i.e., not a directory)
    path = os.path.join(dir_path, file)
    
    if os.path.isfile(os.path.join(dir_path, file)):
        # if so, add the file name to the list
        name, extension = os.path.splitext(file)
        md5 = file2hash(path,hashlib.md5)
        md5s.append(md5)
        sha1 = file2hash(path,hashlib.sha1)
        sha1s.append(sha1)
        sha256 = file2hash(path,hashlib.sha256)
        sha256s.append(sha256)

        if (sha256 == check_hash or sha1 == check_hash or md5 == check_hash):
            real_file = file

        file_names.append(name)
        extensions.append(extension)
        
        magic_number = f.from_file(path)
        magic_numbers.append(magic_number)
        # check if the magic number contains the file extension
        if extension.lower() == '':
            extension_matches.append(False)
        elif mimetypes.guess_type('test'+extension.lower())[0] in magic_number.lower():
            extension_matches.append(True)
        else:
            extension_matches.append(False)

# create a Pandas dataframe with the file names
df = pd.DataFrame({'file_name': file_names, 'extension': extensions, 'md5': md5s, 'magic_numbers': magic_numbers, 'extension_matches': extension_matches})

# print the dataframe
print(df)
print(f"Stolen file: {real_file}")

        file_name extension                               md5  \
0  Secret_file_11      .txt  187b0341b64a0e16d74ea3b50e22e6c2   
1  Secret_file_12      .pdf  a0c34dd882f1e7e5af2341b1aadb532c   
2  Secret_file_22      .png  40bba5dd7b99c5e99d0877993ec3f3fb   
3  Secret_file_48     .docx  7a2fdf83ebce571af7fcb93a8825ead1   
4  Secret_file_49      .pdf  40bba5dd7b99c5e99d0877993ec3f3fb   
5  Secret_file_52      .jpg  8de0128acae226c00efb5f98a9486e68   
6  Secret_file_72     .docx  098cb2f52dda9e3195075d765a9ff01f   
7  Secret_file_92      .jpg  4266ea20320c598bc5d7b1d731e3e9e9   

                                       magic_numbers  extension_matches  
0       UTF-8 Unicode text, with no line terminators              False  
1                          PDF document, version 1.3              False  
2                          PDF document, version 1.3              False  
3                               Microsoft Word 2007+              False  
4                          PDF document, ver