In [3]:
# Path of the ZIP archive that the following cells list, read and hex-dump.
filepath = "example.zip"

In [1]:
import zipfile

In [4]:
# Print the name of every member recorded in the archive.
with zipfile.ZipFile(filepath, "r") as myzip:
    for member_name in myzip.namelist():
        print(member_name)

example/
example/dir/
example/dir/file2
example/file1


In [5]:
# Read the full contents of one member as bytes and show them.
with zipfile.ZipFile(filepath, "r") as myzip:
    print(myzip.read("example/file1"))

b'file1\n'


In [6]:
def dump(body):
    """Print a hex dump of ``body`` to standard output.

    A column-header line is printed first. Each following line shows the
    8-digit hexadecimal offset of its first byte, up to 16 byte values as
    two-digit hex, and the same bytes as text: values 0x20-0x7E are shown as
    their ASCII character, everything else as ".". A short final line is
    padded so the text column stays aligned.

    Every printed line ends with a newline. The offset label is written only
    when a row actually has a byte to show, so input whose length is a
    multiple of 16 (including empty input) produces no dangling label.

    Args:
        body: Iterable of integer byte values, e.g. ``bytes`` or ``bytearray``.
    """
    bytes_per_line = 16
    print("         0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F")
    text_column = ""
    for index, byte in enumerate(body):
        if index % bytes_per_line == 0:
            # Start of a new row: emit its offset label lazily.
            print(f"{index:08X}:", end="")
        print(f"{byte:02X} ", end="")
        text_column += chr(byte) if 0x20 <= byte <= 0x7E else "."
        if len(text_column) == bytes_per_line:
            print(" ", text_column)
            text_column = ""
    if text_column:
        # Pad the missing hex cells (3 characters each) of a partial last row.
        print("   " * (bytes_per_line - len(text_column)), end="")
        print(" ", text_column)


def dump2(filepath):
    """Hex-dump the file at ``filepath`` by passing its raw bytes to ``dump``."""
    with open(filepath, "rb") as stream:
        dump(stream.read())

In [7]:
# Hex-dump the whole file so its raw byte layout can be inspected.
dump2(filepath)

         0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
00000000:50 4B 03 04 00 00 00 08 00 00 EE 5E AE 5A 00 00   PK.........^.Z..
00000010:00 00 00 00 00 00 00 00 00 00 08 00 00 00 65 78   ..............ex
00000020:61 6D 70 6C 65 2F 50 4B 03 04 00 00 00 08 00 00   ample/PK........
00000030:EE 5E AE 5A 00 00 00 00 00 00 00 00 00 00 00 00   .^.Z............
00000040:0C 00 00 00 65 78 61 6D 70 6C 65 2F 64 69 72 2F   ....example/dir/
00000050:50 4B 03 04 14 00 08 08 08 00 EE 5E AE 5A 00 00   PK.........^.Z..
00000060:00 00 00 00 00 00 00 00 00 00 11 00 00 00 65 78   ..............ex
00000070:61 6D 70 6C 65 2F 64 69 72 2F 66 69 6C 65 32 01   ample/dir/file2.
00000080:06 00 F9 FF 66 69 6C 65 32 0A 50 4B 07 08 C7 A4   ....file2.PK....
00000090:04 C9 0B 00 00 00 06 00 00 00 50 4B 03 04 14 00   ..........PK....
000000A0:08 08 08 00 EE 5E AE 5A 00 00 00 00 00 00 00 00   .....^.Z........
000000B0:00 00 00 00 0D 00 00 00 65 78 61 6D 70 6C 65 2F   ........example/
000000C0:66 69 6C 65 31 01 06 00

In [8]:
with open(filepath, "rb") as f:
    body = f.read()

# Locate the End of Central Directory record by its 4-byte signature
# ("PK" followed by 0x05 0x06), taking the last occurrence in the file.
offset_eocd = body.rfind(b"PK\x05\x06")
print(format(offset_eocd, "08X"))

000001CA


In [9]:
import struct

with open(filepath, "rb") as f:
    body = f.read()

# Decode the 22 bytes starting at the signature as little-endian fields:
# a 4-byte signature, four 16-bit values, two 32-bit values and one more
# 16-bit value.
offset_eocd = body.rfind(b"PK\x05\x06")
eocd_end = offset_eocd + 22
body_eocd = struct.unpack("<4s4H2LH", body[offset_eocd:eocd_end])
print(body_eocd)

(b'PK\x05\x06', 0, 0, 4, 4, 234, 224, 0)


In [10]:
import struct

with open(filepath, "rb") as f:
    body = f.read()

offset_eocd = body.rfind(b"PK\x05\x06")
body_eocd = struct.unpack("<4s4H2LH", body[offset_eocd : offset_eocd + 22])

# Field 4 of the record is used as the number of central directory entries
# and field 6 as the file offset of the first entry.
entry_count = body_eocd[4]
offset = body_eocd[6]
offset_cd = [0] * entry_count
for i in range(entry_count):
    offset_cd[i] = offset
    # Three 16-bit lengths are stored at bytes 28..33 of an entry; added to
    # the 46 fixed bytes they give the distance to the next entry.
    n, m, k = struct.unpack("<3H", body[offset + 28 : offset + 34])
    print(f"{offset:08X}:{n},{m},{k}")
    offset += 46 + n + m + k

000000E0:8,0,0
00000116:12,0,0
00000150:17,0,0
0000018F:13,0,0


In [12]:
import zlib

with open(filepath, "rb") as f:
    body = f.read()

offset_eocd = body.rfind(b"PK\x05\x06")
body_eocd = struct.unpack("<4s4H2LH", body[offset_eocd : offset_eocd + 22])

# Field 4 is used as the entry count, field 6 as the offset of the first
# central directory entry.
entry_count = body_eocd[4]
offset = body_eocd[6]
offset_cd = [0] * entry_count
offset_lf = [0] * entry_count

for i in range(entry_count):
    offset_cd[i] = offset
    n, m, k = struct.unpack("<3H", body[offset + 28 : offset + 34])

    # Bytes 42..45 of a central directory entry hold the offset of the
    # matching local file header.
    (lf_start,) = struct.unpack("<L", body[offset + 42 : offset + 46])
    offset_lf[i] = lf_start

    # Local file header: 30 fixed bytes, then the name (n2 bytes) and the
    # extra field (m2 bytes); the entry data follows those.
    header_ld = struct.unpack("<4s5H3L2H", body[lf_start : lf_start + 30])
    method = header_ld[3]
    n2, m2 = header_ld[9], header_ld[10]
    name_start = lf_start + 30
    filename = body[name_start : name_start + n2].decode()
    print(f"{lf_start:08X}: {filename}, {method}")

    if method == 8:
        # A window size of -15 makes zlib treat the data as a raw deflate
        # stream, i.e. it does not expect a header or trailer.
        print(zlib.decompress(body[name_start + n2 + m2 :], -15))

    offset += 46 + n + m + k

00000000: example/, 0
00000026: example/dir/, 0
00000050: example/dir/file2, 8
b'file2\n'
0000009A: example/file1, 8
b'file1\n'


In [13]:
def zip_recovery(body):
    """Scan raw bytes for ZIP local file headers and print what can be recovered.

    The central directory is not consulted, so this also works on a truncated
    archive or on a fragment cut out of one. For every local file header
    signature found, one line ``<offset>:<filename>,<method>`` is printed
    (offset in hexadecimal, relative to the start of ``body``). For entries
    whose compression method is 8 the inflated data is printed as well.

    Damaged input does not abort the scan:

    * a header cut off by the end of ``body`` is reported and ends the scan;
    * filename bytes that are not valid UTF-8 are shown as U+FFFD;
    * a corrupt deflate stream is reported and the scan moves on;
    * an incomplete deflate stream prints the bytes recovered so far,
      followed by a note.

    Args:
        body: ``bytes`` or ``bytearray`` holding all or part of a ZIP file.
    """
    signature = b"PK\x03\x04"
    header_format = "<4s5H3L2H"
    header_size = struct.calcsize(header_format)  # 30 bytes

    offset_lf = 0
    while True:
        offset_lf = body.find(signature, offset_lf)
        if offset_lf == -1:
            break
        if len(body) - offset_lf < header_size:
            # Not enough bytes left for a full header; nothing later can match.
            print(f"{offset_lf:08X}:<truncated local file header>")
            break
        header_lf = struct.unpack(
            header_format, body[offset_lf : offset_lf + header_size]
        )
        method = header_lf[3]
        (n, m) = header_lf[9:11]  # filename length, extra field length
        name_start = offset_lf + header_size
        filename = body[name_start : name_start + n].decode(errors="replace")
        print(f"{offset_lf:08X}:{filename},{method}")
        if method == 8:
            # wbits=-15 selects a raw deflate stream (no header/trailer).
            # The compressed size is not read from the header; the rest of the
            # buffer is offered and inflation stops at the end of the stream.
            inflater = zlib.decompressobj(-15)
            try:
                data = inflater.decompress(body[name_start + n + m :])
            except zlib.error as exc:
                print(f"<deflate data is corrupt: {exc}>")
            else:
                print(data)
                if not inflater.eof:
                    print("<deflate stream is incomplete; output above is partial>")
        offset_lf += header_size + n + m

In [14]:
# Simulate a truncated file: keep only the first 0x100 bytes, which cuts off
# the End of Central Directory record (found at offset 0x1CA in an earlier cell).
zip_recovery(body[:0x100])

00000000:example/,0
00000026:example/dir/,0
00000050:example/dir/file2,8
b'file2\n'
0000009A:example/file1,8
b'file1\n'


In [15]:
# Same truncated data with its first 50 bytes (decimal) removed as well.
# Offsets printed by zip_recovery are relative to the start of the slice.
zip_recovery(body[50:0x100])

0000001E:example/dir/file2,8
b'file2\n'
00000068:example/file1,8
b'file1\n'


In [16]:
# NOTE: rebinds `filepath` (previously the ZIP archive) to the PDF file that
# the remaining cells read.
filepath="test_file.pdf"

In [17]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-5.6.1-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.6.1-py3-none-any.whl (304 kB)
Installing collected packages: pypdf
Successfully installed pypdf-5.6.1


In [18]:
from pypdf import PdfReader

reader = PdfReader(filepath)

# Document-level metadata (producer, title, author, ...).
metadata = reader.metadata
print(metadata)

# Page count, taken from the reader's page sequence.
pages = reader.pages
n_of_pages = len(pages)
print(f"Total {n_of_pages} pages.")

# Text extracted from the first page only.
page = pages[0]
text = page.extract_text()
print(text)

{'/Producer': 'GPL Ghostscript 9.07', '/CreationDate': "D:20141210171630-05'00'", '/ModDate': "D:20141210171630-05'00'", '/Creator': 'LilyPond 2.18.2', '/Author': 'Christopher Antila', '/Title': 'Test File', '/Subtitle': 'Movement title', '/Composer': 'Christopher Antila'}
Total 1 pages.
 
Test File
Movement title
Christopher Antila
/noteheads.sM1 /noteheads.s0/accidentals.doublesharp/accidentals.flat
/scripts.umarcato/noteheads.sM1/accidentals.natural
/noteheads.s2
/noteheads.s2/noteheads.s2/accidentals.flatflat
/flags.d3
/rests.2
/noteheads.s2/noteheads.uM2/accidentals.sharp/noteheads.uM2/accidentals.natural
43/accidentals.flat/accidentals.flat/clefs.C_change
43/accidentals.flat/accidentals.flat
3
/rests.2
/noteheads.s2/noteheads.s2/noteheads.s2/accidentals.sharp/noteheads.s2/noteheads.s2/noteheads.s2/noteheads.s2
/flags.u3
/rests.3
/noteheads.s2
/scripts.staccato
/noteheads.s2
/clefs.F/noteheads.s2
88/accidentals.flat/clefs.G
/scripts.staccato
/noteheads.s2
/accidentals.flat
/bracke

In [19]:
!pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting charset-normalizer>=2.0.0 (from pdfminer.six)
  Using cached charset_normalizer-3.4.2-cp313-cp313-macosx_10_13_universal2.whl.metadata (35 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six)
  Using cached cryptography-45.0.4-cp311-abi3-macosx_10_9_universal2.whl.metadata (5.7 kB)
Collecting cffi>=1.14 (from cryptography>=36.0.0->pdfminer.six)
  Using cached cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl.metadata (1.5 kB)
Collecting pycparser (from cffi>=1.14->cryptography>=36.0.0->pdfminer.six)
  Using cached pycparser-2.22-py3-none-any.whl.metadata (943 bytes)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m315.8 kB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25hUsing cached charset_normalizer-3.4.2-cp313-cp313-macosx_10_13_universal2.whl (199 kB)
Downloading cryptography-45.0.4-c

In [20]:
import pdfminer.pdfparser
import pdfminer.pdfdocument

# The parser reads from the open stream, so the document is built and its
# info printed before the file is closed.
with open(filepath, "rb") as pdf_stream:
    parser = pdfminer.pdfparser.PDFParser(pdf_stream)
    document = pdfminer.pdfdocument.PDFDocument(parser)
    print(document.info)

[{'Producer': b'GPL Ghostscript 9.07', 'CreationDate': b"D:20141210171630-05'00'", 'ModDate': b"D:20141210171630-05'00'", 'Creator': b'LilyPond 2.18.2', 'Author': b'Christopher Antila', 'Title': b'Test File', 'Subtitle': b'Movement title', 'Composer': b'Christopher Antila'}]
