Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions find_duplicates.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy
import numpy.typing
from typing import Iterable, Optional, Self


Expand Down Expand Up @@ -71,7 +72,7 @@ def __str__(self) -> str: # Used solely for debugging


def _initialize_segments(
matrix: numpy.ndarray, is_single_file: bool
matrix: numpy.typing.NDArray[numpy.uint8], is_single_file: bool
) -> tuple[list[_SegmentUnionFind], dict[_Coordinates, _SegmentUnionFind]]:
"""
Each _SegmentUnionFind we return has size at least 2: these have already
Expand Down Expand Up @@ -111,7 +112,7 @@ def _initialize_segments(


def _get_pixel_to_segment(
matrix: numpy.ndarray, is_single_file: bool
matrix: numpy.typing.NDArray[numpy.uint8], is_single_file: bool
) -> dict[_Coordinates, _SegmentUnionFind]:
"""
If is_single_file is set, we do not include pixels on the main diagonal,
Expand Down Expand Up @@ -158,7 +159,9 @@ def key(
return pixel_to_segment


def get_lengths(matrix: numpy.ndarray, is_single_file: bool) -> numpy.ndarray:
def get_lengths(
matrix: numpy.typing.NDArray[numpy.uint8], is_single_file: bool
) -> numpy.typing.NDArray[numpy.uint32]:
"""
We return an image whose pixels indicate how long a chain of nonzero values
from the original matrix is. If is_single_file is set, the main diagonal
Expand All @@ -175,7 +178,7 @@ def get_lengths(matrix: numpy.ndarray, is_single_file: bool) -> numpy.ndarray:


def get_segments(
matrix: numpy.ndarray, is_single_file: bool
matrix: numpy.typing.NDArray[numpy.uint8], is_single_file: bool
) -> set[_SegmentUnionFind]:
"""
We return a set of _SegmentUnionFinds describing all the segments we found in
Expand Down Expand Up @@ -247,7 +250,11 @@ def update_candidate(candidate: _SegmentUnionFind) -> None:
return best_candidate


def get_hues(matrix: numpy.ndarray, is_single_file: bool) -> numpy.ndarray:
def get_hues(
matrix: numpy.typing.NDArray[numpy.uint8], is_single_file: bool
) -> numpy.typing.NDArray[numpy.uint8]:
# Scores are going to start out as uint32's, but get turned into floats.
scores: numpy.typing.NDArray
scores = get_lengths(matrix, is_single_file)
# Cut everything off at the max, then divide by the max to put all values
# between 0 and 1.
Expand Down
9 changes: 5 additions & 4 deletions gui.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from math import ceil
import numpy
import numpy.typing
import tkinter as tk
import tkinter.font as tkfont
from typing import Optional
Expand Down Expand Up @@ -130,8 +131,8 @@ def display(self, pixel: int) -> None:
class _Gui(tk.Frame):
def __init__(
self,
matrix: numpy.ndarray,
hues: Optional[numpy.ndarray],
matrix: numpy.typing.NDArray[numpy.uint8],
hues: Optional[numpy.typing.NDArray[numpy.uint8]],
data_a: FileInfo,
data_b: FileInfo,
map_width: int,
Expand All @@ -154,8 +155,8 @@ def _on_motion(self, event: tk.Event) -> None:


def launch(
matrix: numpy.ndarray,
hues: Optional[numpy.ndarray],
matrix: numpy.typing.NDArray[numpy.uint8],
hues: Optional[numpy.typing.NDArray[numpy.uint8]],
data_a: FileInfo,
data_b: FileInfo,
map_width: int,
Expand Down
9 changes: 5 additions & 4 deletions image_pyramid.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy
import numpy.typing
from typing import Optional

import utils
Expand All @@ -9,8 +10,8 @@ class ImagePyramid:

def __init__(
self,
matrix: numpy.ndarray,
hues: Optional[numpy.ndarray],
matrix: numpy.typing.NDArray[numpy.uint8],
hues: Optional[numpy.typing.NDArray[numpy.uint8]],
sidelength: int,
) -> None:
"""
Expand All @@ -20,7 +21,7 @@ def __init__(
self._pyramid.append(matrix)
self._sidelength = sidelength

self._hue_pyramid: Optional[list[numpy.ndarray]]
self._hue_pyramid: Optional[list[numpy.typing.NDArray[numpy.uint8]]]
if hues is None:
self._hue_pyramid = None
else:
Expand Down Expand Up @@ -85,7 +86,7 @@ def __init__(

def get_submatrix(
self, top_left_x: int, top_left_y: int
) -> tuple[numpy.ndarray, int, int]:
) -> tuple[numpy.typing.NDArray[numpy.uint8], int, int]:
"""
We return a sidelength-by-sidelength-by-3 ndarray containing an HSV
image of the relevant region, and the indices of the top-left corner.
Expand Down
19 changes: 18 additions & 1 deletion tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import code_tokenize
import numpy
import numpy.typing
from typing import NamedTuple, Optional

from code_tokenize.tokens import ASTToken
Expand All @@ -9,11 +10,27 @@

# Syntactic sugar: a Boundary contains the start and end of a token, where
# each position is described by its line number and the column within the line.
# The first line of the file is line 1, but the first column of the line is
# column 0. The end is the first location *after* the end of the token (which
# might be 1 character past the end of the current line, if this is the last
# token on the line).
# Example: This file:
# print("hi")
# print("bye")
# Likely has these boundaries:
# ((1, 0), (1, 5)) print
# ((1, 5), (1, 6)) (
# ((1, 6), (1, 10)) "hi"
# ((1, 10), (1, 11)) )
# ((2, 0), (2, 5)) print
# ((2, 5), (2, 6)) (
# ((2, 6), (2, 11)) "bye"
# ((2, 11), (2, 12)) )
Boundary = tuple[tuple[int, int], tuple[int, int]]


class FileInfo(NamedTuple):
tokens: numpy.ndarray # Really a list[code_tokenize.tokens.Token]
tokens: numpy.typing.NDArray[numpy.str_]
lines: list[str]
boundaries: list[Boundary]
filename: str
Expand Down
15 changes: 8 additions & 7 deletions utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import numpy
import numpy.typing
from typing import Optional


PIXELS_IN_BIG_FILE = 50 * 1000 * 1000 # 50 megapixels


def to_hsv_matrix(matrix: numpy.ndarray,
hues: Optional[numpy.ndarray]) -> numpy.ndarray:
def to_hsv_matrix(
matrix: numpy.typing.NDArray[numpy.uint8],
hues: Optional[numpy.typing.NDArray[numpy.uint8]],
) -> numpy.typing.NDArray[numpy.uint8]:
"""
The matrix is a 2D array of uint8's. The hues are either None or another 2D
array of the same shape.
Expand All @@ -22,12 +25,10 @@ def to_hsv_matrix(matrix: numpy.ndarray,
return result


# The two arguments to make_matrix both have type
# list[code_tokenize.tokens.ASTToken], but that module does not have type
# annotations and adding them in would be annoying.
def make_matrix(
tokens_a: numpy.ndarray, tokens_b: numpy.ndarray
) -> numpy.ndarray:
tokens_a: numpy.typing.NDArray[numpy.str_],
tokens_b: numpy.typing.NDArray[numpy.str_]
) -> numpy.typing.NDArray[numpy.uint8]:
matrix = numpy.zeros([len(tokens_a), len(tokens_b)], dtype=numpy.uint8)
for i, value in enumerate(tokens_a):
matrix[i, :] = (tokens_b == value)
Expand Down
9 changes: 6 additions & 3 deletions zoom_map.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from functools import partial
import numpy
import numpy.typing
import PIL.Image
import PIL.ImageTk
import tkinter as tk
Expand All @@ -11,8 +12,8 @@ class ZoomMap(tk.Canvas):
def __init__(
self,
tk_parent: tk.Widget,
matrix: numpy.ndarray,
hues: Optional[numpy.ndarray],
matrix: numpy.typing.NDArray[numpy.uint8],
hues: Optional[numpy.typing.NDArray[numpy.uint8]],
sidelength: int,
) -> None:
super().__init__(tk_parent, height=sidelength, width=sidelength,
Expand Down Expand Up @@ -110,7 +111,9 @@ def _on_unclick(self, event: tk.Event) -> None:
self._set_image()

@staticmethod
def _to_image(matrix: numpy.ndarray) -> PIL.ImageTk.PhotoImage:
def _to_image(
matrix: numpy.typing.NDArray[numpy.uint8]
) -> PIL.ImageTk.PhotoImage:
image = PIL.Image.fromarray(matrix, mode="HSV")
return PIL.ImageTk.PhotoImage(image)

Expand Down