# Play with Jupyter and OpenCV
Load an entire page, crop to get only the times & depths, detect all the symbols, and extract the columns of symbols:

In [None]:
import tide_ocr
from matplotlib import pyplot as plt
# Load page "2024/00.png" through tide_ocr and preview the result in a 20x20 figure.
# NOTE(review): the meaning of 900, 1100 and 7200 (presumably crop bounds in pixels)
# is defined by tide_ocr.image_load_crop_fixup -- confirm there.
cropped = tide_ocr.image_load_crop_fixup("2024/00.png", 900, 1100, 7200)
preview = tide_ocr.fix_color(cropped)
f = plt.figure(figsize=(20, 20))
plt.imshow(preview)

In [2]:
import tide_ocr

# One SimpleOCR reader per year, keyed by year. Only 2024 is active; the
# commented-out lines record the settings that were used for earlier years.
# NOTE(review): the positional arguments are defined by tide_ocr.SimpleOCR and
# are not visible from this notebook -- confirm their meaning there.
ocr = {}

#for i in range(2016, 2022):
#  ocr[i] = tide_ocr.SimpleOCR(str(i), i, 830, 7000, 1100, False, 0, 48)

#ocr[2022] = tide_ocr.SimpleOCR("2022", 2022, 900, 7200, 1470, False, 0, 48)
#ocr[2023] = tide_ocr.SimpleOCR("2023", 2023, 900, 7200, 1100, False, 0, 48)
ocr[2024] = tide_ocr.SimpleOCR("2024", 2024, 900, 7200, 1100, True, 0, 24)

We can use OpenCV to extract bounding-boxes for all the symbols on a page, and using the knowledge that each image forms several columns of digits, we can extract a row of symbols and view them.

In [None]:
# View a run of symbols from one column of page "00.png" of the 2024 reader.
# NOTE(review): an earlier comment here said rows 8, 9, 10 & 11 of the
# "London Bridge" column on page 20, which does not obviously match the
# arguments below ("00.png", True, 0, 4, 0) -- confirm what each argument means
# against tide_ocr.SimpleOCR.display_column.
ocr[2024].display_column("00.png", True, 0, 4, 0)


Now we can read all 48 pages, building up a list of unique symbols. We expect to see only 0-9, "." and "-". This works by maintaining a list of known symbols and XOR-ing each new symbol with each of them. An XOR of two identical symbols produces an all-black result; an XOR of two almost-identical symbols produces a result with fewer than 30 white pixels. The threshold of 30 is somewhat arbitrary: set it lower and more symbols will be treated as distinct.

In [None]:
# Run the OCR pass for every configured year, printing the year as a heading
# before each run and a blank line after it.
for year, reader in ocr.items():
  print(year)
  reader.do_ocr()
  print()

We can view the list of 64 distinct symbols it found. Clearly they are only 0-9, "." and "-", but slightly different (e.g. a few pixels shaved off here and there).

In [None]:
# Display the symbols held by the 2024 reader (presumably the distinct symbols
# collected by the do_ocr() run -- confirm in tide_ocr.SimpleOCR.show_symbols).
ocr[2024].show_symbols()


Even though the computer thinks there are 64 distinct symbols, there are clearly only 12: the digits 0-9, "." and "-". We can visually identify them for the computer, which gives it a way to identify each symbol programmatically. With that, we can parse all the entries in the tide table to get an array of seven columns, each containing ~1400 rows of (date, isDST, HW/LW, height):

In [None]:
from metadata import METADATA
from metadata import Location
# Parsed tide tables, keyed by year. Only 2024 is active.
# The commented-out calls below keep the hand-typed symbol strings that were
# passed to parse_all() for earlier years.
# NOTE(review): presumably the i-th character names the i-th distinct symbol
# reported by show_symbols() for that year -- confirm against
# tide_ocr.SimpleOCR.parse_all before re-enabling any of them.
results = dict()
#results[2016] = ocr[2016].parse_all(
#  "0345" + ".896" +
#  "7120" + "3859" +
#  "62"
#)
#results[2017] = ocr[2017].parse_all(
#  "0134" + ".279" +
#  "5869" + "0538" +
#  "6208" + "3628" +
#  "0623" + "9-"
#)
#results[2018] = ocr[2018].parse_all(
#  "0426" + ".138" +
#  "5973" + "0962" +
#  "8-"
#)
#results[2019] = ocr[2019].parse_all(
#  "016." + "7438" +
#  "2598" + "3059" +
#  "26-"
#)
#results[2020] = ocr[2020].parse_all(
#  "034." + "8927" +
#  "1565" + "3602" +
#  "89-"
#)
#results[2021] = ocr[2021].parse_all(
#  "047." + "1658" +
#  "3299" + "8560" +
#  "32"
#)
#results[2022] = ocr[2022].parse_all(
#  "034." + "8956" +
#  "1275" + "2021" +
#  "4073" + "8806" +
#  "3600" + "6913" +
#  "5393" + "6086" +
#  "8599" + "9162" +
#  "2962" + "8899" +
#  "-"
#)
# From 2023 on, the argument to parse_all() is looked up in METADATA instead of
# being typed inline.
#results[2023] = ocr[2023].parse_all(METADATA[2023][6])
results[2024] = ocr[2024].parse_all(METADATA[2024][Location.LONDON_BRIDGE])


Now we can get a whole year of data for London Bridge, in human-readable format:

In [None]:
# Walk one year of tides for one location: print every tide in a human-readable
# form and collect the statistics that the later cells print and plot.
YEAR = 2024
LOCATION = Location.LONDON_BRIDGE
old_date = ""
old_height = 0.0
# |height change| from the previous tide. Entry 0 is measured against the 0.0
# placeholder above, so it is not a real tide range.
deltas = []
times = []
# Histogram of the gap between consecutive tides, in whole 10-minute units.
# timedelta.seconds is always below 86400, so 144 slots cover every possible
# value and the increment below can never run off the end of the list.
g_stat = [0] * 144
old_dt = results[YEAR][LOCATION][0][0]
flood = [0] * 14  # 14 time buckets, each 10 mins wide, starting at 5h
ebb =   [0] * 14
longs = []
shorts = []
# "kind" is "HW" or something else (LW); it is deliberately not called "id" so
# the builtin id() stays usable in later cells.
for dt, dst, kind, h in results[YEAR][LOCATION]:
  g = (dt - old_dt).seconds // 600
  g_stat[g] += 1
  if g != 0:
    # Bucket 0 starts at 5h00, which is 30 ten-minute units.
    bucket = g - 30
    # Only gaps inside 5h00-7h19 belong in the histogram. Without this check a
    # gap under 5h would produce a negative index and be counted in the wrong
    # bucket at the far end of the list, and a gap of 7h20 or more would raise
    # IndexError. Out-of-range gaps are still recorded in g_stat.
    if 0 <= bucket < len(flood):
      if kind == "HW":
        # The interval ending at a high water is counted as flood.
        flood[bucket] += 1
      else:
        ebb[bucket] += 1
  old_dt = dt
  d = dt.strftime("%Y-%m-%d:")
  t = dt.strftime("%H%M")
  if g == 31:  # 5h10-5h19
    shorts.append(dt)
  if g == 43:  # 7h10-7h19
    longs.append(dt)
  # Print the date only on the first tide of each day; pad to the same width otherwise.
  col1 = d if d != old_date else "           "
  print(f"{col1} {kind} {t}{'*' if dst else ''}({h}m)")
  deltas.append(abs(h - old_height))
  times.append(dt)
  old_date = d
  old_height = h


In [None]:
# List every non-empty gap bucket. Bucket 0 is skipped: it is only hit by the
# first tide, whose gap is measured against itself.
print("Buckets:")
for bucket, tally in enumerate(g_stat):
  if bucket != 0 and tally > 0:
    print(f"  {bucket}: {tally}")


We can print the unusually short and unusually long tides:

In [None]:
import datetime

def to_str(dt):
  """Render a datetime as "YYYY-MM-DDTHH:MMZ" (seconds are dropped)."""
  return dt.strftime("%Y-%m-%dT%H:%MZ")

# Print the collected short and long tides, one heading per group.
for heading, group in (
  ("Short-duration tides:", shorts),
  ("\nLong-duration tides:", longs),
):
  print(heading)
  for tide in group:
    print(f"  {to_str(tide)}")


We can plot how the daily mean tide range varies over the year, with a tick mark for each week:

In [None]:
import datetime
from itertools import groupby

from matplotlib import pyplot as plt, dates as pltdates

# Mean tide range per calendar day: x holds each day (as a midnight datetime)
# and y the mean of that day's height changes.
x = []
y = []
# Entry 0 is skipped because its delta was measured against an initial height
# of 0.0 rather than a real tide.
samples = zip(times[1:], deltas[1:])
# groupby() only ever yields non-empty groups, so the division below cannot
# divide by zero, and the builtin sum() is used instead of a global named
# "sum" that would shadow it for every later cell.
for day_ordinal, day_samples in groupby(samples, key=lambda sample: sample[0].toordinal()):
  day_deltas = [delta for _, delta in day_samples]
  x.append(datetime.datetime.fromordinal(day_ordinal))
  y.append(sum(day_deltas) / len(day_deltas))

# One major tick per week (every Tuesday) keeps the date axis readable.
days = pltdates.WeekdayLocator(byweekday=pltdates.TU)
fig, ax = plt.subplots(1, figsize=(100, 15))
ax.plot(x, y)
ax.xaxis.set_major_locator(days)
fig.autofmt_xdate()
plt.show()


And we can find the peaks and troughs (i.e. the Springs & Neaps):

In [None]:
from pandas import Series
from scipy.signal import find_peaks

# Local maxima of the daily mean range are reported as springs; local maxima of
# the negated series (i.e. local minima) are reported as neaps.
s = Series(y)
springs, _ = find_peaks(s)
neaps, _ = find_peaks(-s)
for heading, peaks in (("Springs:", springs), ("\nNeaps:", neaps)):
  print(heading)
  for k in peaks:
    # str(datetime)[:10] keeps just the YYYY-MM-DD part.
    print(f"  {str(x[k])[:10]}: {y[k]:.3f}")


And we can see the distribution of durations, for flood and ebb:

In [None]:
import numpy as np

# Tick labels for the 14 ten-minute buckets from "5h00-5h09" to "7h10-7h19".
# Each bucket starts on a multiple of 10 minutes, so start and end always fall
# in the same hour.
ranges = [
  f"{start // 60}h{start % 60:02d}-{start // 60}h{start % 60 + 9:02d}"
  for start in range(5 * 60, 7 * 60 + 20, 10)
]
fig, ax = plt.subplots(1, figsize=(20, 10))
x_axis = np.arange(len(ranges))
# Draw flood and ebb side by side: each bar is 0.4 wide and shifted 0.2 either
# side of the tick.
for offset, counts, label in ((-0.2, flood, "Flood"), (0.2, ebb, "Ebb")):
  ax.bar(x_axis + offset, counts, width=0.4, label=label)
plt.xticks(x_axis, ranges)
plt.legend()
plt.show()


Finally, we can look for instances of unusually-low water at London Bridge, and unusually-high water at North Woolwich:

In [None]:
import datetime

# Heights (in the units stored in results) beyond which a tide is reported.
LOW_LIMIT = 0.1
HIGH_LIMIT = 7.6

# Each row is indexed as row[0] = datetime and row[3] = height.
print("Unusually low, at London Bridge:")
for row in results[YEAR][Location.LONDON_BRIDGE]:
  when, height = row[0], row[3]
  if height < LOW_LIMIT:
    print(f"  {to_str(when)}: {height}m")

print("\nUnusually high, at North Woolwich:")
for row in results[YEAR][Location.NORTH_WOOLWICH]:
  when, height = row[0], row[3]
  if height > HIGH_LIMIT:
    print(f"  {to_str(when)}: {height}m")
