# Play with Jupyter and OpenCV
Load an entire page, crop to get only the times & depths, detect all the symbols, and extract the columns of symbols:

In [None]:
import tide_ocr

# Display the cropped view of page 20.
# NOTE(review): tide_ocr.EVEN_OFFSET is presumably the crop offset used for
# even-numbered pages -- confirm in tide_ocr.
page_20 = "pages/20.png"
tide_ocr.display_cropped(page_20, tide_ocr.EVEN_OFFSET)


We can use OpenCV to extract bounding boxes for all the symbols on a page. Knowing that each page image is laid out as several columns of digits, we can then pick out individual rows of symbols from a column and view them.

In [None]:
# View the symbols from rows 8, 9, 10 & 11 of the "London Bridge" column on
# page 20.
# NOTE(review): the three trailing arguments presumably mean (column index,
# number of rows, first row) -- confirm against tide_ocr.display_column.
page_20 = "pages/20.png"
column_index, row_count, first_row = 6, 4, 8
tide_ocr.display_column(
  page_20, tide_ocr.EVEN_OFFSET, column_index, row_count, first_row
)


Now we can read all 48 pages, building up a list of unique symbols. We expect to see only 0-9, "." and "-". This works by maintaining a list of known symbols and XORing each new symbol with every known one. An XOR of two identical symbols produces an all-black result, and an XOR of two almost-identical symbols produces a result with fewer than 30 white pixels. The threshold of 30 is somewhat arbitrary: set it lower and more symbols will be treated as distinct.

In [None]:
# Build the OCR helper from the scanned page images in the "pages" directory.
# NOTE(review): 0 and 48 presumably bound the range of pages to read --
# confirm against tide_ocr.SimpleOCR.
pages_dir = "pages"
ocr = tide_ocr.SimpleOCR(pages_dir, 0, 48)


We can view the list of 64 distinct symbols it found. Clearly they are only 0-9, "." and "-", but with slight differences (e.g. a few pixels shaved off here and there).

In [None]:
# Display every distinct symbol the OCR pass collected, for visual inspection.
ocr.show_symbols()


Even though the computer thinks there are 64 distinct symbols, there are clearly only 12: the digits 0-9, "." and "-". We can visually identify them for the computer, which gives it a way to recognise each symbol programmatically. With that, we can parse all the entries in the tide table to get an array of seven columns, each containing ~1400 rows of (date, isDST, HW/LW, height):

In [None]:
# Tell the OCR which character each of the 64 distinct symbols depicts: the
# string holds one character per symbol (8 lines of 8 = 64 characters).
# NOTE(review): the order presumably follows ocr.show_symbols() -- confirm.
results = ocr.parse_all(
  "034." + "8956" +
  "1270" + "5221" +
  "3007" + "3628" +
  "8693" + "6006" +
  "9783" + "8539" +
  "3610" + "8655" +
  "3990" + "5962" +
  "6296" + "289-"
)

# Let's have a look at the data for London Bridge from page 20.
# Each entry unpacks as (datetime, DST flag, "HW"/"LW" marker, height).
# `kind` is used rather than `id` so the builtin id() is not shadowed.
for i, (dt, dst, kind, h) in enumerate(results[6][583:610]):
  print(f"results[{i:02d}] = ({dt.strftime('%Y-%m-%dT%H:%M')}, {dst}, {kind}, {h})")


Now we can get a whole year of data for London Bridge, in human-readable format:

In [None]:
# Walk the London Bridge column (results[6]) in order, printing a
# human-readable tide table and collecting data for the later cells:
#   times / deltas - each tide's datetime and its height change since the
#                    previous tide
#   flood / ebb    - histograms of the time gap to the previous tide
#   longs          - dates of tides whose gap fell in the longest bucket
BUCKET_SECONDS = 600  # histogram buckets are 10 minutes wide
FIRST_BUCKET = 32     # 32 * 10 min = 5h20, the shortest gap counted
LONGEST_BUCKET = 42   # 42 * 10 min = 7h00, the longest gap counted

flood = [0] * 11  # 11 time buckets, each 10 mins wide
ebb = [0] * 11
deltas = []
times = []
longs = []
old_date = ""
old_height = 0.0  # placeholder, so the very first delta is measured against 0.0
old_dt = results[6][0][0]
for dt, dst, kind, h in results[6]:
  # total_seconds() keeps whole days, which timedelta.seconds would drop.
  g = int((dt - old_dt).total_seconds() // BUCKET_SECONDS)
  old_dt = dt
  bucket = g - FIRST_BUCKET
  # Count only gaps that fall inside the histogram. A negative index would
  # wrap round and be tallied in the wrong bucket, and a too-large one would
  # raise IndexError. The first row has a gap of 0 and is excluded here too.
  if 0 <= bucket < len(flood):
    # A gap that ends at high water is counted as flood, otherwise as ebb.
    histogram = flood if kind == "HW" else ebb
    histogram[bucket] += 1
  d = dt.strftime("%Y-%m-%d:")
  t = dt.strftime("%H%M")
  if g == LONGEST_BUCKET:
    longs.append(d[:-1])  # drop the trailing colon
  # Show the date only on the first tide of each day; pad the rest.
  col1 = d if d != old_date else "           "
  dst_mark = "*" if dst else ""
  print(f"  {col1} {kind} {t}{dst_mark}({h}m)")
  deltas.append(abs(h - old_height))
  times.append(dt)
  old_date = d
  old_height = h


We can print the days with unusually long tides:

In [None]:
# List the dates collected in `longs`, one per line (nothing if it is empty).
if longs:
  print("\n".join(longs))


We can plot how the tide range varies from week to week:

In [None]:
from matplotlib import pyplot as plt, dates as pltdates

# Plot the height change between consecutive tides against time.
# Index 0 is skipped: its delta was measured against a 0.0 placeholder height.
window = slice(1, 300)
fig, ax = plt.subplots(1, figsize=(100, 15))
ax.plot(times[window], deltas[window])
# One major tick per week, on Tuesdays.
ax.xaxis.set_major_locator(pltdates.WeekdayLocator(byweekday=pltdates.TU))
# Rotate and align the tick labels so they look better.
fig.autofmt_xdate()
plt.show()


And we can see the distribution of durations, for flood and ebb:

In [None]:
import numpy as np

# Grouped bar chart of the flood and ebb histograms, one pair of bars per
# 10-minute duration bucket.
ranges = [
  "5h20-5h29",
  "5h30-5h39",
  "5h40-5h49",
  "5h50-5h59",
  "6h00-6h09",
  "6h10-6h19",
  "6h20-6h29",
  "6h30-6h39",
  "6h40-6h49",
  "6h50-6h59",
  "7h00-7h09",
]
bar_width = 0.4
x_axis = np.arange(len(ranges))
fig, ax = plt.subplots(1, figsize=(20, 10))
# Shift each series half a bar width either side of the tick so they sit
# next to each other.
ax.bar(x_axis - bar_width / 2, flood, width=bar_width, label="Flood")
ax.bar(x_axis + bar_width / 2, ebb, width=bar_width, label="Ebb")
ax.set_xticks(x_axis)
ax.set_xticklabels(ranges)
ax.legend()
plt.show()


Also, we can produce 2D JavaScript arrays for times and depths:

In [None]:
import datetime

# Use only as many rows as the shortest column has, so every column
# contributes the same number of entries.
num_results = min(len(column) for column in results)
zerotime = datetime.datetime(tide_ocr.YEAR, 1, 1, tzinfo=datetime.timezone.utc)

# Seed each column with one entry from 2021-12-31: a clock time written as
# HHMM and the matching depth.
# NOTE(review): this hard-codes 2021 and so presumably expects tide_ocr.YEAR
# to be 2022 -- confirm.
seed_hhmm = [2136, 2159, 2207, 2223, 2257, 2326, 2346]
seed_depths = [4.0, 4.4, 4.9, 5.4, 6.0, 6.7, 6.6]

# Convert HHMM to minutes relative to zerotime; subtracting a day's worth of
# minutes makes these seed times negative.
times = []
for hhmm in seed_hhmm:
    hours, minutes = divmod(hhmm, 100)
    times.append([60 * hours + minutes - 24 * 60])
depths = [[depth] for depth in seed_depths]

for j, column in enumerate(results):
    rows = column[:num_results]
    # Times are whole minutes since zerotime; depths are copied as-is.
    times[j].extend(int((row[0] - zerotime).total_seconds() / 60) for row in rows)
    depths[j].extend(row[3] for row in rows)

print(f"            const times = {times};")
print(f"            const depths = {depths};")


Finally, we can look for instances of unusually-low water at London Bridge, and unusually-high water at North Woolwich:

In [None]:
# Report the times of extreme readings: heights below zero in the London
# Bridge column (results[6]) and above 7.4 in the North Woolwich column
# (results[5]).
low_limit = 0
high_limit = 7.4

print("Unusually low, at London Bridge:")
for when, _dst, _kind, height in results[6]:
  if height < low_limit:
    print(f"  {when}")

print("\nUnusually high, at North Woolwich:")
for when, _dst, _kind, height in results[5]:
  if height > high_limit:
    print(f"  {when}")
