In [1]:
# You need to install the following packages:
#   opencv-python
#   pytesseract
#   scipy
#   pandas
import re
import pytesseract
import cv2
import pandas as pd
from scipy.stats import spearmanr, pearsonr

In [2]:
# Read a png image of a table.
# cv2.imread() does not raise an error when the file is missing or
# cannot be decoded; it returns None. Check for that here so the
# problem is reported at the point where it happens.
imgpath = 'Selection_063.png'
img = cv2.imread(imgpath)
if img is None:
    raise FileNotFoundError(f'Could not read image file: {imgpath}')

In [3]:
# Convert the image to a string with Tesseract OCR.
# OpenCV stores images in BGR channel order, while pytesseract assumes
# RGB, so convert the channel order before running OCR.
text = pytesseract.image_to_string(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

The `splitlines()` method of a string object splits the string
at every line break (such as '\n', the newline character) and returns a list of lines.

In [5]:
# Preview: the first 10 lines recognized by the OCR step
text.splitlines()[:10]

['(Left Leg) _(Right Leg)',
 '',
 '1173 1304',
 '1336 1228',
 '1076 1254',
 '913 933',
 '2006 2274',
 '2126 2075',
 '1247 1309',
 '1274 1292']

The `strip()` method of a string object removes whitespace
from both ends of the string.

In [6]:
# Leading and trailing whitespace is removed;
# the space between the two numbers is kept
strObj = '  253 382 '
strObj.strip()

'253 382'

In [7]:
# Stripping an empty string gives an empty string back,
# and an empty string counts as False in a condition
strObj2 = ''
strObj2.strip()

''

Use the two methods and a List Comprehension to create a list: `pairs`.

A List Comprehension makes your code more concise, readable, and FAST!
You can read more about it here:
    https://www.w3schools.com/python/python_lists_comprehension.asp

In [8]:
# Keep every OCR line that still has content after stripping whitespace;
# empty and whitespace-only lines are dropped
pairs = [
    row
    for row in text.splitlines()
    if row.strip()
]

Depending on the quality of the image,
your text items may contain some noise.
Here are some examples:

In [9]:
# OCR noise example: a stray comma (',') at the end of the line
pairs[9]

'1696 1440,'

In [10]:
# OCR noise example: an opening parenthesis at the beginning
# (and a comma right after the first number)
pairs[36]

'(2132, 2053'

I only want numbers, not "(" or "," or any other special character.
Also, the first item of `pairs` is not data: it is a string holding the column names.

In [11]:
# The first item is the header line (column names), not a pair of values
pairs[0]

'(Left Leg) _(Right Leg)'

To replace text that matches a regular expression (regex),
use `re.sub()` from the `re` module.

In [12]:
# The regex \D matches any character that is not a digit, so replacing
# every match with '' removes the comma at the end
re.sub('\\D', '', '3538,')

'3538'

You can write a for-loop, iterating over `pairs`.
At each iteration, you will split a string (ex. '1696 1623')
using the `split()` method.

In [13]:
# split() with no argument splits the string at the whitespace in the middle
'1696 1623'.split()

['1696', '1623']

Then you can save the first string to a list that stores the
<left leg movement rates>. You can save the second string to
another list that stores the <right leg movement rates>.
Use the `append()` method of a list object.
Additionally, you can convert each string to a number before
saving, using `float()`.

You can skip the first item of `pairs`, because you _know_ that
it is just column names, not actual values.
If you _do not know_ whether that is true, you can use `re.search()` to check whether a line contains a digit.

In [14]:
# Start by preparing two empty lists
lmovrates = []
rmovrates = []
# Start a loop
for line in pairs:
    # re.search(r'\d', line) returns a match object (which counts as True)
    # if 'line' has a digit (ex. '3499) 3671') and None otherwise,
    # so lines without any digit (ex. the column names) are skipped
    if not re.search(r'\d', line):
        continue
    # line.split() gives a list of strings (ex. ['3499)', '3671']).
    # re.sub(r'\D', '', token) keeps only the digits of each string.
    # A string that ends up empty was pure noise, so it is dropped.
    # NOTE: \D also removes '.' and '-'; that is fine for the whole
    # numbers shown in this table, but not for decimals or negatives.
    movrates = [digits for digits in
                (re.sub(r'\D', '', token) for token in line.split())
                if digits]
    # A valid line has exactly one left value and one right value.
    # Report any other line instead of stopping with an IndexError or
    # ValueError, or silently pairing the wrong numbers.
    if len(movrates) != 2:
        print(f'Skipping unparsable line: {line!r}')
        continue
    # float() will transform each string of digits to a float
    # `append` each item to the corresponding list
    lmovrates.append(float(movrates[0]))
    rmovrates.append(float(movrates[1]))

You can replace the for-loop above with the following two statements
(note that `zip` produces tuples, so the results are tuples instead of lists):

In [16]:
# Split every line that contains a digit into its whitespace-separated strings
numstr_pairs = [row.split() for row in pairs if re.search(r'\d', row)]
# For each line: remove the non-digits from every string and convert it to
# a float with `map`. Then `zip(*...)` regroups the values by position:
# all first values go to one tuple, all second values to another.
lmovrates, rmovrates = zip(*(
    map(lambda token: float(re.sub('\\D', '', token)), tokens)
    for tokens in numstr_pairs
))

Here comes a detailed explanation about the TWO lines above.

`map` is a function that applies a function to every item of an iterable (such as a list).
It returns a map object.
Learn more about map object from here:
    https://realpython.com/python-map-function/

In [17]:
# map() does not show the converted values directly; the cell displays a map object
map(float, ['12', '34', '56', '78'])

<map at 0x78187ac6b1f0>

The map object in the example above can be read with `list()` function. The map object should contain four numbers, converted from strings using `float`.

In [18]:
# list() reads every item out of the map object: four floats converted from strings
list(map(float, ['12', '34', '56', '78']))

[12.0, 34.0, 56.0, 78.0]

`*` is the unpacking operator. This is often used with `zip` function.

`zip` is a function aggregating elements from multiple iterables into tuples.
(https://realpython.com/python-zip-function)
Similar to `map`, `zip` also generates a zip object.

In [19]:
# See how `zip` and `*` are used in combination.
# `*` unpacks the outer list [[12, 34], [56, 78]] into two separate
# arguments: [12, 34] and [56, 78]
# Then `zip` groups the elements of the two lists by position into tuples.
# 12 and 56 will make one tuple, and 34 and 78 will make another tuple.
# The cell displays a zip object rather than the tuples themselves.

zip(*[[12, 34], [56, 78]])

<zip at 0x78188000e800>

Check the two tuples in the zip object.

In [20]:
# list() reads the tuples out of the zip object
list(zip(*[[12, 34], [56, 78]]))

[(12, 56), (34, 78)]

More than two lists can be provided as input.

In [21]:
# With three input lists, each resulting tuple has three elements
list(zip(*[[12, 34], [56, 78], [90, 123]]))

[(12, 56, 90), (34, 78, 123)]

In [22]:
# Correlation between the left and right movement rates.
# Each function returns the coefficient together with its p-value.
rs, rs_p = spearmanr(lmovrates, rmovrates)
rp, rp_p = pearsonr(lmovrates, rmovrates)

print(f'Spearman Rho: {rs}, p-value: {rs_p}')
print(f'Pearson R: {rp}, p-value: {rp_p}')

# Export the paired values to a csv file: one column per leg, no index column
pd.DataFrame(
    {'lmovrates': lmovrates, 'rmovrates': rmovrates}
).to_csv('movrates.csv', index=False)

Spearman Rho: 0.9045627150464127, p-value: 3.8517126368172185e-14
Pearson R: 0.9541193734116105, p-value: 2.2186650538231103e-19
