In [1]:
import json

# Location of the content-list JSON that the following cells filter by "type".
json_file = 'data/manufacturing-unit1/auto/manufacturing-unit1_content_list.json'

# Parse directly from the file handle. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's default locale encoding, and `data`
# is only ever bound to the parsed object (never to the raw text).
with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [2]:
# Split the parsed entries by their "type" field, keeping the original order:
# entries typed "table" go to `tables`, entries typed "image" go to `images`;
# every other entry is ignored.
tables = [entry for entry in data if entry.get("type") == "table"]
images = [entry for entry in data if entry.get("type") == "image"]

In [3]:
# Display the collected table entries (bare last expression -> notebook repr).
tables

[{'type': 'table',
  'img_path': 'images/77752325b37fca96e9566d52a5ffd6e4bf263bbf2119392810a62c4296efe3da.jpg',
  'table_caption': ['Table 1.1 '],
  'table_footnote': [],
  'page_idx': 22},
 {'type': 'table',
  'img_path': 'images/bf7323f0ad7886a60cd8b197b9ff34ab566239ec183aab27285e3270be3ebaf5.jpg',
  'table_caption': ['Table 2.1 '],
  'table_footnote': [],
  'page_idx': 31},
 {'type': 'table',
  'img_path': 'images/464934ab2ce0b04dfba26350a325df37dacfea69b1e6ed6bb18c113a25ddce59.jpg',
  'table_caption': ['Table 3.1 ',
   'Note: For comparison, tensile strength of Iron is $270\\;\\mathrm{N/mm}^{2}$ '],
  'table_footnote': [],
  'page_idx': 34}]

In [4]:
import re
# Patterns for a reference at the very start of a caption, e.g. "Fig. 1.4 (a)",
# "Figure 2" or "Table 3.1": a keyword, one whitespace character, a number with
# an optional ".N" part and an optional "(letter)" suffix.
image_pattern = re.compile(r"^(Fig\.|Figure)\s(\d+(\.\d+)?(\s*\([a-z]\))?)")
table_pattern = re.compile(r"^(Table)\s(\d+(\.\d+)?(\s*\([a-z]\))?)")


def _annotate_captions(items, caption_key, pattern):
    """Attach 'ref' and 'description' to each item based on its captions.

    Args:
        items: list of dicts to annotate in place.
        caption_key: key under which an item stores its list of caption strings
            ("img_caption" or "table_caption"). A missing or None value is
            treated as "no captions".
        pattern: compiled regex matched against the start of each caption.

    For every caption that starts with a reference, the matched text is stored
    in item['ref'] (if several captions match, the last one wins) and both the
    reference and the full caption are printed. item['description'] is always
    set: it is the ';'-joined text of all captions, with the leading reference
    removed from captions that matched and blank pieces dropped.
    """
    for item in items:
        # Collected once per item so that every caption contributes, not just
        # the last one.
        description = []
        for caption in item.get(caption_key) or []:
            match = pattern.match(caption)
            if match:
                matched_part = match.group(0)  # the entire matched reference
                item['ref'] = matched_part
                print(matched_part)
                print(caption)
                # Drop only the leading reference; later mentions of the same
                # reference inside the caption text are kept.
                remainder = caption[match.end():].strip()
            else:
                remainder = caption
            if remainder.strip():
                description.append(remainder)
        item['description'] = ';'.join(description)


_annotate_captions(images, "img_caption", image_pattern)
_annotate_captions(tables, "table_caption", table_pattern)
        



Fig. 1.1
Fig. 1.1 Stress-strain curve for ductile material 
Fig. 1.2
Fig. 1.2 Dimensions of a standard tensile test-piece 
Fig. 1.3
Fig. 1.3 Stress-strain curve for brittle material 
Fig. 1.4 (a)
Fig. 1.4 (a) IZOD test specimen 
Fig. 1.4 (b)
Fig. 1.4 (b) Specimen fixed in IZOD testing machine 
Fig. 2.1
Fig. 2.1 Microstructure, mechanical properties, and uses of plain carbon steels 
Table 1.1
Table 1.1 
Table 2.1
Table 2.1 
Table 3.1
Table 3.1 


In [5]:
# Write the image entries followed by the table entries to a single JSON file,
# pretty-printed with a two-space indent.
annotated_path = 'data/manufacturing-unit1/auto/image-table-list.json'
with open(annotated_path, 'w') as out_handle:
    json.dump(images + tables, out_handle, indent=2)

In [7]:
# In the markdown file, replace each figure/table reference with an image link
# built from the item's caption and image path, like: ![Figure 1](images/figure1.png)

markdown_file = 'data/manufacturing-unit1/auto/manufacturing-unit1.md'
out_file = 'data/manufacturing-unit1/auto/manufacturing-unit1-processed.md'
# Encoding is pinned so reading/writing does not depend on the platform default.
with open(markdown_file, 'r', encoding='utf-8') as f:
    content = f.read()

# remove all file names like ![](images/figure1.png)
content = re.sub(r'!\[\]\(images/.*?\)', '', content)

# Build reference -> image-link text. Items without a 'ref' are never inserted.
replacements = {}
for item in images + tables:
    ref = item.get('ref')
    caption_key = 'img_caption' if item.get('type') == 'image' else 'table_caption'
    captions = item.get(caption_key) or []  # tolerate a missing caption list
    # Use the first caption containing the reference as the alt text; fall back
    # to the bare reference (None when the item has no reference at all).
    item['description'] = next(
        (caption for caption in captions if ref and ref in caption), ref
    )
    if ref:
        # If two items share a reference, the first one is used.
        replacements.setdefault(ref, f"![{item['description']}]({item.get('img_path')})")

if replacements:
    # Substitute everything in ONE pass so text inserted for one item is never
    # rewritten by a later item. Longer references are tried first, and the
    # lookahead rejects a match that continues into a longer number, so
    # "Fig. 1.1" does not fire inside "Fig. 1.10" or "Fig. 1.1.2".
    longest_first = sorted(replacements, key=len, reverse=True)
    ref_regex = re.compile(
        '(?:' + '|'.join(map(re.escape, longest_first)) + r')(?!\.?\d)'
    )
    # A callable replacement keeps backslashes in captions (e.g. LaTeX) literal.
    content = ref_regex.sub(lambda m: replacements[m.group(0)], content)

with open(out_file, 'w', encoding='utf-8') as f:
    f.write(content)

