In [3]:
import os
from pymongo import MongoClient
import pymongo
from tqdm import tqdm
import re
import pickle
import json
import torch

from langchain.docstore.document import Document
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.embeddings import SentenceTransformerEmbeddings


from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import json
from bson import ObjectId

import json
import pandas as pd
from bs4 import BeautifulSoup

In [5]:
from bson import ObjectId
import json

def json_serialize(obj):
    """
    JSON serializer for objects not serializable by default json code
    """
    if isinstance(obj, ObjectId):
        return str(obj)
    raise TypeError ("Type %s not serializable" % type(obj))
    
def json_deserialize(obj):
    # Implement deserialization if needed, e.g., converting string back to datetime
    return obj


In [6]:

with open('filtered_tables.json', 'r') as file:
    filtered_tables = json.load(file)

In [7]:
filtered_tables[1]

{'_id': '5c365d79a151585fa39284a5',
 'paper_doi': '10.1016/j.conbuildmat.2011.11.023',
 'order': 1,
 'act_table': [['Sample',
   'SiO2',
   'Al2O3',
   'Fe2O3',
   'MnO',
   'MgO',
   'CaO',
   'Na2O',
   'K2O',
   'TiO2',
   'P2O5',
   'Cr2O3',
   'LOI'],
  ['S1',
   '1.410',
   '0.350',
   '0.110',
   '0.020',
   '1.190',
   '53.430',
   '0.020',
   '0.040',
   '0.010',
   '0.010',
   '0.001',
   '43.330'],
  ['S2',
   '1.310',
   '0.430',
   '0.140',
   '0.030',
   '1.230',
   '53.520',
   '0.050',
   '0.020',
   '0.010',
   '0.010',
   '0.001',
   '43.150'],
  ['S3',
   '0.730',
   '0.110',
   '0.030',
   '0.040',
   '2.670',
   '49.740',
   '0.050',
   '0.030',
   '0.010',
   '0.010',
   '0.001',
   '46.510'],
  ['S4',
   '0.410',
   '0.410',
   '0.040',
   '0.020',
   '0.540',
   '53.680',
   '0.030',
   '0.010',
   '0.010',
   '0.010',
   '0.001',
   '44.810'],
  ['S5',
   '0.510',
   '0.230',
   '0.280',
   '0.050',
   '0.210',
   '54.710',
   '0.040',
   '0.010',
   '0.010',
 

In [8]:
def is_real_table(table):
    """ Check if the given table is a real table. """
    # Check if table is not empty
    if not table or not table[0]:
        return False

    # Check for meaningful content (not just placeholders)
    for row in table:
        if all(cell in [None, "", 0, "0"] for cell in row):
            return False

    return True

# Count real and not real tables
real_tables = 0
not_real_tables = 0

for item in filtered_tables:
    if is_real_table(item['act_table']):
        real_tables += 1
    else:
        not_real_tables += 1

print(f"Real tables: {real_tables}")
print(f"Not real tables: {not_real_tables}")


Real tables: 106130
Not real tables: 12154


In [18]:
def process_table_captions(tables):
    all_captions = []
    seen_captions = set()  # To track duplicates along with DOIs

    for table in tqdm(tables, desc="Processing tables"):
        # Extract the DOI and caption from the table record
        paper_doi = table['paper_doi']
        caption = table['caption']
        actual_table = table['act_table']
        # Create a unique key combining DOI and caption to check for duplicates
        unique_key = (paper_doi, caption)

        # Process the caption if it's a string and the unique key is not already seen
        if isinstance(caption, str) and unique_key not in seen_captions:
            seen_captions.add(unique_key)

            # Store the caption along with its DOI
            doc_caption = Document(page_content=caption+'/n'+str(actual_table),
                                   metadata={"table": json.dumps(actual_table), "doi": paper_doi})
            all_captions.append(doc_caption)

    return all_captions


In [19]:
processed_tables = process_table_captions(filtered_tables)

Processing tables: 100%|██████████| 118284/118284 [00:03<00:00, 37518.85it/s]


In [20]:
len(processed_tables)

116217

In [23]:
# Count real and not real tables
real_tables = 0
not_real_tables = 0

for item in processed_tables:
    if is_real_table(json.loads(item.metadata['table'])):
        real_tables += 1
    else:
        not_real_tables += 1

print(f"Real tables: {real_tables}")
print(f"Not real tables: {not_real_tables}")

Real tables: 104258
Not real tables: 11959


In [25]:
json.loads(processed_tables[12000].metadata['table'])

[['Factor',
  'Case 1 [%]',
  'Case 2 [%]',
  'Case 3 [%]',
  'Case 4 [%]',
  'Case 5 [%]'],
 ['A', '6.47', '65.23', '6.59', '60.65', '0.72'],
 ['B', '18.65', '33.65', '87.79', '25.43', '98.06'],
 ['C', '74.76', '0.04', '5.18', '10.90', '0.41']]

In [26]:
# Define metadata field information
metadata_field_info = [
    AttributeInfo(name="table", description="Actual table content", type="list"),
    AttributeInfo(name="doi", description="Digital Object Identifier of the document", type="string"),
]

In [27]:
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
persist_directory = 'data/chroma/etsminilm'


In [28]:
svdb = Chroma.from_documents(processed_tables, embedding=embeddings, persist_directory=persist_directory)

In [29]:
svdb.persist()

In [46]:
question = "major oxides or chemical compositions of materials including CaO, SiO2, MgO, Al2O3"
docs = svdb.similarity_search(
    question,
    k=60)

for doc  in docs: print(doc)

page_content="Oxide composition of MgO [36]./n[['Composition', 'MgO', 'Fe2O3', 'SiO2', 'CaO', 'Others'], ['Mass fraction of the sample (%)', '92.53', '0.87', '3.1', '1.6', '1.9']]" metadata={'doi': '10.1016/j.conbuildmat.2018.07.037', 'table': '[["Composition", "MgO", "Fe2O3", "SiO2", "CaO", "Others"], ["Mass fraction of the sample (%)", "92.53", "0.87", "3.1", "1.6", "1.9"]]'}
page_content="The oxide compositions of MgO powders, limestone powders and silica fume./n[['Sample name', 'Oxide compositions (%)', 'Oxide compositions (%)', 'Oxide compositions (%)', 'Oxide compositions (%)', 'Oxide compositions (%)', 'Oxide compositions (%)', 'Oxide compositions (%)'], ['Sample name', 'MgO', 'CaO', 'SiO2', 'Al2O3', 'Fe2O3', 'CO2', 'Others'], ['MgO Powders', '91.85', '3.14', '3.68', '0.16', '0.87', '-', '0.30'], ['Limestone powders', '8.44', '36.92', '8.91', '1.36', '0.52', '43.2', '4.91'], ['Silica fume', '1.82', '0.52', '89.8', '0.13', '1.55', '-', '6.18']]" metadata={'doi': '10.1016/j.conbui