In [115]:
import os
import sys
import re
import pandas as pd
import numpy as np
import json
from pathlib import Path
import random
import logging
import uuid

In [144]:
# load the data
poly_lib='../extractions-from-paper/polymer_library.xlsx'
poly_names=pd.read_excel(poly_lib)
synthesis_file='../extractions-from-paper/synthetic_procedures.xlsx'
synth=pd.read_excel(synthesis_file)
properties=pd.read_csv('../extractions-from-paper/properties.csv')
properties.drop(index=0,inplace=True)
properties["Polymer Number"]=np.array(properties["Polymer Number"].values,dtype=int)
poly_names=poly_names.merge(properties,left_on="Polymer Number", right_on="Polymer Number")
poly_names=poly_names.merge(synth,left_on="Method Key", right_on="Label")

In [146]:
def generate_unique_id(existing_ids):
    """
    Generate a new unique custom ID.
    Args:
        existing_ids (List[str]): A list of existing IDs to check against.
    Returns:
        str: A new unique ID.
    """
    while True:
        new_id = uuid.uuid4().hex
        if new_id not in existing_ids:
            return new_id

In [156]:
def data_record_to_context(record):
    """
    Convert a database record to context.

    Args:
        record (pd.Series): A row from the database.

    Returns:
        string: A context text.
    """
    solubility_text='dichloromethane at a concentration of 40 mg/mL at room temperature'
    physical_state_text='Physical state of the polymer determined by visual observation and polarized microscope analysis'
    degradability_text='in a clear zone assay using Pseudomonas lemoignei as the microorganism'
    if pd.isna(record['Monomer 2']):
        Synth=f"{record['Polymer class']} was synthesysed using {record['Type']} in {record['Medium']} using monomer {record['Monomer 1']}. BigSMILES of the polymer is {record['BigSMILES']}. "
    else:
        Synth=f"{record['Polymer class']} was synthesysed using {record['Type']} in {record['Medium']} using monomers {record['Monomer 1']} and {record ['Monomer 2']}. The mole ratio of monomer 1 to monomer 2 in the polymer is {record['Monomer 1: Monomer 2 (mole basis)']}. BigSMILES of the polymer is {record['BigSMILES']}. "
    if(record['Label']=='M33' or record['Label']=='M34'):
        if pd.isna(record['Monomer 2']):
            Copol_homopol=f'Resulting polymer is a homopolymer. '
        else:
            Copol_homopol=f'Resulting polymer is a copolymer. '
    else:
        if pd.isna(record['Monomer 2']):
            Copol_homopol=f'Resulting polymer is a homopolymer. '
        else:
            Copol_homopol=f"Resulting polymer is {record['Product']}. "
    if(record['Mn (kDa)']>0 and record['Mw (kDa)']>0):
        Mol_weigt=f"Number average molecular weight, Mn, of the polymer is {record['Mn (kDa)']} kDa. Weight average molecular weight, Mw, of the polymer is {record['Mw (kDa)']} kDa. "
    else:
        Mol_weigt=''
    if(record['Solubility']=='yes'):
        Solubility=f'Resulting polymer is soluble in {solubility_text}. '
    elif(record['Solubility']=='no'):
        Solubility=f'Resulting polymer is insoluble in {solubility_text}. '
    else:
        Solubility=''
    
    Physical_state=f"Physical state is {record['Physical State']}. {physical_state_text}. "
    
    if(record['Biodegradability']=='yes'):
        Degradability=f"Biodagradable in {degradability_text}."
    elif(record['Biodegradability']=='no'):
        Degradability=f"Non-biodagradable in {degradability_text}."
    else:
        Degradability=''
    Context=Synth+Copol_homopol+Mol_weigt+Solubility+Physical_state+Degradability
    return Context

In [159]:
def data_record_to_QAs(record):
    """
    Convert a database record to question-answer pairs.

    Args:
        record (pd.Series): A row from the thermoelectric database.

    Returns:
        tuple: A tuple containing two lists - questions and answers.
    """
    questions=[]
    answers=[]
    questions.append(f"What monomers were used for polymer synthesis?")
    if pd.isna(record['Monomer 2']):
        answers.append(f"{record['Monomer 1']}")
    else:
        answers.append(f"{record['Monomer 1']} and {record['Monomer 2']}")
        questions.append(f"What was the Monomer1:Monomer2 molar ratio?")
        answers.append(f"{record['Monomer 1: Monomer 2 (mole basis)']}")
    questions.append("What synthesis method was used?")
    answers.append(f"{record['Type']}")
    questions.append(f"Was reaction conducted in bulk (melt), solution, or at solution interface?")
    answers.append(f"{record['Medium']}")
    if not pd.isna(record['Mn (kDa)']):
        questions.append(f"What is the value of number average molecular weight of the polymer?")
        answers.append(f"{record['Mn (kDa)']}")
        questions.append(f"From which monomers was synthesised polymer with number average molecular weight {record['Mn (kDa)']} kDa?")
        if pd.isna(record['Monomer 2']):
            answers.append(f"{record['Monomer 1']}")
        else:
            answers.append(f"{record['Monomer 1']} and {record['Monomer 2']}")
        questions.append(f"Is polymer with number average molecular weight {record['Mn (kDa)']} kDa biodegradable in a clear zone assay using Pseudomonas lemoignei as the microorganism?")
        if(record['Biodegradability']=='yes'):
            answers.append('Biodagradable')
        else:
            answers.append('Non-biodagradable') 
        questions.append(f"Is polymer with weight average molecular weight {record['Mn (kDa)']} kDa soluble in dichloromethane at a concentration of 40 mg/mL at room temperature?")
        if(record['Solubility']=='yes'):
            answers.append('soluble')
        else:
            answers.append('insoluble')
    if not pd.isna(record['Mw (kDa)']):
        questions.append(f"What is the value of weight average molecular weight of the polymer?")
        answers.append(f"{record['Mw (kDa)']}")
    questions.append(f"Is this polymer biodegradable in a clear zone assay using Pseudomonas lemoignei as the microorganism?")
    if(record['Biodegradability']=='yes'):
        answers.append('Biodagradable')
    else:
        answers.append('Non-biodagradable') 
    questions.append(f"Is this polymer soluble in dichloromethane at a concentration of 40 mg/mL at room temperature?")
    if(record['Solubility']=='yes'):
        answers.append('soluble')
    else:
        answers.append('insoluble')
    questions.append(f"In which physical state is the polymer polymer determined by visual observation and polarized microscope analysis")
    answers.append(f"{record['Physical State']}")
    questions.append(f"What is the BigSMILES string for the polymer")
    answers.append(f"{record['BigSMILES']}")

    return questions, answers

In [173]:
existing_ids=[]
data=[]
for i in range(len(poly_names)):
    record=poly_names.iloc[i]
    context=data_record_to_context(record)
    questions,answers=data_record_to_QAs(record)
    for q,a in zip(questions,answers):
        ind=generate_unique_id(existing_ids)
        existing_ids.append(ind)
        entry={
            'id': ind,
            'context':context,
            'question': q,
             "answers": [
                                {
                                    "text": [a],
                                    "answer_start": context.find(a)
                                }
                            ],
        }
        data.append(entry)

In [174]:
len(data)

6270

In [175]:
with open('../QA-dataset/qa_pairs_extractive.json','w') as file:
    json.dump(data,file)

In [24]:
qa_pairs=[]

for i in range(1,len(properties)):
    try:
        polymer_num=float(properties.iloc[i]['Polymer Number'])
        poly_names.loc[poly_names['Polymer Number']==polymer_num]
        mon1=poly_names.loc[poly_names['Polymer Number']==polymer_num]['Monomer 1'].values[0]
        mon2=poly_names.loc[poly_names['Polymer Number']==polymer_num]['Monomer 2'].values[0]
        mn=poly_names.loc[poly_names['Polymer Number']==polymer_num]['Mn (kDa)'].values[0]
        mw=poly_names.loc[poly_names['Polymer Number']==polymer_num]['Mw (kDa)'].values[0]
        D=poly_names.loc[poly_names['Polymer Number']==polymer_num]['D'].values[0]
        m_ratio=poly_names.loc[poly_names['Polymer Number']==polymer_num]['Monomer 1: Monomer 2 (mole basis)'].values[0]
        method=poly_names.loc[poly_names['Polymer Number']==polymer_num]['Method Key'].values[0]
        synth_m=synth.loc[synth['Label']==method]['Procedure'].values[0]
        bigsmiles=poly_names.loc[poly_names['Polymer Number']==polymer_num]['BigSMILES'].values[0]
        solubility=properties.iloc[i]['Solubility']
        phys_state=properties.iloc[i]['Physical State']
        biodegrad=properties.iloc[i]['Biodegradability']
        question=f'Is {mon1},{mon2} polymer with ratio {m_ratio} and number average molecular weight {mn} soluble in dichloromethane at a concentration of 40 mg/mL at room temperature?'
        answer=f'{solubility}'
        qa_pairs.append({'question':question, 'answer': answer})
        question=f'What is the physical state of {mon1},{mon2} polymer with ratio {m_ratio} and number average molecular weight {mn} determined by visual observation and polarized microscope analysis?'
        answer=f'{phys_state}'
        qa_pairs.append({'question':question, 'answer': answer})
        question=f'Is {mon1},{mon2} polymer with ratio {m_ratio} and number average molecular weight {mn} biodegradable in a clear zone assay using Pseudomonas lemoignei as the microorganism?'
        answer=f'{biodegrad}'
        qa_pairs.append({'question':question, 'answer': answer})
        question=f'What is the synthesis method {mon1},{mon2} polymer with ratio {m_ratio} and number average molecular weight {mn}?'
        answer=f'{synth_m}'
        qa_pairs.append({'question':question, 'answer': answer})
        question=f'What is the BigSMILES string for {mon1},{mon2} polymer?'
        answer=f'{bigsmiles}'
        qa_pairs.append({'question':question, 'answer': answer})
    except:
        continue

In [25]:
len(qa_pairs)

3290

In [135]:
with open('../QA-dataset/qa_pairs_fransen.json','w') as file:
    json.dump(qa_pairs,file)