In [11]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

#Model
model = BertForQuestionAnswering.from_pretrained("allenai/scibert_scivocab_cased")

#Tokenizer
tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_cased")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at allenai/scibert_scivocab_cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
"allenai/scibert_scivocab_cased" # "m3rg-iitd/matscibert"  

In [2]:
question = '''What is Machine Learning?'''

paragraph = ''' Machine learning (ML) is the scientific study of algorithms and statistical models that computer systems use to progressively improve their performance on a specific task. Machine learning algorithms build a mathematical model of sample data, known as "training data", in order to make predictions or decisions without being explicitly programmed to perform the task. Machine learning algorithms are used in the applications of email filtering, detection of network intruders, and computer vision, where it is infeasible to develop an algorithm of specific instructions for performing the task. Machine learning is closely related to computational statistics, which focuses on making predictions using computers. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data mining is a field of study within machine learning, and focuses on exploratory data analysis through unsupervised learning.In its application across business problems, machine learning is also referred to as predictive analytics. '''
            
encoding = tokenizer.encode_plus(text=question,text_pair=paragraph)

inputs = encoding['input_ids']  #Token embeddings
sentence_embedding = encoding['token_type_ids']  #Segment embeddings
tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens

In [3]:
start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))


In [5]:
start_scores

'start_logits'

In [6]:
from transformers import pipeline

question_answerer = pipeline("question-answering", model="allenai/scibert_scivocab_cased")
question_answerer(question=question, context=paragraph)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at allenai/scibert_scivocab_cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'score': 0.00014875772467348725,
 'start': 771,
 'end': 793,
 'answer': 'theory and application'}

In [1]:
import pandas as pd
df = pd.read_parquet('/mnt/hdd1/LaMDa/mp_3d_2020_gpt_narratives.parquet')

In [8]:
df.keys()

Index(['atoms', 'space group symbol', 'crystal system',
       'energy per atom (eV/atom)', 'volume (Å³)',
       'formation energy per atom (eV/atom)', 'pretty formula',
       'energy above hull (eV/atom)', 'band gap (eV)', 'density (g/cm³)',
       'total magnetization (μB/f.u.)', 'oxide type',
       'scintillation attenuation length (cm)', 'enthalpy per atom (eV/atom)',
       'gpt_text', 'gpt_explanation', 'reduced_formula', 'text',
       'structure_question_list', 'composition_question_list',
       'stable_question_list', 'is_stable', 'oxide_question_list',
       'comp_struc_question_list'],
      dtype='object')

In [9]:
df['structure_question_list'][0]

array(['This material is cubic.', 'This material is tetragonal.',
       'This material is hexagonal.', 'This material is orthorhombic.',
       'This material is trigonal.', 'This material is monoclinic.',
       'This material is triclinic.'], dtype=object)

In [8]:
from transformers import AutoTokenizer, AutoModelForMultipleChoice
from tqdm import tqdm
import torch
import pandas as pd

df = pd.read_parquet('/mnt/hdd1/LaMDa/mp_3d_2020_gpt_narratives.parquet')
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_cased")
model = AutoModelForMultipleChoice.from_pretrained("allenai/scibert_scivocab_cased")


Some weights of BertForMultipleChoice were not initialized from the model checkpoint at allenai/scibert_scivocab_cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
model = model.cuda()
answer_list = []
for i, row in tqdm(df.iterrows()):
    try:
        row['atoms'].pop('coords')
        row['atoms'].pop('props')
        prompt = 'Which of the following statements is most correct regarding the given material?' + str(row['atoms'])
        inputs = tokenizer([[prompt, candidate] for candidate in row['structure_question_list']], return_tensors="pt", padding=True)
        labels = torch.tensor(0).unsqueeze(0).cuda()

        outputs = model(**{k: v.unsqueeze(0).cuda() for k, v in inputs.items()}, labels=labels)
        logits = outputs.logits
        answer_list.append(logits.argmax().item())
    except:
        answer_list.append(-1)

126335it [1:28:09, 23.89it/s]


In [15]:
# count -1 
sum([i == 0 for i in answer_list])/len(answer_list)

0.10228361103415522

In [18]:
model = model.cuda()
comp_answer_list = []
for i, row in tqdm(df.iterrows()):
    try:
        # row['atoms'].pop('coords')
        # row['atoms'].pop('props')
        prompt = 'Which of the following statements is most correct regarding the given material?' + str(row['atoms'])
        inputs = tokenizer([[prompt, candidate] for candidate in row['composition_question_list']], return_tensors="pt", padding=True)
        labels = torch.tensor(0).unsqueeze(0).cuda()

        outputs = model(**{k: v.unsqueeze(0).cuda() for k, v in inputs.items()}, labels=labels)
        logits = outputs.logits
        comp_answer_list.append(logits.argmax().item())
    except:
        comp_answer_list.append(-1)

126335it [2:17:48, 15.28it/s]


In [19]:
sum([i == 0 for i in comp_answer_list])/len(comp_answer_list), sum([i == -1 for i in comp_answer_list])

(0.5580638777852535, 11652)

In [20]:
model = model.cuda()
oxide_answer_list = []
for i, row in tqdm(df.iterrows()):
    try:
        # row['atoms'].pop('coords')
        # row['atoms'].pop('props')
        prompt = 'Which of the following statements is most correct regarding the given material?' + str(row['atoms'])
        inputs = tokenizer([[prompt, candidate] for candidate in row['oxide_question_list']], return_tensors="pt", padding=True)
        labels = torch.tensor(0).unsqueeze(0).cuda()

        outputs = model(**{k: v.unsqueeze(0).cuda() for k, v in inputs.items()}, labels=labels)
        logits = outputs.logits
        oxide_answer_list.append(logits.argmax().item())
    except:
        oxide_answer_list.append(-1)

0it [00:00, ?it/s]

126335it [1:17:41, 27.10it/s]


In [21]:
sum([i == 0 for i in oxide_answer_list])/len(oxide_answer_list), sum([i == -1 for i in oxide_answer_list])

(0.4933312225432382, 7722)

In [22]:
model = model.cuda()
comp_struc_answer_list = []
for i, row in tqdm(df.iterrows()):
    try:
        # row['atoms'].pop('coords')
        # row['atoms'].pop('props')
        prompt = 'Which of the following statements is most correct regarding the given material?' + str(row['atoms'])
        inputs = tokenizer([[prompt, candidate] for candidate in row['comp_struc_question_list']], return_tensors="pt", padding=True)
        labels = torch.tensor(0).unsqueeze(0).cuda()

        outputs = model(**{k: v.unsqueeze(0).cuda() for k, v in inputs.items()}, labels=labels)
        logits = outputs.logits
        comp_struc_answer_list.append(logits.argmax().item())
    except:
        comp_struc_answer_list.append(-1)

0it [00:00, ?it/s]

126335it [1:30:43, 23.21it/s]


In [23]:
sum([i == 0 for i in comp_struc_answer_list])/len(comp_struc_answer_list), sum([i == -1 for i in comp_struc_answer_list])

(0.08834448094352318, 8520)

In [21]:
inputs['input_ids'].shape

torch.Size([7, 917])

In [29]:
row['atoms']



{'abc': array([ 5.83847,  9.16937, 16.52476]),
 'angles': array([90., 90., 90.]),
 'cartesian': False,
 'elements': array(['B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
        'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
        'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
        'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
        'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
        'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
        'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
        'B', 'B', 'B', 'B', 'B', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
        'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O'],
       dtype=object),
 'lattice_mat': array([array([5.8384700e+00, 0.0000000e+00, 3.5750318e-16]),
        array([0.00000000e+00, 9.16937400e+00, 5.61462226e-16]),
        array([ 0.      ,  0.      , 16.524762])], dtype=object)}

In [6]:
import pandas as pd
from tqdm import tqdm
df = pd.read_parquet('/mnt/hdd1/LaMDa/mp_3d_2020_gpt_narratives.parquet')

for i, row in df.iterrows():
    row['atoms'].pop('coords')
    row['atoms'].pop('props')

In [7]:
comp_data = {f'ending{i}': [] for i in range(len(df['composition_question_list'][0]))}
comp_data['sent1'] = []
comp_data['sent2'] = []
comp_data['label'] = []

for i, row in tqdm(df.iterrows()):
    comp_data['sent1'].append('Which of the following statements is most correct regarding the given material?')
    comp_data['sent2'].append(str(row['atoms']))
    comp_data['label'].append(0)
    for j in range(len(row['composition_question_list'])):
        comp_data[f'ending{j}'].append(row['composition_question_list'][j])

0it [00:00, ?it/s]

126335it [01:08, 1846.88it/s]


In [8]:
temp = pd.DataFrame(comp_data)
temp_train = temp.iloc[:int(len(temp)*0.8)]
temp_valid = temp.iloc[int(len(temp)*0.8):int(len(temp)*0.9)]
temp_test = temp.iloc[int(len(temp)*0.9):]
temp_train.to_json('/mnt/hdd1/LaMDa/comp_data_train.json', lines=True, orient='records')
temp_valid.to_json('/mnt/hdd1/LaMDa/comp_data_valid.json', lines=True, orient='records')
temp_test.to_json('/mnt/hdd1/LaMDa/comp_data_test.json', lines=True, orient='records')

In [9]:
str_data = {f'ending{i}': [] for i in range(len(df['structure_question_list'][0]))}
str_data['sent1'] = []
str_data['sent2'] = []
str_data['label'] = []

for i, row in df.iterrows():
    str_data['sent1'].append('Which of the following statements is most correct regarding the given material?')
    str_data['sent2'].append(str(row['atoms']))
    str_data['label'].append(0)
    for j in range(len(row['structure_question_list'])):
        str_data[f'ending{j}'].append(row['structure_question_list'][j])

In [10]:
temp = pd.DataFrame(str_data)
temp_train = temp.iloc[:int(len(temp)*0.8)]
temp_valid = temp.iloc[int(len(temp)*0.8):int(len(temp)*0.9)]
temp_test = temp.iloc[int(len(temp)*0.9):]  
temp_train.to_json('/mnt/hdd1/LaMDa/str_data_train.json', lines=True, orient='records')
temp_valid.to_json('/mnt/hdd1/LaMDa/str_data_valid.json', lines=True, orient='records')
temp_test.to_json('/mnt/hdd1/LaMDa/str_data_test.json', lines=True, orient='records')

In [11]:
comp_str_data = {f'ending{i}': [] for i in range(len(df['comp_struc_question_list'][0]))}
comp_str_data['sent1'] = []
comp_str_data['sent2'] = []
comp_str_data['label'] = []

for i, row in df.iterrows():
    comp_str_data['sent1'].append('Which of the following statements is most correct regarding the given material?')
    comp_str_data['sent2'].append(str(row['atoms']))
    comp_str_data['label'].append(0)
    for j in range(len(row['comp_struc_question_list'])):
        comp_str_data[f'ending{j}'].append(row['comp_struc_question_list'][j])

In [12]:
temp = pd.DataFrame(comp_str_data)
temp_train = temp.iloc[:int(len(temp)*0.8)]
temp_valid = temp.iloc[int(len(temp)*0.8):int(len(temp)*0.9)]
temp_test = temp.iloc[int(len(temp)*0.9):]
temp_train.to_json('/mnt/hdd1/LaMDa/comp_str_data_train.json', lines=True, orient='records')
temp_valid.to_json('/mnt/hdd1/LaMDa/comp_str_data_valid.json', lines=True, orient='records')
temp_test.to_json('/mnt/hdd1/LaMDa/comp_str_data_test.json', lines=True, orient='records')

In [3]:
oxide_data = {f'ending{i}': [] for i in range(len(df['oxide_question_list'][0]))}
oxide_data['sent1'] = []
oxide_data['sent2'] = []
oxide_data['label'] = []

for i, row in df.iterrows():
    oxide_data['sent1'].append('Which of the following statements is most correct regarding the given material?')
    oxide_data['sent2'].append(str(row['atoms']))
    oxide_data['label'].append(0)
    for j in range(len(row['oxide_question_list'])):
        oxide_data[f'ending{j}'].append(row['oxide_question_list'][j])

In [4]:
temp = pd.DataFrame(oxide_data)
temp_train = temp.iloc[:int(len(temp)*0.8)]
temp_valid = temp.iloc[int(len(temp)*0.8):int(len(temp)*0.9)]
temp_test = temp.iloc[int(len(temp)*0.9):]
temp_train.to_json('/mnt/hdd1/LaMDa/oxide_data_train.json', lines=True, orient='records')
temp_valid.to_json('/mnt/hdd1/LaMDa/oxide_data_valid.json', lines=True, orient='records')
temp_test.to_json('/mnt/hdd1/LaMDa/oxide_data_test.json', lines=True, orient='records')