In [9]:
import os
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import BertTokenizerFast
from transformers import BertLMHeadModel
from transformers import get_scheduler
import re
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from dataclasses import dataclass

from dotenv import load_dotenv
load_dotenv("../.env")


True

In [10]:
dataset=load_dataset('ur-whitelab/mapi', token=os.environ['HF_TOKEN'])

print(dataset['train'].column_names)

target = ["band_gap"]
features=['nsites', 'nelements', 'formula_pretty', 'chemsys', 'volume', 'density', 'density_atomic', 'crystal_system', 'symbol', 'number', 'point_group', 'structure']




In [11]:
print(dataset['train'][2]['structure'])

Full Formula (Nb1 V2 Mo1)
Reduced Formula: NbV2Mo
abc   :  10.220753  10.220753  10.220753
angles: 128.933454 117.899846  84.471274
pbc   :       True       True       True
Sites (4)
  #  SP      a         b         c    magmom
---  ----  ---  --------  --------  --------
  0  Nb      0  0         0            0.516
  1  V       0  0.251541  0.251541     1.349
  2  V       0  0.748459  0.748459     1.349
  3  Mo      0  0.5       0.5          1.103


In [12]:
train_dataset = dataset['train'].select_columns(features+target).to_pandas()
test_dataset = dataset['test'].select_columns(features+target).to_pandas()

train_dataset = train_dataset.dropna(subset=target, axis=0)
test_dataset = test_dataset.dropna(subset=target, axis=0)

formula = train_dataset.iloc[0]['formula_pretty']
all_elements = "-".join(train_dataset['chemsys'].to_list())
all_elements = set(all_elements.split("-"))
elements_dics = {k: v for (v, k) in enumerate(all_elements, 1)}

voc = 	list(all_elements) + [str(i) for i in range(10)] + ["Full Formula", "Reduced Formula", "abc", "angles", "pbc", "Sites", "True", "False", "magmom", ".", "(", ")", "-", "\n", " ", "a", "b", "c"]

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
tokenizer.add_tokens(voc)

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["structure"], padding="max_length", truncation=True)

def filter_none(example):
    return all(value is not None for value in example.values())

dataset = dataset.select_columns(['structure']+target)
dataset = dataset.filter(filter_none)
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["structure"])
tokenized_datasets = tokenized_datasets.rename_column("band_gap", "labels")
tokenized_datasets.set_format("torch")

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)

Map: 100%|██████████| 124283/124283 [00:55<00:00, 2227.11 examples/s]
Map: 100%|██████████| 31070/31070 [00:10<00:00, 2857.59 examples/s]


In [16]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)

## Training using PyTorch

In [25]:
model = BertLMHeadModel.from_pretrained("google-bert/bert-base-cased", is_decoder=True)

model(batch['input_ids'], batch['attention_mask']).logits.shape

: 

In [18]:
from tqdm.auto import tqdm
from transformers import BertForTokenClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertLMHeadModel.from_pretrained("google-bert/bert-base-cased", is_decoder=True)
model = BertForTokenClassification.from_pretrained("google-bert/bert-base-cased", num_labels=1)
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)



Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/375 [00:51<?, ?it/s]


ValueError: Expected input batch_size (4096) to match target batch_size (8).

In [44]:
model(batch['input_ids'])

In [None]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

## Training using HuggingFace API

In [None]:
# HuggingFace training framework. Check this later

from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

## Evaluation

In [None]:

losses = []

for _ in range(10):
  print(f"Starting epoch {_}.")
  model.train()
  for batch, d in enumerate(train_dataloader):
    optimizer.zero_grad()
    size = len(train_dataloader.dataset)
    
    X = tokenizer(d['structure'], padding="max_length", truncation=True, return_tensors='pt')

    X = X['input_ids'].to(device)
    y = d['band_gap'].to(device)
    
    pred = model(X)
    loss = loss_fn(torch.flatten(pred), y.to(torch.float32))

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if batch % 500 == 0:
      losses.append(loss.item())
      loss_item = loss.item()
      current = batch * len(X) + len(X)
      print(f"\tloss: {loss_item:>7f}  [{current:>5d}/{size:>5d}]")
  print(f"Epoch {_} done.")


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request
import matplotlib as mpl
import matplotlib.font_manager as font_manager
urllib.request.urlretrieve('https://github.com/google/fonts/raw/main/ofl/ibmplexmono/IBMPlexMono-Regular.ttf', 'IBMPlexMono-Regular.ttf')
fe = font_manager.FontEntry(
    fname='IBMPlexMono-Regular.ttf',
    name='plexmono')
font_manager.fontManager.ttflist.append(fe)
plt.rcParams.update({'axes.facecolor':'#f5f4e9',
            'grid.color' : '#AAAAAA',
            'axes.edgecolor':'#333333',
            'figure.facecolor':'#FFFFFF',
            'axes.grid': False,
            'axes.prop_cycle':   plt.cycler('color', plt.cm.Dark2.colors),
            'font.family': fe.name,
            'figure.figsize': (3.5,3.5 / 1.2),
            'ytick.left': True,
            'xtick.bottom': True
           })

In [2]:
k=8

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error

yhat=[]
y=[]
for k in range(100):
    try:
        y.append(test_dataset.iloc[k][target])
        yhat.append(model.predict([test_dataset.iloc[k][features]]))
    except Exception as e:
        print(k, e)

y = np.array(y).flatten()
yhat = np.array(yhat).flatten()
print(y.shape, yhat.shape)

0 name 'test_dataset' is not defined
1 name 'test_dataset' is not defined
2 name 'test_dataset' is not defined
3 name 'test_dataset' is not defined
4 name 'test_dataset' is not defined
5 name 'test_dataset' is not defined
6 name 'test_dataset' is not defined
7 name 'test_dataset' is not defined
8 name 'test_dataset' is not defined
9 name 'test_dataset' is not defined
10 name 'test_dataset' is not defined
11 name 'test_dataset' is not defined
12 name 'test_dataset' is not defined
13 name 'test_dataset' is not defined
14 name 'test_dataset' is not defined
15 name 'test_dataset' is not defined
16 name 'test_dataset' is not defined
17 name 'test_dataset' is not defined
18 name 'test_dataset' is not defined
19 name 'test_dataset' is not defined
20 name 'test_dataset' is not defined
21 name 'test_dataset' is not defined
22 name 'test_dataset' is not defined
23 name 'test_dataset' is not defined
24 name 'test_dataset' is not defined
25 name 'test_dataset' is not defined
26 name 'test_dataset'

In [None]:
lim = (min(y),max(y))
plt.xlabel('True')
plt.ylabel('Predicted')
plt.plot(y, yhat, 'o', alpha=0.2)
plt.plot(lim, lim, '--')
plt.text(lim[0] + 0.1*(max(y)-min(y)), lim[1] - 1*0.1*(max(y)-min(y)), f"correlation = {np.corrcoef(y, yhat)[0,1]:.3f}")
plt.text(lim[0] + 0.1*(max(y)-min(y)), lim[1] - 2*0.1*(max(y)-min(y)), f"MAE = {mean_squared_error(y, yhat):.3f}")
plt.show()
