In [1]:

!pip install pytorch-lightning
!pip install transformers --quiet
#!pip install torchmetrics
!pip install "torchmetrics<0.7"
!pip install scikit-multilearn
!pip install pandas
!pip install sklearn
!pip install seaborn
#!pip uninstall sagemaker -y
!pip install sagemaker botocore==1.23.23
#!pip uninstall boto3 botocore -y
#!pip install --no-cache-dir sagemaker==1.72.1 botocore==1.23.23
#!pip install boto3 botocore
#!nvidia-smi

import pandas as pd
import numpy as np


import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification

import pytorch_lightning as pl
from torchmetrics.functional import accuracy, f1, auroc, precision_recall_curve, average_precision
#from pytorch_lightning.metrics.functional import accuracy, f1, auroc
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, roc_curve, auc

#bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

%matplotlib inline
%config InlineBackend.figure_format='retina'

RANDOM_SEED = 42

sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

pl.seed_everything(RANDOM_SEED)






Global seed set to 42


42

In [2]:
# importing the LIME libraries
import lime
import sklearn.ensemble
from lime import lime_text
from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer

In [3]:
# Read in the cleaned data
data_prefix = 'https://raw.githubusercontent.com/nasa-petal/search-engine/main/data/'
df = pd.read_csv(data_prefix + 'cleaned_leaves.csv')

# Drop all non-feature columns
non_feat = ['y', 'text']
df.drop(non_feat, axis=1, inplace=True)

# Drop all labels with < 20 papers
LABEL_COLUMNS = df.columns.tolist()[:-1]
df.drop([col for col, val in df[LABEL_COLUMNS].sum().iteritems() if val < 25], axis=1, inplace=True)
dropcols = ['protect_from_animals', 'coordinate_by_self-organization', 'maintain_biodiversity', 'compete_within/between_species', 'cooperate_within/between_species']
df.drop(dropcols, axis=1, inplace=True)

#df = df[df.columns[df[LABEL_COLUMNS].sum()>3]]
print(df.shape)
df.head()
#start with one item (pick one of the classes to explain)
# make configurable, i.e. tell lime which class you want it to predict
#what words differentiate a class


(11012, 30)


Unnamed: 0,distribute_liquids,sense_light_in_the_visible_spectrum,optimize_shape/materials,sense_chemicals,manage_stress/strain,actively_move_through/on_liquids,manage_shear,chemically_assemble_organic_compounds,change_size/shape,attach_temporarily,...,manage_wear,respond_to_signals,protect_from_temperature,physically_assemble_structure,prevent_fracture/rupture,protect_from_microbes,manage_impact,protect_from_excess_liquids,actively_move_through/on_solids,text_raw
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,Building a home from foam—túngara frog foam ne...
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"A nocturnal mammal, the greater mouse-eared ba..."
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Polarization sensitivity in two species of cut...
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,Identification and characterization of a multi...
4,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,DIFFERENCES IN POLYSACCHARIDE STRUCTURE BETWEE...


In [4]:

LABEL_COLUMNS = df.columns.tolist()[:-1]

biom = df[df[LABEL_COLUMNS].sum(axis=1) > 0]
nonbiom = df[df[LABEL_COLUMNS].sum(axis=1) == 0]

# remove all non-biomimicry papers from dataset.
df = biom

In [5]:
from skmultilearn.model_selection import iterative_train_test_split

def iterative_train_test_split_dataframe(X, y, test_size):
    df_index = np.expand_dims(X.index.to_numpy(), axis=1)
    df_index_y = np.expand_dims(y.index.to_numpy(), axis=1)
    X_train, y_train, X_test, y_test = iterative_train_test_split(df_index, df_index_y, test_size = test_size)
    X_train = X.loc[X_train[:,0]]
    X_test = X.loc[X_test[:,0]]
    y_train = y.loc[y_train[:,0]]
    y_test = y.loc[y_test[:,0]]
    return X_train, y_train, X_test, y_test


X_train, y_train, X_test, y_test = iterative_train_test_split_dataframe(X=df[['text_raw']], y=df[LABEL_COLUMNS], test_size = 0.15)
train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_test, y_test], axis=1)
print(train_df.shape, val_df.shape)

'''
X_train_val, y_train_val, X_test, y_test = iterative_train_test_split_dataframe(X=df[['text_raw']], y=df[LABEL_COLUMNS], test_size = 0.1)
test_df = pd.concat([X_test, y_test], axis=1)
X_train, y_train, X_val, y_val = iterative_train_test_split_dataframe(X=X_train_val, y=y_train_val, test_size = 0.13)
train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)
#train_df, val_df = train_test_split(df, test_size=0.1)
train_df.shape, val_df.shape, test_df.shape
'''

(643, 30) (114, 30)


"\nX_train_val, y_train_val, X_test, y_test = iterative_train_test_split_dataframe(X=df[['text_raw']], y=df[LABEL_COLUMNS], test_size = 0.1)\ntest_df = pd.concat([X_test, y_test], axis=1)\nX_train, y_train, X_val, y_val = iterative_train_test_split_dataframe(X=X_train_val, y=y_train_val, test_size = 0.13)\ntrain_df = pd.concat([X_train, y_train], axis=1)\nval_df = pd.concat([X_val, y_val], axis=1)\n#train_df, val_df = train_test_split(df, test_size=0.1)\ntrain_df.shape, val_df.shape, test_df.shape\n"

In [6]:
BERT_MODEL_NAME = 'allenai/scibert_scivocab_uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

In [7]:
MAX_TOKEN_COUNT = 512

In [8]:
class BiomimicryDataset(Dataset):

  def __init__(
    self, 
    data: pd.DataFrame, 
    tokenizer: BertTokenizer, 
    max_token_len: int = 128
  ):
    self.tokenizer = tokenizer
    self.data = data
    self.max_token_len = max_token_len
    
  def __len__(self):
    return len(self.data)

  def __getitem__(self, index: int):
    data_row = self.data.iloc[index]

    text = data_row.text_raw
    labels = data_row[LABEL_COLUMNS]

    encoding = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_token_len,
      return_token_type_ids=False,
      padding="max_length",
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return dict(
      text=text,
      input_ids=encoding["input_ids"].flatten(),
      attention_mask=encoding["attention_mask"].flatten(),#masks input ids
      labels=torch.FloatTensor(labels)
    )

In [9]:
class BiomimicryDataModule(pl.LightningDataModule):

  def __init__(self, train_df, test_df, tokenizer, batch_size=8, max_token_len=128):
    super().__init__()
    self.batch_size = batch_size
    self.train_df = train_df
    self.test_df = test_df
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len

  def setup(self, stage=None):
    self.train_dataset = BiomimicryDataset(
      self.train_df,
      self.tokenizer,
      self.max_token_len
    )

    self.test_dataset = BiomimicryDataset(
      self.test_df,
      self.tokenizer,
      self.max_token_len
    )

  def train_dataloader(self):
    return DataLoader(
      self.train_dataset,
      batch_size=self.batch_size,
      shuffle=True,
      num_workers=2
    )

  def val_dataloader(self):
    return DataLoader(
      self.test_dataset,
      batch_size=self.batch_size,
      num_workers=2
    )

  def test_dataloader(self):
    return DataLoader(
      self.test_dataset,
      batch_size=self.batch_size,
      num_workers=2
    )

In [10]:
N_EPOCHS = 100
BATCH_SIZE = 12 # smaller is better.

data_module = BiomimicryDataModule(
  train_df,
  val_df,
  tokenizer,
  batch_size=BATCH_SIZE,
  max_token_len=MAX_TOKEN_COUNT
)

In [11]:
class BiomimicryTagger(pl.LightningModule):

  def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
    super().__init__()
    self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
    self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
    self.n_training_steps = n_training_steps
    self.n_warmup_steps = n_warmup_steps
    self.criterion = nn.BCELoss()

  def forward(self, input_ids, attention_mask, labels=None):
    output = self.bert(input_ids, attention_mask=attention_mask)
    output = self.classifier(output.pooler_output)
    output = torch.sigmoid(output)    
    loss = 0
    if labels is not None:
        loss = self.criterion(output, labels)
    return loss, output

  def training_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("train_loss", loss, prog_bar=True, logger=True)
    return {"loss": loss, "predictions": outputs, "labels": labels}

  def validation_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("val_loss", loss, prog_bar=True, logger=True)
    return loss

  def test_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("test_loss", loss, prog_bar=True, logger=True)
    return loss

  def training_epoch_end(self, outputs):
    
    labels = []
    predictions = []
    for output in outputs:
      for out_labels in output["labels"].detach().cpu():
        labels.append(out_labels)
      for out_predictions in output["predictions"].detach().cpu():
        predictions.append(out_predictions)

    labels = torch.stack(labels).int()
    predictions = torch.stack(predictions)

    for i, name in enumerate(LABEL_COLUMNS):
      class_roc_auc = auroc(predictions[:, i],  labels[:, i])
      #class_ap = average_precision(predictions[:, i], labels[:, i])
      self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)
      #self.logger.experiment.add_scalar(f"{name}_ap/Train", class_ap, self.current_epoch)


  def configure_optimizers(self):

    optimizer = AdamW(self.parameters(), lr=2e-5)

    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=self.n_warmup_steps,
      num_training_steps=self.n_training_steps
    )

    return dict(
      optimizer=optimizer,
      lr_scheduler=dict(
        scheduler=scheduler,
        interval='step'
      )
    )

In [12]:
dummy_model = nn.Linear(2, 1)

optimizer = AdamW(params=dummy_model.parameters(), lr=0.001)

warmup_steps = 20
total_training_steps = 100

scheduler = get_linear_schedule_with_warmup(
  optimizer, 
  num_warmup_steps=warmup_steps,
  num_training_steps=total_training_steps
)

learning_rate_history = []

for step in range(total_training_steps):
  optimizer.step()
  scheduler.step()
  learning_rate_history.append(optimizer.param_groups[0]['lr'])



In [13]:
steps_per_epoch=len(train_df) // BATCH_SIZE
total_training_steps = steps_per_epoch * N_EPOCHS

In [14]:
warmup_steps = total_training_steps // 5
warmup_steps, total_training_steps

(1060, 5300)

In [15]:
model = BiomimicryTagger(
  n_classes=len(LABEL_COLUMNS),
  n_warmup_steps=warmup_steps,
  n_training_steps=total_training_steps 
)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
!rm -rf lightning_logs/
!rm -rf checkpoints/

In [17]:
checkpoint_callback = ModelCheckpoint(
  dirpath="checkpoints",
  filename="best-checkpoint",
  save_top_k=1, # save top 5 or 10.
  verbose=True,
  monitor="val_loss",
  mode="min"
)

In [18]:
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10)

In [19]:
# trainer = pl.Trainer(
#   checkpoint_callback=checkpoint_callback,
#   callbacks=[early_stopping_callback],
#   max_epochs=N_EPOCHS,
#   gpus=1,
#   progress_bar_refresh_rate=30
# )

In [20]:
# trainer.fit(model, data_module)

In [21]:
#test = trainer.checkpoint_callback.best_model_path
#if not test:
test = 'scibert-top29-aws-epoch46-122221.ckpt'

trained_model = BiomimicryTagger.load_from_checkpoint(
  test,
  n_classes=len(LABEL_COLUMNS)
)
trained_model.eval()
trained_model.freeze()

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
trained_model

BiomimicryTagger(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [23]:
import lime
import torch.nn.functional as F
from lime.lime_text import LimeTextExplainer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device) #creates new instance of biomimicry dataset (created in above cells)

model = trained_model
class_names = LABEL_COLUMNS
#class_names = ['positive','negative', 'neutral']

def predictor(texts): 
    #texts is a list of strings that needs to be converted to a pandas dataframe that is of the same format of val_df where each string goes into the text_raw column and the value of all the label_columns is =0 bc we dont know what the label is

    texts = pd.DataFrame(texts, columns=['text_raw'])
    texts[LABEL_COLUMNS] = 0
    #display(texts)


    val_dataset = BiomimicryDataset(
        texts,
        tokenizer,
        max_token_len=MAX_TOKEN_COUNT
)
    predictions = []
    labels = []
    for item in val_dataset:
        _, prediction = trained_model(
        item["input_ids"].unsqueeze(dim=0).to(device),
        item["attention_mask"].unsqueeze(dim=0).to(device)
)
    predictions.append(prediction.flatten())
    labels.append(item["labels"].int())
    predictions = torch.stack(predictions).detach().cpu().numpy()

#labels = torch.stack(labels).detach().cpu()
    return predictions

explainer = LimeTextExplainer(class_names=class_names)

str_to_predict = "Bioinspired honeycomb."
#predictor([str_to_predict])

#exp = explainer.explain_instance(str_to_predict, predictor, num_features=20, num_samples=2000)
exp = explainer.explain_instance(str_to_predict, predictor, num_features=5, top_labels=2) #we have 29 labels, will only show expl for 2
exp.show_in_notebook(text=str_to_predict)



In [None]:
import lime
import torch.nn.functional as F
from lime.lime_text import LimeTextExplainer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device) #creates new instance of biomimicry dataset (created in above cells)

model = trained_model
class_names = LABEL_COLUMNS
#class_names = ['positive','negative', 'neutral']

def predictor(texts): 
    #texts is a list of strings that needs to be converted to a pandas dataframe that is of the same format of val_df where each string goes into the text_raw column and the value of all the label_columns is =0 bc we dont know what the label is

    #texts = pd.DataFrame(texts)
    #texts.columns = ['text_raw']
    #result = pd.merge(texts,biom, how='inner', on=["text_raw", "text_raw"])
    #val_df = pd.concat([X_test, y_test], axis=1)
    #texts = ['text_raw']
    #df2 = df.to_string(index=False)
    #texts = pd.DataFrame(np.array(texts).reshape(1,1),columns=list())

    #print(df)
    #outputs = model(**tokenizer(texts, return_tensors="pt", padding=True))
    #probas = F.softmax(outputs.logits).detach().numpy()
    #print(df)
    #display(texts)
    #print(outputs)
    #return probas

#explainer = LimeTextExplainer(class_names=class_names)

str_to_predict = "surprising increase in revenue"
predictor([str_to_predict])

#exp = explainer.explain_instance(str_to_predict, predictor, num_features=20, num_samples=2000)
#exp.show_in_notebook(text=str_to_predict)

#looks like the right columns, now you just need to put each string in the list as a new row in the dataframe.

Unnamed: 0,text_raw
0,surprising increase in revenue


In [None]:
import lime
import torch.nn.functional as F
from lime.lime_text import LimeTextExplainer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device) #creates new instance of biomimicry dataset (created in above cells)

model = trained_model
class_names = LABEL_COLUMNS
new_LABEL_COLUMNS = ' '.join(class_names)
target_lst = new_LABEL_COLUMNS

 # create a function that returns the labels for each topic
def get_labels (class_names, inputs_lst):
    label_row = [0 for i in class_names]
    for i in range(len(class_names)):
        #for j in range(len(inputs)):
            if class_names[i] in inputs_lst:
                label_row[i] = 1
    return label_row
    
def convert(string):
    li = list(string.split(" "))
    return li

#driver code
str1 = "surprising increase in revenue"
print(convert(str1))
inputs = convert(str1)
#print(inputs)

def predictor():
    inputs_lst = inputs

    #prints label_table as a string, instead of a list
    label_table = ([get_labels(target_lst,i) for i in inputs_lst])
    label = ["".join([str(c) for c in lst]) for lst in label_table]
    print(label)

    df4 = pd.DataFrame(label)
    #print(df4)

    #texts is a list of strings that needs to be converted to a pandas dataframe that is of the same format of val_df where each string goes into the text_raw column and the value of all the label_columns is =0 bc we dont know what the label is

    #texts = pd.DataFrame(label)
    ##print(texts)
    #texts.columns = ['text_raw']
    #result = pd.merge(texts,biom, how='inner', on=["text_raw", "text_raw"])
    #val_df = pd.concat([X_test, y_test], axis=1)
    #texts = ['text_raw']
    #df2 = df.to_string(index=False)
    #texts = pd.DataFrame(np.array(texts).reshape(1,1),columns=list())

    #print(df)
    #outputs = model(**tokenizer(texts, return_tensors="pt", padding=True))
    #probas = F.softmax(outputs.logits).detach().numpy()
    #print(df)
    #display(result)
    #print(outputs)
    #return probas

#explainer = LimeTextExplainer(class_names=class_names)

#str_to_predict = "surprising increase in revenue"
#predictor([str_to_predict])

#exp = explainer.explain_instance(str_to_predict, predictor, num_features=20, num_samples=2000)
#exp.show_in_notebook(text=str_to_predict)

#looks like the right columns, now you just need to put each string in the list as a new row in the dataframe.

['surprising', 'increase', 'in', 'revenue']


In [None]:
print(predictor())

['011011010000101101010110001100011000000111000011000110001010100010010000001100101011000000100010001010010101101010110000100000000000010110001001011010001010010001000001000000011000000011011000001011010000110011000100100000000000010101100011000000010000000100010101100000101100100101001011000000000100000001000000010011011000001011010101000000100001111001000000001010010100001100000001000001100000101101000010000000000001011001010101010001100011110010110000001110000110001100011011010001001010101110000000000001011000100101101000101000001010110100000111100101100000001000000101001100100110000000110000001011001100110001000100011001110110011000000010000101000100010100101000011000000010000000110010110100001000000000000101100010100101', '0110110001001001010111110010000110001001110010101101000000101010101010010111101011111010101110100111010101111010111101101010000001000100000010010010100111010101110101011100001111000100101111010000010101011010110101010101001100010001111000010011000100010101110101

In [None]:
target_lst

'distribute_liquids sense_light_in_the_visible_spectrum optimize_shape/materials sense_chemicals manage_stress/strain actively_move_through/on_liquids manage_shear chemically_assemble_organic_compounds change_size/shape attach_temporarily protect_from_chemicals regulate_reproduction_or_growth chemically_break_down_organic_compounds send_chemical_signals attach_permanently protect_from_loss_of_liquids actively_move_through_gases send_light_signals_in_the_visible_spectrum distribute_solids passively_move_through/on_liquids manage_wear respond_to_signals protect_from_temperature physically_assemble_structure prevent_fracture/rupture protect_from_microbes manage_impact protect_from_excess_liquids actively_move_through/on_solids'

In [None]:

distribute_liquids	sense_light_in_the_visible_spectrum	optimize_shape/materials	sense_chemicals	manage_stress/strain	actively_move_through/on_liquids	manage_shear	chemically_assemble_organic_compounds	change_size/shape	attach_temporarily	...	manage_wear	respond_to_signals	protect_from_temperature	physically_assemble_structure	prevent_fracture/rupture	protect_from_microbes	manage_impact	protect_from_excess_liquids	actively_move_through/on_solids	text_raw
0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	1	0	1	0	0	0	Building a home from foam—túngara frog foam ne...
1	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	A nocturnal mammal, the greater mouse-eared ba...
2	0	1	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	Polarization sensitivity in two species of cut...
3	0	0	0	0	0	0	0	0	0	0	...	0	0	1	0	0	0	0	0	0	Identification and characterization of a multi...
4	0	0	0	0	1	0	1	0	0	0	...	0	0	0	0	0	0	0	0	0	DIFFERENCES IN POLYSACCHARIDE STRUCTURE BETWEE...
5 rows × 30 columns

ValueError: too many values to unpack (expected 2)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [166], in <cell line: 9>()
      6 from sklearn.decomposition import PCA
      8 estimators = dict(reduce_dim=['passthrough', PCA()],clf= [SVC()])
----> 9 pipe = Pipeline(estimators)
     11 c =make_pipeline(tfidf_vectorizer, model_lol)
     13 # saving a list of strings version of the X_test object

File ~/opt/anaconda3/envs/env_pytorch/lib/python3.8/site-packages/sklearn/pipeline.py:148, in Pipeline.__init__(self, steps, memory, verbose)
    146 self.memory = memory
    147 self.verbose = verbose
--> 148 self._validate_steps()

File ~/opt/anaconda3/envs/env_pytorch/lib/python3.8/site-packages/sklearn/pipeline.py:192, in Pipeline._validate_steps(self)
    191 def _validate_steps(self):
--> 192     names, estimators = zip(*self.steps)
    194     # validate names
    195     self._validate_names(names)

ValueError: too many values to unpack (expected 2)
text_raw	distribute_liquids	sense_light_in_the_visible_spectrum	optimize_shape/materials	sense_chemicals	manage_stress/strain	actively_move_through/on_liquids	manage_shear	chemically_assemble_organic_compounds	change_size/shape	...	passively_move_through/on_liquids	manage_wear	respond_to_signals	protect_from_temperature	physically_assemble_structure	prevent_fracture/rupture	protect_from_microbes	manage_impact	protect_from_excess_liquids	actively_move_through/on_solids
0 rows × 30 columns

distribute_liquids	sense_light_in_the_visible_spectrum	optimize_shape/materials	sense_chemicals	manage_stress/strain	actively_move_through/on_liquids	manage_shear	chemically_assemble_organic_compounds	change_size/shape	attach_temporarily	...	manage_wear	respond_to_signals	protect_from_temperature	physically_assemble_structure	prevent_fracture/rupture	protect_from_microbes	manage_impact	protect_from_excess_liquids	actively_move_through/on_solids	text_raw
0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	1	0	1	0	0	0	Building a home from foam—túngara frog foam ne...
2	0	1	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	Polarization sensitivity in two species of cut...
3	0	0	0	0	0	0	0	0	0	0	...	0	0	1	0	0	0	0	0	0	Identification and characterization of a multi...
4	0	0	0	0	1	0	1	0	0	0	...	0	0	0	0	0	0	0	0	0	DIFFERENCES IN POLYSACCHARIDE STRUCTURE BETWEE...
5	0	0	0	0	0	0	0	0	0	0	...	0	0	1	1	0	1	1	0	0	Foam nest components of the túngara frog: a co...
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1056	0	0	0	0	0	0	1	0	0	0	...	1	0	0	0	0	0	0	0	0	Hardness in arthropod exoskeletons in the abse...
1057	0	0	0	0	0	0	0	0	0	0	...	1	0	0	0	0	0	0	0	1	A Biological Screw in a Beetle’s Leg. Joints o...
1058	0	0	0	0	0	0	0	0	1	0	...	0	0	0	0	0	0	0	0	0	Growth, geometry, and mechanics of a blooming ...
1059	0	0	0	1	0	0	0	0	0	0	...	0	1	0	0	0	0	0	0	0	Frequency specificity of vibration dependent d...
1061	0	0	0	0	1	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	Sag-Mediated Modulated Tension in Terebellid T...
757 rows × 30 columns

text_raw	distribute_liquids	sense_light_in_the_visible_spectrum	optimize_shape/materials	sense_chemicals	manage_stress/strain	actively_move_through/on_liquids	manage_shear	chemically_assemble_organic_compounds	change_size/shape	...	passively_move_through/on_liquids	manage_wear	respond_to_signals	protect_from_temperature	physically_assemble_structure	prevent_fracture/rupture	protect_from_microbes	manage_impact	protect_from_excess_liquids	actively_move_through/on_solids
0	Building a home from foam—túngara frog foam ne...	0	0	0	0	0	0	0	0	0	...	0	0	0	0	1	0	1	0	0	0
3	Identification and characterization of a multi...	0	0	0	0	0	0	0	0	0	...	0	0	0	1	0	0	0	0	0	0
5	Foam nest components of the túngara frog: a co...	0	0	0	0	0	0	0	0	0	...	0	0	0	1	1	0	1	1	0	0
8	A Multi-enzyme Cascade of Hemoglobin Proteolys...	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
10	On the buoyancy of the pearly nautilus. Nautil...	0	0	0	0	0	0	0	0	0	...	1	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
323	Optical properties of the iridescent organ of ...	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
325	Iridescence: a functional perspective. In anim...	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
328	Squalamine as a broad-spectrum systemic antivi...	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	1	0	0	0
333	Architecture of the wood-wide web: Rhizopogon ...	1	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
335	COMPOSITION OF EXTRACELLULAR POLYMERIC SUBSTAN...	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
114 rows × 30 columns

distribute_liquids	sense_light_in_the_visible_spectrum	optimize_shape/materials	sense_chemicals	manage_stress/strain	actively_move_through/on_liquids	manage_shear	chemically_assemble_organic_compounds	change_size/shape	attach_temporarily	...	manage_wear	respond_to_signals	protect_from_temperature	physically_assemble_structure	prevent_fracture/rupture	protect_from_microbes	manage_impact	protect_from_excess_liquids	actively_move_through/on_solids	text_raw
0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	1	0	1	0	0	0	Building a home from foam—túngara frog foam ne...
2	0	1	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	Polarization sensitivity in two species of cut...
3	0	0	0	0	0	0	0	0	0	0	...	0	0	1	0	0	0	0	0	0	Identification and characterization of a multi...
4	0	0	0	0	1	0	1	0	0	0	...	0	0	0	0	0	0	0	0	0	DIFFERENCES IN POLYSACCHARIDE STRUCTURE BETWEE...
5	0	0	0	0	0	0	0	0	0	0	...	0	0	1	1	0	1	1	0	0	Foam nest components of the túngara frog: a co...
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1056	0	0	0	0	0	0	1	0	0	0	...	1	0	0	0	0	0	0	0	0	Hardness in arthropod exoskeletons in the abse...
1057	0	0	0	0	0	0	0	0	0	0	...	1	0	0	0	0	0	0	0	1	A Biological Screw in a Beetle’s Leg. Joints o...
1058	0	0	0	0	0	0	0	0	1	0	...	0	0	0	0	0	0	0	0	0	Growth, geometry, and mechanics of a blooming ...
1059	0	0	0	1	0	0	0	0	0	0	...	0	1	0	0	0	0	0	0	0	Frequency specificity of vibration dependent d...
1061	0	0	0	0	1	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	Sag-Mediated Modulated Tension in Terebellid T...
757 rows × 30 columns

text_raw	distribute_liquids	sense_light_in_the_visible_spectrum	optimize_shape/materials	sense_chemicals	manage_stress/strain	actively_move_through/on_liquids	manage_shear	chemically_assemble_organic_compounds	change_size/shape	...	passively_move_through/on_liquids	manage_wear	respond_to_signals	protect_from_temperature	physically_assemble_structure	prevent_fracture/rupture	protect_from_microbes	manage_impact	protect_from_excess_liquids	actively_move_through/on_solids
0 rows × 30 columns

text_raw	distribute_liquids	sense_light_in_the_visible_spectrum	optimize_shape/materials	sense_chemicals	manage_stress/strain

SyntaxError: invalid syntax (635033987.py, line 1)

In [None]:
hi = predictor([str_to_predict])
#hi.head()


Unnamed: 0,text_raw,distribute_liquids,sense_light_in_the_visible_spectrum,optimize_shape/materials,sense_chemicals,manage_stress/strain,actively_move_through/on_liquids,manage_shear,chemically_assemble_organic_compounds,change_size/shape,...,passively_move_through/on_liquids,manage_wear,respond_to_signals,protect_from_temperature,physically_assemble_structure,prevent_fracture/rupture,protect_from_microbes,manage_impact,protect_from_excess_liquids,actively_move_through/on_solids


In [None]:
df

Unnamed: 0,distribute_liquids,sense_light_in_the_visible_spectrum,optimize_shape/materials,sense_chemicals,manage_stress/strain,actively_move_through/on_liquids,manage_shear,chemically_assemble_organic_compounds,change_size/shape,attach_temporarily,...,manage_wear,respond_to_signals,protect_from_temperature,physically_assemble_structure,prevent_fracture/rupture,protect_from_microbes,manage_impact,protect_from_excess_liquids,actively_move_through/on_solids,text_raw
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,Building a home from foam—túngara frog foam ne...
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Polarization sensitivity in two species of cut...
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,Identification and characterization of a multi...
4,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,DIFFERENCES IN POLYSACCHARIDE STRUCTURE BETWEE...
5,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,1,0,0,Foam nest components of the túngara frog: a co...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1056,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,Hardness in arthropod exoskeletons in the abse...
1057,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,A Biological Screw in a Beetle’s Leg. Joints o...
1058,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,"Growth, geometry, and mechanics of a blooming ..."
1059,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,Frequency specificity of vibration dependent d...


In [None]:
val_df

Unnamed: 0,text_raw,distribute_liquids,sense_light_in_the_visible_spectrum,optimize_shape/materials,sense_chemicals,manage_stress/strain,actively_move_through/on_liquids,manage_shear,chemically_assemble_organic_compounds,change_size/shape,...,passively_move_through/on_liquids,manage_wear,respond_to_signals,protect_from_temperature,physically_assemble_structure,prevent_fracture/rupture,protect_from_microbes,manage_impact,protect_from_excess_liquids,actively_move_through/on_solids
0,Building a home from foam—túngara frog foam ne...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
3,Identification and characterization of a multi...,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,Foam nest components of the túngara frog: a co...,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,1,1,0,0
8,A Multi-enzyme Cascade of Hemoglobin Proteolys...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,On the buoyancy of the pearly nautilus. Nautil...,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,Optical properties of the iridescent organ of ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
325,Iridescence: a functional perspective. In anim...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
328,Squalamine as a broad-spectrum systemic antivi...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
333,Architecture of the wood-wide web: Rhizopogon ...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#model
trained_model

BiomimicryTagger(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device) #creates new instance of biomimicry dataset (created in above cells)

val_dataset = BiomimicryDataset(
  val_df,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)

predictions = []
labels = []

for item in val_dataset:
  _, prediction = trained_model(
    item["input_ids"].unsqueeze(dim=0).to(device), 
    item["attention_mask"].unsqueeze(dim=0).to(device)
  )
  predictions.append(prediction.flatten())
  labels.append(item["labels"].int())

predictions = torch.stack(predictions).detach().cpu() #.stack, .detach, .cpu find material in pytorch or pytorch lightning documentation
labels = torch.stack(labels).detach().cpu()
#generates a prediction

In [None]:
THRESHOLD = 0.10
accuracy(predictions, labels, threshold=THRESHOLD)

tensor(0.9486)

In [None]:
model

BiomimicryTagger(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [None]:
BERT_MODEL_NAME

'allenai/scibert_scivocab_uncased'

In [None]:
target_names = LABEL_COLUMNS

In [None]:
#from huggingface_hub import hf_hub_download
#hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./your/path/bigscience_t0")


In [None]:
#TF-IDF
# Convert x_train to vector
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)

#model
#model_lol=BertForSequenceClassification.from_pretrained(BERT_MODEL_NAME, num_labels=len(target_names))

# inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# with torch.no_grad():
#     logits = model_lol(**inputs).logits
# num_labels = len(model_lol.config.id2label)
# predicted_class_id = logits.argmax().item()
# labels = torch.nn.functional.one_hot(torch.tensor([predicted_class_id]), num_classes=num_labels).to(torch.float)

# loss = model_lol(**inputs, labels=labels).loss

# loss.backward()


#tf.keras.model_lol.fit(X_train_vectors_tfidf, y_train) 
#Predict y value for test dataset
#y_pred = model_lol.predict(X_test_vectors_tfidf)
#y_prob = model_lol.predict_proba(X_test_vectors_tfidf)[:,1]
#print(classification_report(y_val,y_pred))
#print('Confusion Matrix:',confusion_matrix(y_val, y_pred))

#fpr, tpr, thresholds = roc_curve(y_test, y_prob)
#roc_auc = auc(fpr, tpr)
#print('AUC:', roc_auc)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

In [None]:
from sklearn.pipeline import make_pipeline, Pipeline
#LIME installation
# converting the vectoriser and model into a pipeline
# this is necessary as LIME takes a model pipeline as an input
from sklearn.svm import SVC
from sklearn.decomposition import PCA

estimators = dict(reduce_dim=['passthrough', PCA()],clf= [SVC()])
pipe = Pipeline(estimators)

c =make_pipeline(tfidf_vectorizer, model_lol)

# saving a list of strings version of the X_test object
ls_X_test= list(X_test)
# saving the class names in a dictionary to increase interpretability
class_names = {0: 'non-manage shear', 1:'manage shear'}

# create the LIME explainer
# add the class names for interpretability
LIME_explainer = LimeTextExplainer(class_names=class_names)

# choose a random single prediction
idx = 15
# explain the chosen prediction 
# use the probability results of the logistic regression
# can also add num_features parameter to reduce the number of features explained
LIME_exp = LIME_explainer.explain_instance(ls_X_test[idx], c.predicted_class_id)

ValueError: too many values to unpack (expected 2)