In [2]:
# importing library
import pandas as pd
import matplotlib.pyplot as plt
import string
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics

In [3]:
# downloading the nltk data for preprocessing
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dolungwe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/dolungwe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
#read in the cleaned data
data_prefix = 'https://raw.githubusercontent.com/nasa-petal/search-engine/main/data/'
df = pd.read_csv(data_prefix + 'cleaned_leaves.csv')

# Drop all non-feature columns
non_feat = ['y', 'text']
df.drop(non_feat, axis=1, inplace=True)

# Drop all labels with < 20 papers
LABEL_COLUMNS = df.columns.tolist()[:-1]
df.drop([col for col, val in df[LABEL_COLUMNS].sum().iteritems() if val < 25], axis=1, inplace=True)
dropcols = ['protect_from_animals', 'coordinate_by_self-organization', 'maintain_biodiversity', 'compete_within/between_species', 'cooperate_within/between_species']
df.drop(dropcols, axis=1, inplace=True)

#df = df[df.columns[df[LABEL_COLUMNS].sum()>3]]
print(df.shape)
df.head()

(11012, 30)


Unnamed: 0,distribute_liquids,sense_light_in_the_visible_spectrum,optimize_shape/materials,sense_chemicals,manage_stress/strain,actively_move_through/on_liquids,manage_shear,chemically_assemble_organic_compounds,change_size/shape,attach_temporarily,...,manage_wear,respond_to_signals,protect_from_temperature,physically_assemble_structure,prevent_fracture/rupture,protect_from_microbes,manage_impact,protect_from_excess_liquids,actively_move_through/on_solids,text_raw
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,Building a home from foam—túngara frog foam ne...
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"A nocturnal mammal, the greater mouse-eared ba..."
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Polarization sensitivity in two species of cut...
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,Identification and characterization of a multi...
4,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,DIFFERENCES IN POLYSACCHARIDE STRUCTURE BETWEE...


In [5]:
LABEL_COLUMNS = df.columns.tolist()[:-1]

biom = df[df[LABEL_COLUMNS].sum(axis=1) > 0]
nonbiom = df[df[LABEL_COLUMNS].sum(axis=1) == 0]

# remove all non-biomimicry papers from dataset.
df = biom

In [6]:
from skmultilearn.model_selection import iterative_train_test_split

In [7]:
def iterative_train_test_split_dataframe(X, y, test_size):
    df_index = np.expand_dims(X.index.to_numpy(), axis=1)
    df_index_y = np.expand_dims(y.index.to_numpy(), axis=1)
    X_train, y_train, X_test, y_test = iterative_train_test_split(df_index, df_index_y, test_size = test_size)
    X_train = X.loc[X_train[:,0]]
    X_test = X.loc[X_test[:,0]]
    y_train = y.loc[y_train[:,0]]
    y_test = y.loc[y_test[:,0]]
    return X_train, y_train, X_test, y_test

In [8]:
X_train, y_train, X_test, y_test = iterative_train_test_split_dataframe(X=df[['text_raw']], y=df[LABEL_COLUMNS], test_size = 0.15)
train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_test, y_test], axis=1)
print(train_df.shape, val_df.shape)

'''
X_train_val, y_train_val, X_test, y_test = iterative_train_test_split_dataframe(X=df[['text_raw']], y=df[LABEL_COLUMNS], test_size = 0.1)
test_df = pd.concat([X_test, y_test], axis=1)
X_train, y_train, X_val, y_val = iterative_train_test_split_dataframe(X=X_train_val, y=y_train_val, test_size = 0.13)
train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)
#train_df, val_df = train_test_split(df, test_size=0.1)
train_df.shape, val_df.shape, test_df.shape
'''

(644, 30) (113, 30)


"\nX_train_val, y_train_val, X_test, y_test = iterative_train_test_split_dataframe(X=df[['text_raw']], y=df[LABEL_COLUMNS], test_size = 0.1)\ntest_df = pd.concat([X_test, y_test], axis=1)\nX_train, y_train, X_val, y_val = iterative_train_test_split_dataframe(X=X_train_val, y=y_train_val, test_size = 0.13)\ntrain_df = pd.concat([X_train, y_train], axis=1)\nval_df = pd.concat([X_val, y_val], axis=1)\n#train_df, val_df = train_test_split(df, test_size=0.1)\ntrain_df.shape, val_df.shape, test_df.shape\n"

In [9]:
train_df

Unnamed: 0,text_raw,distribute_liquids,sense_light_in_the_visible_spectrum,optimize_shape/materials,sense_chemicals,manage_stress/strain,actively_move_through/on_liquids,manage_shear,chemically_assemble_organic_compounds,change_size/shape,...,passively_move_through/on_liquids,manage_wear,respond_to_signals,protect_from_temperature,physically_assemble_structure,prevent_fracture/rupture,protect_from_microbes,manage_impact,protect_from_excess_liquids,actively_move_through/on_solids
0,Building a home from foam—túngara frog foam ne...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
2,Polarization sensitivity in two species of cut...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,DIFFERENCES IN POLYSACCHARIDE STRUCTURE BETWEE...,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,The suctorial organ of the Solifugae (Arachnid...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,The diversity of hydrostatic skeletons. Summar...,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1056,Hardness in arthropod exoskeletons in the abse...,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1057,A Biological Screw in a Beetle’s Leg. Joints o...,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1058,"Growth, geometry, and mechanics of a blooming ...",0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1059,Frequency specificity of vibration dependent d...,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [10]:
# NLP pre-processing
# remove urls, handles, and the hashtag from hashtags 
# (taken from https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression)
def remove_urls(text_raw):
  new_text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",text_raw).split())
  return new_text

In [11]:
# make all text lowercase
def text_lowercase(text_raw): 
  return text_raw.lower()

In [12]:
# remove numbers
def remove_numbers(text_raw): 
  result = re.sub(r'\d+', '', text_raw) 
  return result

In [13]:
# remove punctuation
def remove_punctuation(text_raw): 
  translator = str.maketrans('', '', string.punctuation)
  return text_raw.translate(translator)

In [14]:
# function for all pre-processing steps
def preprocessing(text_raw):
  text_raw = text_lowercase(text_raw)
  text_raw = remove_urls(text_raw)
  text_raw = remove_numbers(text_raw)
  text_raw = remove_punctuation(text_raw)
  return text_raw

In [15]:
# pre-processing the text body column
pp_text = []
for text_data in train_df['text_raw']:
  # check if string
  if isinstance(text_data, str):
    pp_text_data = preprocessing(text_data)
    pp_text.append(pp_text_data)
   # if not string
  else:
    pp_text.append(np.NaN)

In [16]:
# add pre-processed column to dataset
train_df['pp_text'] = pp_text

In [17]:
train_df

Unnamed: 0,text_raw,distribute_liquids,sense_light_in_the_visible_spectrum,optimize_shape/materials,sense_chemicals,manage_stress/strain,actively_move_through/on_liquids,manage_shear,chemically_assemble_organic_compounds,change_size/shape,...,manage_wear,respond_to_signals,protect_from_temperature,physically_assemble_structure,prevent_fracture/rupture,protect_from_microbes,manage_impact,protect_from_excess_liquids,actively_move_through/on_solids,pp_text
0,Building a home from foam—túngara frog foam ne...,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,building a home from foam t ngara frog foam ne...
2,Polarization sensitivity in two species of cut...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,polarization sensitivity in two species of cut...
4,DIFFERENCES IN POLYSACCHARIDE STRUCTURE BETWEE...,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,differences in polysaccharide structure betwee...
6,The suctorial organ of the Solifugae (Arachnid...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,the suctorial organ of the solifugae arachnida...
9,The diversity of hydrostatic skeletons. Summar...,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,the diversity of hydrostatic skeletons summary...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1056,Hardness in arthropod exoskeletons in the abse...,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,hardness in arthropod exoskeletons in the abse...
1057,A Biological Screw in a Beetle’s Leg. Joints o...,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,a biological screw in a beetle s leg joints on...
1058,"Growth, geometry, and mechanics of a blooming ...",0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,growth geometry and mechanics of a blooming li...
1059,Frequency specificity of vibration dependent d...,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,frequency specificity of vibration dependent d...


In [18]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(train_df["pp_text"], train_df["distribute_liquids"])
X_train, X_test, y_train, y_test = train_test_split(train_df["pp_text"], train_df["sense_light_in_the_visible_spectrum"])
X_train, X_test, y_train, y_test = train_test_split(train_df["pp_text"], train_df["optimize_shape/materials"])
X_train, X_test, y_train, y_test = train_test_split(train_df["pp_text"], train_df["sense_chemicals"])
X_train, X_test, y_train, y_test = train_test_split(train_df["pp_text"], train_df["manage_stress/strain"])
X_train, X_test, y_train, y_test = train_test_split(train_df["pp_text"], train_df["actively_move_through/on_liquids"])
X_train, X_test, y_train, y_test = train_test_split(train_df["pp_text"], train_df["manage_shear"])
X_train, X_test, y_train, y_test = train_test_split(train_df["pp_text"], train_df["chemically_assemble_organic_compounds"])
X_train, X_test, y_train, y_test = train_test_split(train_df["pp_text"], train_df["change_size/shape"])
X_train, X_test, y_train, y_test = train_test_split(train_df["pp_text"], train_df["manage_wear"])
X_train, X_test, y_train, y_test = train_test_split(train_df["pp_text"], train_df["respond_to_signals"])
X_train, X_test, y_train, y_test = train_test_split(train_df["pp_text"], train_df["protect_from_temperature"])
X_train, X_test, y_train, y_test = train_test_split(train_df["pp_text"], train_df["physically_assemble_structure"])
X_train, X_test, y_train, y_test = train_test_split(train_df["pp_text"], train_df["prevent_fracture/rupture"])
X_train, X_test, y_train, y_test = train_test_split(train_df["pp_text"], train_df["protect_from_microbes"])
X_train, X_test, y_train, y_test = train_test_split(train_df["pp_text"], train_df["manage_impact"])
X_train, X_test, y_train, y_test = train_test_split(train_df["pp_text"], train_df["protect_from_excess_liquids"])
X_train, X_test, y_train, y_test = train_test_split(train_df["pp_text"], train_df["actively_move_through/on_solids"])


In [19]:
X_train, X_test, y_train, y_test

(721    ontogenetic changes in tracheal structure faci...
 667    utilising the synergy between plants and rhizo...
 365    spider web protection through visual advertise...
 712    mutations perturbing petal cell shape and anth...
 830    self cleaning in tree frog toe pads a mechanis...
                              ...                        
 772    notch signaling in osteoblasts bone remodeling...
 967    sidewinding with minimal slip snake and robot ...
 968    cellular and molecular biology of the aquapori...
 619    from eye spots to eye shine in the marginalia ...
 623    sticking ability in spix s disk winged bat thy...
 Name: pp_text, Length: 483, dtype: object,
 252     peptide tag forming a rapid covalent bond to a...
 477     glycosylated hydroxytryptophan in a mussel adh...
 518     how kelp produce blade shapes suited to differ...
 374     carotenoids need structural colours to shine t...
 870     plant responses to bacterial quorum sensing si...
                       

In [20]:
# create bag-of-words with weights using tfid vectoriser
# strip accents and remove stop words during vectorisation
tf=TfidfVectorizer(strip_accents = 'ascii', stop_words='english')

In [21]:
# transform and fit the training set with vectoriser
X_train_tf = tf.fit_transform(X_train)
# transform the test set with vectoriser
X_test_tf = tf.transform(X_test)

In [22]:
# create logistic regression model
logreg = LogisticRegression(verbose=1, random_state=0, penalty='l2', solver='newton-cg')

In [23]:
# train model on  vectorised training data
model = logreg.fit(X_train_tf, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [24]:
# evaluate model performance on the test set
pred = model.predict(X_test_tf)
metrics.f1_score(y_test, pred, average='weighted')
#average changes each time the code is ran, but slightly

0.9078276795668098

In [25]:
# importing SHAP
import shap

# sampling data from the training and test set to reduce time-taken
X_train_sample = shap.sample(X_train_tf, 200)
X_test_sample = shap.sample(X_test_tf, 40)

# creating the KernelExplainer using the logistic regression model and training sample
SHAP_explainer = shap.KernelExplainer(model.predict, X_train_sample)
# calculating the shap values of the test sample using the explainer 
shap_vals = SHAP_explainer.shap_values(X_test_sample)

# converting the test samples to a dataframe 
# this is necessary for non-tabular data in order for the visualisations 
# to include feature value
colour_test = pd.DataFrame(X_test_sample.todense())

Using 200 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
  2%|▎         | 1/40 [28:18<18:24:02, 1698.53s/it]