In [2]:
!pip install jedi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jedi
  Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 7.7 MB/s 
Installing collected packages: jedi
Successfully installed jedi-0.18.1


In [3]:
!pip install sklearn-contrib-lightning



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn-contrib-lightning
  Downloading sklearn_contrib_lightning-0.6.2.post0-cp37-cp37m-manylinux2014_x86_64.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 7.6 MB/s 
Installing collected packages: sklearn-contrib-lightning
Successfully installed sklearn-contrib-lightning-0.6.2.post0


In [11]:
import re
from shutil import rmtree
from tempfile import mkdtemp

import pandas as pd
from lightning.classification import LinearSVC
from numpy.random import default_rng
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from text_unidecode import unidecode

# Seed passed to the classifier and to both cross-validation splitters.
RANDOM_STATE = 42
# Default for concat_text(use_venue=...): append the venue text to the document.
USE_VENUE = False
# Default for concat_text(use_tokens=...): build the document from the
# `title_tokens` / `abstract_tokens` columns instead of `title` / `abstract`.
USE_TOKENS = False  # kept off: their tokens are leaking the label
# NOTE: despite the name, this pattern matches runs of characters that are NOT
# ASCII letters or whitespace, i.e. the characters normalize_text replaces
# with a space.
ACCEPTABLE_CHARS = re.compile(r"[^a-zA-Z\s]+")


def normalize_text(text):
    """Reduce ``text`` to lowercase ASCII letters separated by single spaces.

    The string is transliterated with ``unidecode`` and lowercased, every run
    of characters matched by ``ACCEPTABLE_CHARS`` (anything that is not an
    ASCII letter or whitespace) is replaced by a space, and whitespace is
    then collapsed and stripped.

    Args:
        text: The raw string. ``None``, the empty string and any non-string
            value (for example a float NaN standing in for a missing cell)
            are treated as missing text.

    Returns:
        The normalized string, or ``""`` when there is no usable text.
    """
    # Guard on type first: calling len() on a float NaN would raise TypeError.
    if not isinstance(text, str) or not text:
        return ""

    norm_text = unidecode(text).lower()
    norm_text = ACCEPTABLE_CHARS.sub(" ", norm_text)
    # Substitution can leave runs of spaces; squeeze them and trim the ends.
    norm_text = re.sub(r"\s+", " ", norm_text).strip()

    return norm_text


def concat_text(row, use_venue=USE_VENUE, use_tokens=USE_TOKENS, sep="|", sep_num=5):
    """Build one document string from a row's title, abstract and (optionally) venue.

    Args:
        row: Object exposing ``title`` / ``abstract`` attributes, or
            ``title_tokens`` / ``abstract_tokens`` when ``use_tokens`` is true,
            plus ``venue`` when ``use_venue`` is true.
        use_venue: Append the space-joined, normalized venue as a third field.
        use_tokens: Space-join the token attributes instead of reading the
            ``title`` / ``abstract`` attributes directly.
        sep: Text repeated ``sep_num`` times to form the field divider.
        sep_num: Number of repetitions of ``sep`` in the divider.

    Returns:
        The normalized fields joined by ``" <divider> "``.
    """
    if use_tokens:
        raw_title, raw_abstract = " ".join(row.title_tokens), " ".join(row.abstract_tokens)
    else:
        raw_title, raw_abstract = row.title, row.abstract

    fields = [normalize_text(raw_title), normalize_text(raw_abstract)]
    if use_venue:
        fields.append(normalize_text(" ".join(row.venue)))

    divider = sep * sep_num
    return f" {divider} ".join(fields)


# Load the labelled papers and drop the columns listed below.
# note: we got our own titles and abstracts as there were strange issues with the
# original tokenized data that leaked the label
df = pd.read_json("golden_og.json").drop(
    columns=["petalID", "level1", "level2", "level3", "url"]
    # not dropped: "author", "reference", "mag"
)

# Split by label so each class can be downsampled separately.
df_y = df.loc[df["isBiomimicry"] == "Y"]
df_n = df.loc[df["isBiomimicry"] == "N"]

# Class sizes before any downsampling.
print(len(df_y))
print(len(df_n))



1430
8432


In [12]:
# Downsample both classes by dropping a fixed number of randomly chosen rows
# (with the recorded class sizes of 1430 / 8432 this leaves 530 and 532 rows).
N_DROP_Y = 900
N_DROP_N = 7900

# Seed the generator so the same subset is drawn on every run; an unseeded
# default_rng() makes the sample, and every score computed from it, change
# from run to run.
rng = default_rng(RANDOM_STATE)

# NOTE: df_y / df_n are shrunk in place. Re-run the cell that builds them
# before re-running this one, otherwise more rows are requested than remain.
arr_indices_top_drop = rng.choice(df_y.index, size=N_DROP_Y, replace=False)
df_y = df_y.drop(index=arr_indices_top_drop)

arr_indices_top_drop = rng.choice(df_n.index, size=N_DROP_N, replace=False)
df_n = df_n.drop(index=arr_indices_top_drop)

df = pd.concat([df_y, df_n])

In [None]:

# Build the model input (`text`) and the binary target (`y`).
# NOTE(review): while USE_TOKENS is False, concat_text reads row.title and
# row.abstract, so the rename must stay after it. Re-running this cell on the
# already-renamed frame fails until `df` is rebuilt by the cells above.
df = (
    df.assign(text=df.apply(concat_text, axis=1))
    .rename(columns={"title": "title_tokens", "abstract": "abstract_tokens"})
    .assign(y=lambda frame: (frame.isBiomimicry == "Y").astype("int"))
)

# small data, so we need to do nested cross-validation
tfidf = TfidfVectorizer(strip_accents="ascii", analyzer="char", ngram_range=(1, 5), max_features=500000)
scaler = MaxAbsScaler(copy=False)

# The pipeline caches its fitted transformers in this directory. It is removed
# in the `finally` below so repeated runs do not leave cache folders behind in
# the system temp location.
cache_dir = mkdtemp()
try:
    estimator = Pipeline(
        [("tfidf", tfidf), ("scaler", scaler), ("svc", LinearSVC(loss="squared_hinge", random_state=RANDOM_STATE))],
        memory=cache_dir,
    )
    p_grid = {"svc__C": [5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 5e-1, 1e0, 5e0, 1e1, 5e1]}

    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE)

    # Nested CV with parameter optimization: the inner grid search picks C,
    # the outer loop scores the tuned model on folds it has not seen.
    clf = GridSearchCV(estimator=estimator, scoring="f1", param_grid=p_grid, cv=inner_cv, verbose=2)
    nested_score = cross_val_score(clf, X=df["text"], y=df["y"], scoring="f1", cv=outer_cv)
finally:
    rmtree(cache_dir, ignore_errors=True)

# One F1 value per outer fold.
print(nested_score)

In [None]:
# Recorded nested-CV F1 scores from earlier runs (one value per outer fold):
# result with 500: [0.08695652 0.23076923 0.15384615 0.        ]
# result with 1000: [0.35555556 0.23809524 0.04878049 0.15      ]
# result with 1000 but better: [0.85920578 0.83397683 0.84046693 0.86486486]