In [17]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Read the cleaned PeTaL dataset straight from the project's GitHub repository.
data_prefix = 'https://raw.githubusercontent.com/nasa-petal/search-engine/main/data/'
df = pd.read_csv(data_prefix + 'cleaned.csv')

# The label column is stored in the CSV as a string such as "[1, 0, 0, ...]";
# literal_eval parses each one into a real Python list.
from ast import literal_eval
df['y'] = df['y'].apply(literal_eval)

# Keep only the label vector (y) and the document text; every other column is dropped.
# (A bare `df.head()` in the middle of a cell is never displayed, so only the
# final preview below is kept.)
df = df[['y','text']]
df.head()

Unnamed: 0,y,text
0,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0]",building a home from foam tungara frog foam ne...
1,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]",a nocturnal mammal the greater mouse eared bat...
2,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]",polarization sensitivity in two species of cut...
3,"[0, 1, 0, 1, 0, 0, 0, 0, 0, 0]",identification and characterization of a multi...
4,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",differences in polysaccharide structure betwee...


In [18]:
from sklearn.model_selection import train_test_split

In [24]:
# Length of each document measured in single-space-separated tokens.
df['word_count'] = [len(text.split(" ")) for text in df['text']]

In [39]:
# Keep only documents with more than 50 tokens.
df = df.loc[df['word_count'] > 50]

In [49]:
# Work with the first 500 remaining rows only.
df = df.head(500)

In [50]:
# Inputs are the document texts; targets are the per-document label vectors.
X, y = df['text'], df['y']

In [51]:
# Hold out 20% of the documents for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [52]:
# Path prefix (note the trailing slash) that the following cells concatenate
# file names onto when writing the LOTClass train/test files.
lotclass_path = "LOTClass/datasets/petal/"

In [53]:
# Write the training documents to train.txt, one document per line.
# - `lotclass_path` already ends with "/", so the file name is appended without
#   another slash (the previous "/train.txt" produced ".../petal//train.txt").
# - The `with` block closes the file even if a write fails.
# - Write-only mode is enough; the explicit encoding keeps the output
#   independent of the platform default.
with open(lotclass_path + "train.txt", "w", encoding="utf-8") as train_file:
    for text in X_train:
        # Strip the literal "||||| " marker from the text before writing
        # (presumably a separator left over from data cleaning -- TODO confirm).
        train_file.write(text.replace("||||| ", "") + "\n")

In [54]:
# Write the held-out documents to test.txt, one document per line.
# The `with` block closes the file even if a write fails; write-only mode is
# enough, and the explicit encoding keeps the output independent of the
# platform default.
with open(lotclass_path + "test.txt", "w", encoding="utf-8") as test_file:
    for text in X_test:
        # Strip the literal "||||| " marker from the text before writing
        # (presumably a separator left over from data cleaning -- TODO confirm).
        test_file.write(text.replace("||||| ", "") + "\n")

In [57]:
# Write one class index per line to test_labels.txt, in the same order as the
# documents iterated from X_test.
# The loop variable is deliberately NOT called `y`: reusing that name would
# overwrite the global `y` Series that the train/test split cell reads.
with open(lotclass_path + "test_labels.txt", "w", encoding="utf-8") as labels_file:
    for label_vector in y_test:
        # list.index(1) returns the position of the FIRST 1 only, so a document
        # with several labels is written with its lowest-index label, and a
        # vector containing no 1 raises ValueError.
        labels_file.write(str(label_vector.index(1)) + "\n")

In [None]:
## Run petal.sh to build the model

In [None]:
from tempfile import mkdtemp
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
#from lightning.classification import LinearSVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate
#from skmultilearn.model_selection import IterativeStratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from shutil import rmtree  # used below to delete the pipeline's on-disk cache

RANDOM_STATE = 42
# Single source of truth for the metric used both to select C (inner loop)
# and to report the generalisation score (outer loop).
SCORING_METRIC = "average_precision"

# Character n-gram (1-5) TF-IDF features -> max-abs scaling -> one linear SVM per label.
tfidf = TfidfVectorizer(strip_accents="ascii", analyzer="char", ngram_range=(1, 5), max_features=500000)
scaler = MaxAbsScaler(copy=False)
svc = OneVsRestClassifier(LinearSVC(loss="squared_hinge", random_state=RANDOM_STATE))

# The pipeline caches fitted transformers in this temporary directory;
# it is removed again once cross-validation has finished (see `finally` below).
cache_dir = mkdtemp()
estimator = Pipeline(
    [("tfidf", tfidf), ("scaler", scaler), ("svc", svc)],
    memory=cache_dir,
)

# Candidate values for the SVM regularisation strength C.
p_grid = {"svc__estimator__C": [5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 5e-1, 1e0, 5e0, 1e1, 5e1]}

#inner_cv = IterativeStratification(n_splits=4, order=1)
#outer_cv = IterativeStratification(n_splits=4, order=1)
inner_cv = MultilabelStratifiedKFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE)
outer_cv = MultilabelStratifiedKFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE)

# NOTE(review): this list is not used anywhere in this cell. Its last two entries
# name functions in sklearn.metrics rather than scorer strings, so passing the list
# as `scoring=` is expected to fail -- confirm and prune before using it.
scoring = ['average_precision', 'precision_macro', 'recall_macro', 'f1_macro', 'accuracy', 'multilabel_confusion_matrix', 'classification_report']

# Nested CV with parameter optimization: GridSearchCV picks C on the inner folds,
# cross_validate scores the tuned model on the outer folds.
clf = GridSearchCV(estimator=estimator, scoring=SCORING_METRIC, param_grid=p_grid, cv=inner_cv)
try:
    nested_score = cross_validate(clf, X=df["text"], y=df["y"].tolist(), scoring=SCORING_METRIC, cv=outer_cv)
finally:
    # Always delete the transformer cache so repeated runs do not pile up temp directories.
    rmtree(cache_dir, ignore_errors=True)

test_scores = nested_score["test_score"]
print(f"The mean MAP score using nested cross-validation is: {test_scores.mean():.3f} +/- {test_scores.std():.3f}")

The mean MAP score using nested cross-validation is: 0.627 +/- 0.018
