In [1]:
import pandas as pd

full_data = pd.read_csv(r"smaller_datasets/full_data_small.csv")
test_data = pd.read_csv(r"smaller_datasets/test_data_small.csv")
train_data = pd.read_csv(r"smaller_datasets/train_data_small.csv")
valid_data = pd.read_csv(r"smaller_datasets/valid_data_small.csv")

# print("full data")
# print(full_data)
# print("test data")
# print(test_data)
# print("train data")
# print(train_data)
# print("valid data")
# print(valid_data)

In [2]:
# %pip install transformers
# %pip install tqdm

In [3]:
import os, re, json, warnings
from typing import Optional, List, Dict

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import joblib
from tqdm.auto import tqdm

import torch
from transformers import AutoTokenizer, AutoModel

RND = 42
np.random.seed(RND)
torch.manual_seed(RND)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x18915ef5190>

In [4]:
full_data.columns

Index(['TEXT', 'LOCATION', 'LABEL'], dtype='object')

In [5]:
sample_abstract_id = None
sample_text = None
sample_labels = None
sample_locations = None

for _, row in full_data[1:2].iterrows():
    for col in full_data.columns:
        if col == "TEXT":
            sample_text = row[col]
        elif col == "LABEL":
            sample_labels = row[col]
        if col == "LOCATION":
            sample_locations = row[col]
    break

In [6]:
def expand_dataset(df):
    rows = []  # Use a different name for the list to collect new rows
    for _, row_data in df.iterrows():
        text = row_data['TEXT']
        labels = row_data['LABEL'].split("|")
        locations = row_data['LOCATION'].split("|")
        tokens = text.split()
        for loc, label in zip(locations, labels):
            try:
                idx = int(loc)
                if 0 <= idx < len(tokens):
                    token = tokens[idx]
                    rows.append((text, loc, token, label))
            except ValueError:
                # Skip if location is not a valid integer
                continue
    return pd.DataFrame(rows, columns=['TEXT', 'LOCATION', 'ABBREV', 'LABEL'])

new_full_data = expand_dataset(full_data)
new_full_data

Unnamed: 0,TEXT,LOCATION,ABBREV,LABEL
0,alphabisabolol has a primary antipeptic action...,56,ATP,substrate
1,a report is given on the recent discovery of o...,24,CS,carcinosarcoma
2,a report is given on the recent discovery of o...,49,REC,recovery
3,a report is given on the recent discovery of o...,68,REF,reference
4,a report is given on the recent discovery of o...,113,REC,recovery
...,...,...,...,...
3036,three homogeneous groups of patients with sili...,51,CF,function
3037,three homogeneous groups of patients with sili...,108,CF,function
3038,pyometra is a disorder of the uterus usually a...,13,one,obstruction
3039,pyometra is a disorder of the uterus usually a...,14,C1,large


In [7]:
label_counts = new_full_data["LABEL"].value_counts()

# Filter out labels that appear only once
rare_labels = label_counts[label_counts < 3].index
print(f"Removing {len(rare_labels)} rare labels with less than 3 occurrences")

# Create a filtered dataset without rare labels
filtered_data = new_full_data[~new_full_data["LABEL"].isin(rare_labels)]
print(f"Filtered data shape: {filtered_data.shape} (removed {new_full_data.shape[0] - filtered_data.shape[0]} rows)")


Removing 623 rare labels with less than 3 occurrences
Filtered data shape: (2284, 4) (removed 757 rows)


In [8]:
new_full_data = filtered_data
labels = sorted(new_full_data["LABEL"].unique())
label2id = {lbl: i for i, lbl in enumerate(labels)}
id2label = {i: lbl for lbl, i in label2id.items()}
new_full_data["LABEL_ID"] = new_full_data["LABEL"].map(label2id)
# Train/val/test split (80/10/10)
train_df, test_df = train_test_split(new_full_data, test_size=0.2, random_state=42, stratify=new_full_data["LABEL_ID"])
train_df, val_df  = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df["LABEL_ID"])

print("Train:", train_df.shape, "Val:", val_df.shape, "Test:", test_df.shape)


Train: (1461, 5) Val: (366, 5) Test: (457, 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_full_data["LABEL_ID"] = new_full_data["LABEL"].map(label2id)


In [10]:
def concat_for_tfidf(row):
    return row["TEXT"] + " [ABBR] " + row["ABBREV"]

train_texts = train_df['TEXT'].tolist()
val_texts   = val_df['TEXT'].tolist()
test_texts  = test_df['TEXT'].tolist()

y_train, y_val, y_test = train_df["LABEL_ID"], val_df["LABEL_ID"], test_df["LABEL_ID"]

pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=50000, min_df=2)),
    ("clf", LogisticRegression(max_iter=1000, solver="saga", n_jobs=-1))
])

print("Training TF-IDF + Logistic Regression...")
pipeline.fit(train_texts, y_train)

print("\nValidation results:")
y_pred = pipeline.predict(val_texts)



Training TF-IDF + Logistic Regression...

Validation results:

Validation results:


In [12]:
print("Accuracy:", accuracy_score(y_val, y_pred))
# Get only the labels present in validation data
val_labels = sorted(y_val.unique())
print(classification_report(y_val, y_pred, 
						   target_names=[id2label[i] for i in val_labels],
						   labels=val_labels))

print("\nTest results:")
y_pred = pipeline.predict(test_texts)
print("Accuracy:", accuracy_score(y_test, y_pred))
# Get only the labels present in test data
test_labels = sorted(y_test.unique())
print(classification_report(y_test, y_pred, 
						   target_names=[id2label[i] for i in test_labels],
						   labels=test_labels))

Accuracy: 0.12021857923497267
                                    precision    recall  f1-score   support

                         activated       0.00      0.00      0.00         1
                            active       0.25      0.38      0.30         8
                        activities       0.20      0.20      0.20         5
                      administered       0.00      0.00      0.00         1
                             adult       0.00      0.00      0.00         3
                             after       0.06      0.79      0.12        19
                    afterdischarge       0.00      0.00      0.00         1
                           albumin       0.00      0.00      0.00         1
                          alkaline       1.00      0.20      0.33         5
                             alone       0.00      0.00      0.00         2
                        amino acid       0.00      0.00      0.00         2
                  ammonium sulfate       0.00      0.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Accuracy: 0.0962800875273523
                                    precision    recall  f1-score   support

                     abnormalities       0.00      0.00      0.00         1
                           acetone       0.00      0.00      0.00         1
                         activated       0.00      0.00      0.00         2
                            active       0.00      0.00      0.00        10
                        activities       0.50      0.14      0.22         7
                      administered       0.00      0.00      0.00         1
                             adult       0.00      0.00      0.00         3
                             after       0.06      0.79      0.11        24
                         agreement       0.00      0.00      0.00         1
                           albumin       0.00      0.00      0.00         1
                          alkaline       1.00      0.17      0.29         6
                             alone       0.00      0.00   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
