In [1]:
# === Cell 1: imports & paths ===
import os
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments
)

# Project-relative locations for the input corpus and for saved models.
DATA_DIR = Path("../data")
MODEL_DIR = Path("../models")
CORPUS_PATH = DATA_DIR.joinpath("df_corpus.parquet")

# Create the model directory up front so later saves cannot fail on a missing folder.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
device


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

'cuda'

In [2]:
# === Cell 2: df_corpus and NASA subset ===
df_corpus = pd.read_parquet(CORPUS_PATH)

# Keep only NASA project rows that actually carry a TRL value.
is_nasa_project = df_corpus["source_type"] == "nasa_project"
has_trl = df_corpus["trl"].notna()
df_nasa = df_corpus.loc[is_nasa_project & has_trl].copy()

# Make TRL numeric (integer) for banding below.
df_nasa["trl_int"] = df_nasa["trl"].astype(int)

# Band boundaries: 0–3 → low, 4–6 → mid, 7–9 → high
def trl_to_band(x: int) -> str:
    """Map a numeric TRL to its coarse band name.

    Args:
        x: Technology readiness level as a number.

    Returns:
        ``"low"`` when ``x <= 3``, ``"mid"`` when ``x <= 6``, and ``"high"``
        for every other value (no upper bound is checked).
    """
    if x <= 3:
        return "low"
    return "mid" if x <= 6 else "high"

df_nasa["trl_band"] = df_nasa["trl_int"].map(trl_to_band)

# Three-class label mapping: band name <-> integer class id.
label2id = {band: idx for idx, band in enumerate(("low", "mid", "high"))}
id2label = {idx: band for band, idx in label2id.items()}

df_nasa["label"] = df_nasa["trl_band"].map(label2id)

# Preview the text next to every derived target column.
df_nasa[["text", "trl_int", "trl_band", "label"]].head()


Unnamed: 0,text,trl_int,trl_band,label
0,High TRL Rover Lidar. Design and build a LIDAR...,6,mid,1
1,Standardizing a Data and Power System for GSFC...,7,high,2
2,Development of ACADIA-to-CCD Camera Platform. ...,4,mid,1
3,Atom Interferometer Gravity Gradiometer Techno...,4,mid,1
4,Spaceflight Compatible Optical Atomic Strontiu...,2,low,0


In [3]:
# === Cell 3: train/val/test split ===
# Hold out 40% of the labeled rows, then halve that holdout into validation
# and test. Both splits are stratified on the class label with a fixed seed.
train_df, temp_df = train_test_split(
    df_nasa,
    test_size=0.40,
    random_state=42,
    stratify=df_nasa["label"],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    random_state=42,
    stratify=temp_df["label"],
)

# Split sizes, in train / val / test order.
tuple(len(split_df) for split_df in (train_df, val_df, test_df))


(9043, 3015, 3015)

In [4]:
# === Cell 4: HuggingFace Dataset + tokenizer ===
# Pretrained checkpoint name (also passed to the classifier later); change it if you like.
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_fn(batch):
    """Tokenize the ``"text"`` entry of a batch with the notebook-level tokenizer.

    Truncation is enabled with a 512-token limit; no padding argument is
    passed here.

    Args:
        batch: Mapping that exposes the raw texts under the key ``"text"``.

    Returns:
        The tokenizer's output for those texts.
    """
    texts = batch["text"]
    return tokenizer(texts, max_length=512, truncation=True)

# Wrap each split's text/label columns as a HuggingFace Dataset and tokenize it
# in batches, in train / val / test order.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(split_df[["text", "label"]]).map(tokenize_fn, batched=True)
    for split_df in (train_df, val_df, test_df)
)

# Padding collator bound to the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)




Map:   0%|          | 0/9043 [00:00<?, ? examples/s]

Map:   0%|          | 0/3015 [00:00<?, ? examples/s]

Map:   0%|          | 0/3015 [00:00<?, ? examples/s]

In [6]:
# === Cell 5: model & training args ===
# Seed torch before the model is built: the classification head is newly
# initialised rather than loaded from the checkpoint, so without a seed every
# run would start from different head weights.
SEED = 42
torch.manual_seed(SEED)

# Derive the class count from the label mapping so the two cannot drift apart.
num_labels = len(label2id)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
).to(device)

training_args = TrainingArguments(
    output_dir=str(MODEL_DIR / "trl_clf_scibert"),
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    # Small per-device batch with 4 accumulation steps (2 x 4 samples per
    # optimizer update on a single device).
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    # Reload the checkpoint with the highest validation macro-F1 after training.
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    fp16=False,
    # Same seed as above, stated explicitly instead of relying on the default.
    seed=SEED,
)


  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# === Cell 6: metrics ===
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"macro_f1"``.
    """
    scores, y_true = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    y_pred = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "macro_f1": f1_score(y_true, y_pred, average="macro"),
    }


In [8]:
# Release cached GPU memory held by PyTorch before training starts.
# torch is already imported in the first cell, so it is not re-imported here.
torch.cuda.empty_cache()

In [9]:
# === Cell 7: Trainer & train ===
# Single destination for the exported model and its tokenizer.
best_model_dir = MODEL_DIR / "trl_clf_scibert_best"

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)
trainer.train()

# Persist the trainer's model and the tokenizer side by side.
trainer.save_model(best_model_dir)
tokenizer.save_pretrained(best_model_dir)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/3390 [00:00<?, ?it/s]

{'loss': 0.889, 'grad_norm': 7.047276020050049, 'learning_rate': 1.970501474926254e-05, 'epoch': 0.04}
{'loss': 0.8195, 'grad_norm': 8.088150978088379, 'learning_rate': 1.9410029498525075e-05, 'epoch': 0.09}
{'loss': 0.8518, 'grad_norm': 9.084348678588867, 'learning_rate': 1.9115044247787613e-05, 'epoch': 0.13}
{'loss': 0.8136, 'grad_norm': 3.996556043624878, 'learning_rate': 1.8820058997050148e-05, 'epoch': 0.18}
{'loss': 0.7804, 'grad_norm': 4.114112377166748, 'learning_rate': 1.8525073746312686e-05, 'epoch': 0.22}
{'loss': 0.7909, 'grad_norm': 10.879318237304688, 'learning_rate': 1.823008849557522e-05, 'epoch': 0.27}
{'loss': 0.8238, 'grad_norm': 8.980223655700684, 'learning_rate': 1.793510324483776e-05, 'epoch': 0.31}
{'loss': 0.751, 'grad_norm': 6.700864315032959, 'learning_rate': 1.7640117994100297e-05, 'epoch': 0.35}
{'loss': 0.8179, 'grad_norm': 5.725013256072998, 'learning_rate': 1.7345132743362835e-05, 'epoch': 0.4}
{'loss': 0.7672, 'grad_norm': 10.172341346740723, 'learning_

  0%|          | 0/754 [00:00<?, ?it/s]

{'eval_loss': 0.7206190824508667, 'eval_accuracy': 0.6812603648424544, 'eval_macro_f1': 0.4003517617648053, 'eval_runtime': 428.657, 'eval_samples_per_second': 7.034, 'eval_steps_per_second': 1.759, 'epoch': 1.0}
{'loss': 0.7253, 'grad_norm': 9.75368595123291, 'learning_rate': 1.321533923303835e-05, 'epoch': 1.02}
{'loss': 0.7346, 'grad_norm': 9.301448822021484, 'learning_rate': 1.2920353982300886e-05, 'epoch': 1.06}
{'loss': 0.6155, 'grad_norm': 6.5749311447143555, 'learning_rate': 1.2625368731563424e-05, 'epoch': 1.11}
{'loss': 0.723, 'grad_norm': 12.463732719421387, 'learning_rate': 1.233038348082596e-05, 'epoch': 1.15}
{'loss': 0.6707, 'grad_norm': 10.472442626953125, 'learning_rate': 1.2035398230088497e-05, 'epoch': 1.19}
{'loss': 0.557, 'grad_norm': 8.810693740844727, 'learning_rate': 1.1740412979351032e-05, 'epoch': 1.24}
{'loss': 0.5838, 'grad_norm': 21.544025421142578, 'learning_rate': 1.144542772861357e-05, 'epoch': 1.28}
{'loss': 0.6846, 'grad_norm': 3.927950143814087, 'lear

  0%|          | 0/754 [00:00<?, ?it/s]

{'eval_loss': 0.7143566012382507, 'eval_accuracy': 0.6965174129353234, 'eval_macro_f1': 0.584143042969389, 'eval_runtime': 115.1939, 'eval_samples_per_second': 26.173, 'eval_steps_per_second': 6.545, 'epoch': 2.0}
{'loss': 0.521, 'grad_norm': 32.6551628112793, 'learning_rate': 6.430678466076696e-06, 'epoch': 2.03}
{'loss': 0.4327, 'grad_norm': 18.148717880249023, 'learning_rate': 6.135693215339233e-06, 'epoch': 2.08}
{'loss': 0.4185, 'grad_norm': 11.718209266662598, 'learning_rate': 5.840707964601771e-06, 'epoch': 2.12}
{'loss': 0.4059, 'grad_norm': 16.70403480529785, 'learning_rate': 5.545722713864308e-06, 'epoch': 2.17}
{'loss': 0.4739, 'grad_norm': 10.035493850708008, 'learning_rate': 5.250737463126844e-06, 'epoch': 2.21}
{'loss': 0.4408, 'grad_norm': 12.445578575134277, 'learning_rate': 4.955752212389381e-06, 'epoch': 2.26}
{'loss': 0.4258, 'grad_norm': 15.770271301269531, 'learning_rate': 4.660766961651918e-06, 'epoch': 2.3}
{'loss': 0.3732, 'grad_norm': 17.716676712036133, 'learn

  0%|          | 0/754 [00:00<?, ?it/s]

{'eval_loss': 0.8114001750946045, 'eval_accuracy': 0.714759535655058, 'eval_macro_f1': 0.6234600586189429, 'eval_runtime': 115.2174, 'eval_samples_per_second': 26.168, 'eval_steps_per_second': 6.544, 'epoch': 3.0}
{'train_runtime': 4915.6872, 'train_samples_per_second': 5.519, 'train_steps_per_second': 0.69, 'train_loss': 0.6077714990373909, 'epoch': 3.0}


('..\\models\\trl_clf_scibert_best\\tokenizer_config.json',
 '..\\models\\trl_clf_scibert_best\\special_tokens_map.json',
 '..\\models\\trl_clf_scibert_best\\vocab.txt',
 '..\\models\\trl_clf_scibert_best\\added_tokens.json',
 '..\\models\\trl_clf_scibert_best\\tokenizer.json')

In [10]:
# === Cell 8: Test metrics evaluation ===
from sklearn.metrics import accuracy_score, f1_score

# Run the trained model on the held-out test split and take the arg-max class.
preds = trainer.predict(test_ds)
logits = preds.predictions
test_preds = np.argmax(logits, axis=1)

# Score against the labels stored in the test DataFrame.
y_true = test_df["label"]
test_acc = accuracy_score(y_true, test_preds)
test_f1 = f1_score(y_true, test_preds, average="macro")

print("Test Accuracy:", test_acc)
print("Test Macro-F1:", test_f1)

  0%|          | 0/754 [00:00<?, ?it/s]

Test Accuracy: 0.7263681592039801
Test Macro-F1: 0.6260253785206077


In [12]:
# === Cell 9: Pseudo-label the unlabeled part of the corpus ===
# Every corpus row without a TRL value becomes a candidate for pseudo-labeling.
missing_trl = df_corpus["trl"].isna()
df_unlabeled = df_corpus.loc[missing_trl].copy()
unlabeled_ds = Dataset.from_pandas(df_unlabeled[["text"]]).map(tokenize_fn, batched=True)

preds = trainer.predict(unlabeled_ds)
logits = preds.predictions

# Softmax along dim 1 turns logits into probabilities; the row-wise maximum is
# kept as the confidence and its position as the predicted class id.
probs = torch.tensor(logits).softmax(dim=1).numpy()
pred_ids = probs.argmax(axis=1)
conf = probs.max(axis=1)

df_unlabeled["pseudo_label_id"] = pred_ids.astype(int)
df_unlabeled["pseudo_trl"] = df_unlabeled["pseudo_label_id"].map(id2label)
df_unlabeled["pseudo_conf"] = conf

df_unlabeled[["text", "pseudo_trl", "pseudo_conf"]].head()


Map:   0%|          | 0/10153 [00:00<?, ? examples/s]

  0%|          | 0/2539 [00:00<?, ?it/s]

Unnamed: 0,text,pseudo_trl,pseudo_conf
65,Expert-Informed Autonomous Science Planning fo...,low,0.982704
81,Saltation Sensor to TRL6. Aeolian processes ar...,low,0.515622
82,QuERI – Quantitative Elemental Reconnaissance ...,mid,0.962406
86,OrganiCam: A Light-Weight Standoff Time-Resolv...,mid,0.975897
89,Seismometer to Investigate Interior Asteroid S...,mid,0.66371


In [14]:
# === Cell 10: add high-confidence pseudo-labels ===
HIGH_TH = 0.85  # minimum pseudo_conf for a pseudo-label to be kept; tune as needed

df_pseudo_strong = df_unlabeled[df_unlabeled["pseudo_conf"] >= HIGH_TH].copy()

# Mirror the columns used for the NASA rows: band name (low / mid / high) and
# the model's integer label id.
df_pseudo_strong["trl_band"] = df_pseudo_strong["pseudo_trl"]
df_pseudo_strong["label"] = df_pseudo_strong["pseudo_label_id"]

print("Strong pseudo-labeled samples:", len(df_pseudo_strong))

# NASA side already has trl_band (string) and label (0/1/2); only trl is cast
# to float here.
df_nasa["trl"] = df_nasa["trl"].astype(float)

# Final labeled corpus = NASA rows + confident pseudo-labeled rows.
# Columns that are entirely NA inside one frame (e.g. `trl` for the pseudo rows,
# which were selected because trl is missing) are dropped from that frame before
# concatenating, so they do not take part in dtype resolution and pandas does
# not warn about all-NA entries. `reindex` then restores the full column set in
# the order a plain concat would produce; a column that is all-NA in both frames
# comes back as an all-NaN column.
frames = [df_nasa, df_pseudo_strong]
final_columns = list(df_nasa.columns) + [
    col for col in df_pseudo_strong.columns if col not in df_nasa.columns
]
df_labeled_final = pd.concat(
    [frame.dropna(axis="columns", how="all") for frame in frames],
    ignore_index=True,
).reindex(columns=final_columns)

# Build the output path once so the write and the log line cannot diverge.
labeled_path = DATA_DIR / "df_corpus_labeled.parquet"
df_labeled_final.to_parquet(labeled_path, index=False)
print("Saved:", labeled_path)


Strong pseudo-labeled samples: 4266


  df_labeled_final = pd.concat(


Saved: ..\data\df_corpus_labeled.parquet
