In [1]:
# === Cell 1: imports & paths ===
import os
import pandas as pd
from pathlib import Path

# Directory that holds the corpus files; it is created if it does not exist yet.
DATA_DIR = Path("..") / "data"
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Location of the combined corpus stored as a parquet file.
CORPUS_PATH = DATA_DIR.joinpath("df_corpus.parquet")



In [5]:
# === Cell 2: load or build df_corpus ===
# If the combined parquet already exists, it can be loaded directly instead:
# df_corpus = pd.read_parquet(CORPUS_PATH)

# Otherwise read each source CSV and stack them into a single frame.
df_nasa = pd.read_csv(DATA_DIR.joinpath("nasaproject_corpus.csv"))
df_news = pd.read_csv(DATA_DIR.joinpath("news_ai_corpus_2015_2025.csv"))
df_paper = pd.read_csv(DATA_DIR.joinpath("paper_ai_corpus_2015_2025.csv"))
df_patent = pd.read_csv(DATA_DIR.joinpath("patent_ai_corpus_2015_2025.csv"))

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df_corpus = pd.concat(
    [df_nasa, df_news, df_paper, df_patent],
    ignore_index=True,
)

df_corpus.head()


Unnamed: 0,title,abstract,detailed_text,year,month,source_type,tech_field,trl
0,High TRL Rover Lidar,Design and build a LIDAR engineering test unit...,The Space Qualified Rover Light Detection and ...,2024,10,nasa_project,Sensing and Perception for Autonomous Systems,6.0
1,Standardizing a Data and Power System for GSFC...,There are three primary Tasks associated with ...,"To our knowledge, an NO2 sonde has never been ...",2024,10,nasa_project,Environment Sensors,7.0
2,Development of ACADIA-to-CCD Camera Platform,Develop a platform to adapt the Goddard-develo...,The project will benefit systems and instrumen...,2024,10,nasa_project,Detectors and Focal Planes,4.0
3,Atom Interferometer Gravity Gradiometer Techno...,Cold atom interferometers enable measurements ...,The AIGG laser system lacks a path toward spac...,2024,10,nasa_project,Lasers,4.0
4,Spaceflight Compatible Optical Atomic Strontiu...,Optical atomic clocks are critical tools for N...,OASIC leverages high accuracy timing and exqui...,2024,10,nasa_project,"Communications, Navigation, and Orbital Debris...",2.0


In [6]:
# === Cell 3: basic column checks ===
expected_cols = [
    "title",
    "abstract",
    "detailed_text",
    "year",
    "month",
    "source_type",  # e.g. 'nasa_project', 'news', 'paper', 'patent'
    "tech_field",   # technology area
    "trl",          # per the original note: filled for NASA, NaN for the others
]

# List every expected column the corpus lacks, then show the column dtypes.
missing = [name for name in expected_cols if name not in df_corpus.columns]
print("Missing columns:", missing)
df_corpus.dtypes


Missing columns: []


title             object
abstract          object
detailed_text     object
year               int64
month              int64
source_type       object
tech_field        object
trl              float64
dtype: object

In [17]:
# === Cell 4: build the text column (title + abstract + detailed_text) ===
def safe_str(x):
    """Return ``str(x)``, or an empty string when ``x`` is a missing value.

    Missingness is decided by ``pd.isna`` (covers ``None``, ``NaN``, ``pd.NA``).
    """
    if pd.isna(x):
        return ""
    return str(x)

def _join_text_parts(title, abstract, detailed_text):
    """Combine the three text fields of one record into a single string.

    Missing values are turned into empty strings with ``safe_str`` and then
    skipped, so a record without a title does not start with a stray ". ",
    a record without an abstract has no double space, and a record with no
    text at all yields "" rather than ".".

    Returns:
        "<title>. <abstract> <detailed_text>" with absent pieces left out.
    """
    head = safe_str(title).strip()
    body_parts = [safe_str(abstract).strip(), safe_str(detailed_text).strip()]
    body = " ".join(part for part in body_parts if part)
    if not head:
        return body
    # The period after the title is kept even when nothing follows it, which
    # matches the previous output for title-only records.
    return f"{head}. {body}".strip()


df_corpus["text"] = [
    _join_text_parts(title, abstract, detailed_text)
    for title, abstract, detailed_text in zip(
        df_corpus["title"], df_corpus["abstract"], df_corpus["detailed_text"]
    )
]

# Convert TRL from float to a nullable integer (if the column is present);
# "Int64" keeps missing labels as <NA> instead of failing on NaN.
if "trl" in df_corpus.columns:
    df_corpus["trl"] = df_corpus["trl"].astype("float").round().astype("Int64")

# Use pandas' dedicated string dtype for the categorical/text columns.
df_corpus["source_type"] = df_corpus["source_type"].astype("string")
df_corpus["tech_field"]  = df_corpus["tech_field"].astype("string")
df_corpus["text"] = df_corpus["text"].astype("string")

# Fixed seed so the preview shows the same rows on every run.
df_corpus.sample(5, random_state=42)


Unnamed: 0,title,abstract,detailed_text,year,month,source_type,tech_field,trl,text
21521,Ptc Is A Top Stock To Buy For Industrial Techn...,This computer-aided-design company has made bi...,This computer-aided-design company has made bi...,2022,12,news,internet_of_things,,Ptc Is A Top Stock To Buy For Industrial Techn...
5760,MarsOasis - An Efficient Autonomously Controll...,The MarsOasis™ cultivation system is a versati...,-MarsOasis™ provides fresh food to spacecraft ...,2024,1,nasa_project,"Food Production, Processing, and Preservation",5.0,MarsOasis - An Efficient Autonomously Controll...
9977,Automated High-Volume Manufacturing of Modular...,"Deployable Space Systems, Inc. (DSS) will focu...",NASA space applications are comprised of pract...,2024,1,nasa_project,Photovoltaic Electrical Power,4.0,Automated High-Volume Manufacturing of Modular...
3960,Techniques to Support the Aerial Deployment an...,The proposed effort focuses on two critical ph...,The principal commercial application of this P...,2024,1,nasa_project,Surface Mobility,6.0,Techniques to Support the Aerial Deployment an...
170,End-to-End Trajectory Optimization,Human exploration of the Solar System is in ou...,Why do we want end-to-end optimization? Tradit...,2024,7,nasa_project,Trajectory Design and Analysis,5.0,End-to-End Trajectory Optimization. Human expl...


In [18]:
# Rows whose tech_field is missing, copied so later edits do not touch df_corpus.
df_untechfield = df_corpus.loc[lambda frame: frame["tech_field"].isna()].copy()
df_untechfield

Unnamed: 0,title,abstract,detailed_text,year,month,source_type,tech_field,trl,text
82,QuERI – Quantitative Elemental Reconnaissance ...,Bulk geochemical analyses of planetary surface...,,2024,9,nasa_project,,,QuERI – Quantitative Elemental Reconnaissance ...
12414,Measurement of Secondary Electron Yield (SEY) ...,Secondary Electron Yield (SEY) is a material p...,,2023,12,nasa_project,,,Measurement of Secondary Electron Yield (SEY) ...
13769,Probing Exoplanet Atmospheric Physics with the...,We propose to build and fly the EXoplanet Infr...,,2022,11,nasa_project,,,Probing Exoplanet Atmospheric Physics with the...
13770,Rockets for Extended Source Soft X-ray Spectro...,The soft X-ray background surrounds our local ...,,2022,11,nasa_project,,,Rockets for Extended Source Soft X-ray Spectro...
13771,EUSO-SPB2: second generation Extreme Universe ...,This is the lead Institution Co-Investigator P...,,2022,11,nasa_project,,,EUSO-SPB2: second generation Extreme Universe ...
...,...,...,...,...,...,...,...,...,...
24147,Accelerated Bayesian Calibration and Uncertain...,"Abstract In operational weather models, the ef...","Abstract In operational weather models, the ef...",2025,11,paper,,,Accelerated Bayesian Calibration and Uncertain...
24148,An Approach to AI High-Velocity Development Th...,ABSTRACTAI-assisted development promises subst...,ABSTRACTAI-assisted development promises subst...,2025,11,paper,,,An Approach to AI High-Velocity Development Th...
24149,Campus Resonance AI: A Recursive Communication...,CampusPulse AI: A Recursive Communication Arch...,CampusPulse AI: A Recursive Communication Arch...,2025,11,paper,,,Campus Resonance AI: A Recursive Communication...
24150,The Wayfinder Protocol,"This is Draft 2 of The Wayfinder Protocol, a n...","This is Draft 2 of The Wayfinder Protocol, a n...",2025,11,paper,,,The Wayfinder Protocol. This is Draft 2 of The...


In [13]:
# Show the dtype of every column in df_corpus.
df_corpus.dtypes

title                    object
abstract                 object
detailed_text            object
year                      int64
month                     int64
source_type      string[python]
tech_field       string[python]
trl                       Int64
text             string[python]
dtype: object

In [9]:
# === Cell 5: inspect NASA rows vs. the rest ===
# NASA project rows only.
df_nasa = df_corpus.loc[df_corpus["source_type"].eq("nasa_project")].copy()
# Every row without a TRL label, regardless of source.
df_unlabeled = df_corpus.loc[df_corpus["trl"].isna()].copy()

print("Toplam:", df_corpus.shape[0])
print("NASA:", df_nasa.shape[0])
print("Label'sız:", df_unlabeled.shape[0])

# Distribution of TRL labels among the NASA rows, ordered by TRL value.
df_nasa["trl"].value_counts().sort_index()


Toplam: 25226
NASA: 19631
Label'sız: 10153


trl
0      76
1     122
2     799
3    3615
4    4532
5    2528
6    2264
7     757
8     234
9     146
Name: count, dtype: Int64

In [10]:
# === Cell 6: save ===
# Write the corpus to parquet without storing the row index.
df_corpus.to_parquet(path=CORPUS_PATH, index=False)
print("Saved to:", CORPUS_PATH)


Saved to: ..\data\df_corpus.parquet
