In [None]:
# Names of the top-level automotive focus areas, listed in the same order as
# the keys of AUTO_AREA_SEEDS and AUTO_TOP_SEEDS.
AUTO_TOPS = """
    Sensing_Perception_VehicleUnderstanding
    Communication_Technologies
    Powertrain_Energy_Battery
    Manufacturing_Industrial_AI
    Robotic_Factory_Autonomous_Delivery
    Cybersecurity_Safety_Governance
""".split()


def _area_seed(*fragments):
    """Join the text fragments and wrap them in the {"seed": text} record shape."""
    return {"seed": "".join(fragments)}


# One free-text description ("seed") per focus area. Each description is later
# embedded and compared against document embeddings to assign an area.
AUTO_AREA_SEEDS = {
    # Sensing / perception
    "Sensing_Perception_VehicleUnderstanding": _area_seed(
        "This area covers how a vehicle perceives and reconstructs its surrounding environment ",
        "using sensors such as cameras, radars, lidars and inertial units. The goal is to detect ",
        "lanes, vehicles, pedestrians, obstacles, free space and road structure by combining ",
        "multi-sensor data into a unified spatial representation. It includes object detection, ",
        "segmentation, depth estimation, multi-sensor fusion, 3D mapping, localization and ",
        "environmental scene understanding for driving and navigation.",
    ),
    # Connectivity / networks (includes former software & backend aspects)
    "Communication_Technologies": _area_seed(
        "This area covers all communication between the vehicle, the infrastructure and the cloud. ",
        "It includes cellular connectivity, V2X communication, vehicle-to-cloud data exchange, ",
        "edge and fog computing integration and telematics services. It also spans in-vehicle data ",
        "routing to gateways and central compute units, over-the-air update delivery, remote ",
        "diagnostics and large-scale backend platforms that collect, process and distribute ",
        "vehicle data for services and fleet-level coordination.",
    ),
    # Powertrain / energy / battery (data-driven wording removed)
    "Powertrain_Energy_Battery": _area_seed(
        "This area addresses how energy is generated, stored, managed and converted into vehicle ",
        "motion. It includes battery management systems, charging strategies, electric motors, ",
        "inverters, regenerative braking, hybrid powertrains, fuel cell systems and thermal ",
        "management. Energy efficiency, range optimisation, degradation and ageing mechanisms, ",
        "and grid-interaction concepts such as vehicle-to-grid belong to this domain.",
    ),
    # Manufacturing & industrial AI (energy & data-driven emphasis removed)
    "Manufacturing_Industrial_AI": _area_seed(
        "This area covers artificial intelligence applied to vehicle production and factory ",
        "operations. It includes vision-based quality inspection, defect detection, process ",
        "monitoring, predictive maintenance for machines, production line balancing, scheduling, ",
        "and automation of material handling. The focus is on intelligent, efficient and highly ",
        "automated automotive manufacturing systems rather than energy or data analytics.",
    ),
    # Robotic factory + autonomous delivery systems (revived cluster)
    "Robotic_Factory_Autonomous_Delivery": _area_seed(
        "This area focuses on mobile and stationary robots that act as physical agents in factories ",
        "and logistics. It includes robotic assembly in body and final assembly shops, autonomous ",
        "mobile robots and automated guided vehicles for internal warehouse logistics, last-mile ",
        "delivery robots and specialised mobile platforms equipped with end effectors, nozzles or ",
        "stabiliser arms for industrial processing. Fleet coordination, indoor navigation and ",
        "cooperation of multiple robotic units are central topics.",
    ),
    # Cybersecurity / safety / governance
    "Cybersecurity_Safety_Governance": _area_seed(
        "This area covers protection of vehicle electronics, communication links and data against ",
        "attacks and failures, together with governance and safety concepts. It includes secure ",
        "boot and firmware integrity, cryptographic communication and key management, gateways and ",
        "firewalls, intrusion detection and anomaly monitoring, secure over-the-air update ",
        "mechanisms and backend security for connected vehicle services. Functional safety, ",
        "redundancy concepts and compliance with safety and cybersecurity regulations are also ",
        "part of this domain.",
    ),
}


In [None]:
# Technology-level seed phrases grouped by focus area:
# area name -> {tech label -> short seed phrase}.
AUTO_TOP_SEEDS = {
    "Sensing_Perception_VehicleUnderstanding": {
        "Lidar_Detection": "lidar based 3d object detection",
        "Radar_FF": "automotive mmwave radar feature fusion",
        "Camera_2D": "2d vision object detection yolo fasterrcnn",
        "Camera_3D": "monocular depth estimation and 3d bounding boxes",
        "Sensor_Fusion": "early and late sensor fusion architecture",
        "Occupancy_Grid": "neural implicit occupancy grid mapping",
        "SLAM": "visual inertial slam for autonomous vehicles",
        "Trajectory_Prediction": "vru pedestrian trajectory prediction",
        "Environment_Modeling": "scene graph based driving environment modeling",
    },
    "Communication_Technologies": {
        "V2X": "v2x communication stack c v2x 5g",
        "Telematics": "automotive telematics data pipeline",
        "Fleet_AI": "fleet management ai optimization",
        "Traffic_Coordination": "cooperative traffic signal coordination",
        "Ride_Sharing": "dynamic ride sharing demand prediction",
        "Mobility_Platforms": "mobility as a service cloud platform",
    },
    "Powertrain_Energy_Battery": {
        "BMS": "battery management system diagnostics",
        "SOC_Estimation": "state of charge soc estimation algorithms",
        "SOH_Estimation": "state of health prediction lithium ion",
        "Thermal_Management": "battery thermal management",
        "Fast_Charging": "ev fast charging optimization",
        "Fuel_Cell": "hydrogen fuel cell powertrain control",
        "Inverter_Control": "inverter and motor control algorithms",
        "V2G": "bidirectional charging and vehicle to grid control",
        "Smart_Charging": "smart ev charging v2g integration",
    },
    "Manufacturing_Industrial_AI": {
        "Defect_Detection": "vision based defect inspection",
        "Production_AI": "production scheduling optimization ai",
    },
    # NOTE(review): these four entries describe vehicle software/compute topics,
    # while the AUTO_AREA_SEEDS text for this area describes factory and
    # delivery robots - confirm that this pairing is intended.
    "Robotic_Factory_Autonomous_Delivery": {
        "Digital_Twin": "automotive digital twin simulation",
        "Zonal_Architecture": "zonal ee architecture for software defined vehicle",
        "Central_Compute": "central compute node and vehicle soc",
        "Vehicle_OS": "real time vehicle operating system",
    },
    "Cybersecurity_Safety_Governance": {
        "IDS": "intrusion detection system for in vehicle networks",
        "Secure_Update": "secure boot and ota integrity",
        "Threat_Modeling": "automotive threat modeling",
        "PKI": "automotive pki cryptography",
        "ISO26262": "functional safety iso26262 analysis",
        "SOTIF": "intended functionality sotif framework",
        "Privacy_AI": "privacy preserving vehicle data processing",
    },
}

# Per-area names; each one is the very same dict object stored in AUTO_TOP_SEEDS.
AUTO_TOP_SEEDS_1 = AUTO_TOP_SEEDS["Sensing_Perception_VehicleUnderstanding"]
AUTO_TOP_SEEDS_3 = AUTO_TOP_SEEDS["Communication_Technologies"]
AUTO_TOP_SEEDS_4 = AUTO_TOP_SEEDS["Powertrain_Energy_Battery"]
AUTO_TOP_SEEDS_5 = AUTO_TOP_SEEDS["Manufacturing_Industrial_AI"]
AUTO_TOP_SEEDS_6C = AUTO_TOP_SEEDS["Robotic_Factory_Autonomous_Delivery"]
AUTO_TOP_SEEDS_7 = AUTO_TOP_SEEDS["Cybersecurity_Safety_Governance"]


In [None]:
# === Cell 1: imports & paths ===
from pathlib import Path
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from umap import UMAP
import plotly.express as px

# The following are expected to be defined already in the cells above:
# AUTO_TOPS
# AUTO_AREA_SEEDS
# AUTO_TOP_SEEDS_* and AUTO_TOP_SEEDS

# Data locations: the labelled input corpus and the derived output tables
# share the 01_data/predictive_model folder.
DATA_DIR = Path("../../01_data")
OUT_DIR = DATA_DIR.joinpath("predictive_model")
CORPUS_PATH = OUT_DIR.joinpath("df_auto_corpus_labeled.parquet")

# Model artefact folder; created here (including parents) if it is missing.
MODEL_DIR = Path("../..", "04_models", "predictive_techname")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# On-disk cache for the document embeddings.
EMB_PATH = MODEL_DIR.joinpath("doc_embeddings_area_base.npy")


In [None]:
# === Cell 2: load the corpus, filter it, assign AREA labels + embeddings ===

# Read the full labelled corpus and show how many rows each source type has.
df_full = pd.read_parquet(CORPUS_PATH)
print("ORIJINAL df shape:", df_full.shape)
print(df_full["source_type"].value_counts(), "\n")

# Keep papers and patents only; work on an independent copy of those rows.
keep_types = ["paper", "patent"]
df = df_full.loc[df_full["source_type"].isin(keep_types), :].copy()
print("Sadece paper + patent df shape:", df.shape)
print(df["source_type"].value_counts(), "\n")

# ----- AREA encoder -----
encoder_area = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
print("Encoder dim:", encoder_area.get_sentence_embedding_dimension())

# One prototype vector per area: the mean of the embeddings of every text
# stored under that area in AUTO_AREA_SEEDS. The mean itself is not
# re-normalised afterwards.
area_cat_embeddings = {
    label: encoder_area.encode(
        list(subdict.values()),
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=False,
    ).mean(axis=0)
    for label, subdict in AUTO_AREA_SEEDS.items()
}

# Row i of cat_matrix is the prototype of cat_labels[i]  -> (n_areas, dim)
cat_labels = list(area_cat_embeddings)
cat_matrix = np.stack([area_cat_embeddings[name] for name in cat_labels])

print("cat_matrix shape:", cat_matrix.shape)
print("cat_labels:", cat_labels)

# ----- Document embeddings (doc_emb_non) -----
# Missing texts are embedded as empty strings so that the rows of doc_emb_non
# stay aligned one-to-one with the rows of df.
texts_non = df["text"].fillna("").tolist()


def _encode_documents(texts):
    """Encode *texts* with encoder_area and return normalised row vectors."""
    return encoder_area.encode(
        texts,
        batch_size=64,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )


doc_emb_non = None
if EMB_PATH.exists():
    print(">> doc_embeddings_area_base.npy bulundu, yükleniyor...")
    cached_emb = np.load(EMB_PATH)
    expected_dim = encoder_area.get_sentence_embedding_dimension()
    # The cache is reusable only when it has one row per document and the same
    # width as the current encoder; a width mismatch would otherwise surface
    # later as a shape error in the similarity product.
    cache_ok = (
        cached_emb.ndim == 2
        and cached_emb.shape[0] == len(texts_non)
        and (expected_dim is None or cached_emb.shape[1] == expected_dim)
    )
    if cache_ok:
        doc_emb_non = cached_emb
    else:
        print("!! UYARI: boyut uyuşmuyor, yeniden encode...")
    # NOTE(review): only the shape is validated; a cache built from a different
    # corpus with the same row count would still be accepted.
else:
    print(">> doc_embeddings_area_base.npy yok, encode ediliyor...")

if doc_emb_non is None:
    # Single encode-and-save path shared by the "missing" and "stale" cases.
    doc_emb_non = _encode_documents(texts_non)
    np.save(EMB_PATH, doc_emb_non)

print("Final doc_emb_non shape:", doc_emb_non.shape)
if doc_emb_non.shape[0] != len(texts_non):
    raise ValueError(
        f"doc_emb_non has {doc_emb_non.shape[0]} rows but {len(texts_non)} texts were given"
    )

# ----- AREA prediction from the AUTO_AREA_SEEDS prototypes -----
# Dot product of every document embedding with every area prototype.
sims = np.matmul(doc_emb_non, cat_matrix.T)   # (N_docs, n_areas)

# Best-matching area per document: its column index, its score and its name.
top1_idx = np.argmax(sims, axis=1)
rows = np.arange(len(sims))
top1_scores = sims[rows, top1_idx]
top1_labels = np.asarray(cat_labels, dtype=object)[top1_idx].tolist()

# Write the prediction and its score onto df.
df["auto_top8_pred"] = top1_labels
df["seed_top1_sim"] = top1_scores

# Split an optional leading numeric prefix ("<digits>_" or "<digits>-") from
# the predicted name. The AUTO_AREA_SEEDS keys visible in this file carry no
# such prefix, so for them auto_focus_label has no match and auto_focus_area
# equals the predicted name unchanged.
pred_as_str = df["auto_top8_pred"].astype(str)
df["auto_focus_label"] = pred_as_str.str.extract(r"^\s*(\d+)", expand=False)
df["auto_focus_area"] = pred_as_str.str.replace(
    r"^\s*\d+\s*[_\-]\s*", "", regex=True
)

print("AREA dağılımı (paper+patent):")
print(df["auto_focus_area"].value_counts())


In [None]:
# === Cell 3: fit the 2-D UMAP once and build df_umap ===

# Projection settings; random_state is fixed.
umap_params = {
    "n_components": 2,
    "metric": "cosine",
    "random_state": 42,
    "n_neighbors": 40,
    "min_dist": 0.1,
}
umap_2d = UMAP(**umap_params)

X_2d = umap_2d.fit_transform(doc_emb_non)

# Keep the original row index as a column, then attach the two UMAP
# coordinates. Row order is unchanged, so row i still matches doc_emb_non[i].
df_umap = df.reset_index().rename(columns={"index": "orig_index"})
df_umap["x_2d"] = X_2d[:, 0]
df_umap["y_2d"] = X_2d[:, 1]

print("UMAP için doküman sayısı:", len(df_umap))
print(df_umap["auto_focus_area"].value_counts())


In [None]:
def wrap_vertical(text, max_len=40):
    """Break *text* into chunks of at most *max_len* characters joined by "<br>".

    Chunks are cut at fixed character offsets, so words may be split.
    Any value that is not a ``str`` yields an empty string.
    """
    if isinstance(text, str):
        chunk_starts = range(0, len(text), max_len)
        return "<br>".join(text[start:start + max_len] for start in chunk_starts)
    return ""

# Hover-friendly copy of the text column: fixed-width chunks separated by <br>.
df_umap["text_vertical"] = df_umap["text"].map(wrap_vertical)

In [None]:
# === Cell 4: reference plot - UMAP coloured by AREA only ===

hover_columns = ["source_type", "text_vertical", "auto_focus_area"]
fig = px.scatter(
    df_umap,
    x="x_2d",
    y="y_2d",
    color="auto_focus_area",
    hover_data=hover_columns,
)

# Small semi-transparent markers; white, left-aligned hover boxes.
marker_style = {"size": 5, "opacity": 0.7}
hover_style = {"bgcolor": "white", "font_size": 12, "align": "left"}
fig.update_traces(marker=marker_style, hoverlabel=hover_style)

# Wide canvas, drag-to-pan, both axes auto-ranged and left zoomable.
free_axis = {"autorange": True, "fixedrange": False}
fig.update_layout(
    width=2200,
    height=900,
    legend_title_text="Area",
    dragmode="pan",
    xaxis=dict(free_axis),
    yaxis=dict(free_axis),
)

fig.show(config={"scrollZoom": True})


In [None]:
# === Cell 5: assign auto_tech_cluster inside each AREA using AUTO_TOP_SEEDS ===

# Per area: the tech labels and, in the same order, one embedding row per
# tech seed phrase.
TECH_MATRICES = {}
TECH_LABELS = {}

for area, tech_dict in AUTO_TOP_SEEDS.items():
    TECH_LABELS[area] = list(tech_dict)
    TECH_MATRICES[area] = encoder_area.encode(
        list(tech_dict.values()),
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=False,
    )

print("Tech matrices hazır. Alan sayısı:", len(TECH_MATRICES))

# Assign every document the best-matching tech label within its predicted
# area. Row i of df_umap is scored with doc_emb_non[i], so both must have the
# same length and order.
if len(df_umap) != doc_emb_non.shape[0]:
    raise ValueError(
        f"df_umap has {len(df_umap)} rows but doc_emb_non has {doc_emb_non.shape[0]}"
    )

# Documents whose area has no tech seeds keep None.
tech_pred_arr = np.full(len(df_umap), None, dtype=object)

for area, tech_matrix in TECH_MATRICES.items():
    area_labels = TECH_LABELS.get(area)
    if tech_matrix is None or area_labels is None:
        continue

    in_area = df_umap["auto_focus_area"].eq(area).to_numpy()
    if not in_area.any():
        continue

    # One matrix product per area instead of one vector product per row.
    tech_sims = doc_emb_non[in_area] @ tech_matrix.T   # (n_docs_in_area, n_tech)
    best_idx = tech_sims.argmax(axis=1)
    tech_pred_arr[in_area] = np.asarray(area_labels, dtype=object)[best_idx]

tech_preds = tech_pred_arr.tolist()
df_umap["auto_tech_cluster"] = tech_preds

print(df_umap[["auto_focus_area", "auto_tech_cluster"]].head(20))


In [None]:
# === Cell 6: colour = area (fixed), symbol = tech cluster; same UMAP layout ===

# NOTE(review): "title" is not read anywhere else in this notebook - confirm
# the corpus has that column, otherwise px.scatter will reject hover_data.
fig = px.scatter(
    df_umap,
    x="x_2d",
    y="y_2d",
    color="auto_focus_area",        # same colouring column as the Cell 4 reference plot
    symbol="auto_tech_cluster",     # marker shape splits each area by tech cluster
    hover_data=["title", "auto_focus_area", "auto_tech_cluster"],
)

fig.update_traces(marker=dict(size=5, opacity=0.7))
fig.update_layout(
    width=2200,
    height=900,
    legend_title_text="Area / Tech",
    dragmode="pan",
    xaxis=dict(autorange=True, fixedrange=False),
    yaxis=dict(autorange=True, fixedrange=False),
)

fig.show(config={"scrollZoom": True})

print("Granüler UMAP doküman sayısı:", len(df_umap))
print(df_umap["auto_focus_area"].value_counts())


In [None]:
# Display the column index of df_umap (notebook cell output).
df_umap.columns

In [None]:
# Columns written to the output parquet file.
cols = [
    "year",
    "month",
    "text",
    "source_type",
    "auto_focus_area",
    "auto_tech_cluster",
]

# Independent copy restricted to those columns.
df_auto_corpus_area_tech = df_umap.loc[:, cols].copy()

save_path = OUT_DIR.joinpath("df_auto_corpus_area_tech.parquet")
df_auto_corpus_area_tech.to_parquet(save_path)

# Last expression of the cell: shows the written path as cell output.
save_path