In [None]:
# === Cell 1: imports & paths ===
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt

# Folder that holds the parquet files this notebook reads and writes.
DATA_DIR = Path("../data")

# Document-level corpus (read in the load cell).
PATH_CORPUS_TOPICS = DATA_DIR.joinpath("df_corpus_with_topics.parquet")
# Topic-level TRL time series (read in the load cell).
PATH_TOPIC_TS = DATA_DIR.joinpath("topic_trl_timeseries.parquet")


In [None]:
# === Cell 2: load data ===

# Original document-level data (contains source_type, topic_id, etc.)
df_corpus = pd.read_parquet(PATH_CORPUS_TOPICS)

# Topic-level TRL time series (output of ipynb4)
df_topic_ts = pd.read_parquet(PATH_TOPIC_TS)

# Preview the first rows of both frames.
(df_topic_ts.head(), df_corpus.head())


In [None]:
# === Cell 3: ensure date/time types & basic filters ===
# `.assign` returns a new frame, so the objects loaded above are not modified in place.
df_topic_ts = df_topic_ts.assign(date=pd.to_datetime(df_topic_ts["date"]))

df_corpus = df_corpus.assign(
    year=df_corpus["year"].astype(int),
    month=df_corpus["month"].astype(int),
)
# Build a first-of-month timestamp from the integer year/month columns.
df_corpus = df_corpus.assign(
    date=pd.to_datetime(
        {"year": df_corpus["year"], "month": df_corpus["month"], "day": 1}
    )
)

# Optionally restrict the analysis to AI-related topics only:
# df_topic_ts = df_topic_ts[df_topic_ts["topic_name"].str.contains("AI", case=False, na=False)]


In [None]:
# === Cell 4: doc-type (paper / patent / news / nasa) share time series ===
# Goal: derive the source-type distribution for every topic + date.

def build_topic_source_timeseries(df_corpus: pd.DataFrame) -> pd.DataFrame:
    """
    Count documents per (topic, date, source category) and derive category shares.

    Parameters
    ----------
    df_corpus : pd.DataFrame
        One row per document; must contain the columns
        ``topic_id``, ``topic_name``, ``date`` and ``source_type``.
        The input frame is not modified.

    Returns
    -------
    pd.DataFrame
        One row per (topic_id, topic_name, date) with the columns
        topic_id, topic_name, date,
        paper, patent, news, nasa, other   (integer document counts),
        n_total                            (sum of the five counts),
        share_paper, share_patent, share_news, share_nasa
                                           (count / n_total, 0.0 if n_total is 0).
    """
    # Simple category map derived from source_type (update if the vocabulary changes).
    # NOTE(review): matching is substring-based and ordered, so a value such as
    # "newspaper" would hit "paper" before "news" -- confirm against the real
    # source_type values.
    def map_source(st):
        st = str(st).lower()
        if "paper" in st:
            return "paper"
        if "patent" in st:
            return "patent"
        if "news" in st:
            return "news"
        if "nasa" in st or "project" in st:
            return "nasa"
        return "other"

    categories = ["paper", "patent", "news", "nasa", "other"]

    # Only the grouping keys are needed; avoid copying the whole corpus.
    docs = df_corpus[["topic_id", "topic_name", "date"]].assign(
        src_cat=df_corpus["source_type"].map(map_source)
    )

    # size() + unstack keeps the counts as integers; a pivot_table with its
    # default mean aggregation would route them through floats.
    pivot = (
        docs.groupby(["topic_id", "topic_name", "date", "src_cat"])
        .size()
        .unstack("src_cat", fill_value=0)
        .reset_index()
    )
    pivot.columns.name = None

    # Guarantee every category column exists even if it never occurs in the data.
    for col in categories:
        if col not in pivot.columns:
            pivot[col] = 0

    pivot["n_total"] = pivot[categories].sum(axis=1)

    # "other" contributes to n_total but gets no share column of its own.
    for col in ["paper", "patent", "news", "nasa"]:
        pivot[f"share_{col}"] = np.where(
            pivot["n_total"] > 0,
            pivot[col] / pivot["n_total"],
            0.0
        )

    return pivot

# Build the per-topic, per-date source-count/share table from the corpus and preview it.
df_topic_sources = build_topic_source_timeseries(df_corpus)
df_topic_sources.head()


In [None]:
# === Cell 5: merge TRL time series + source shares ===
# Shared join keys: topic_id, topic_name, date
# A left join keeps every row of the TRL series; rows with no matching key in
# the source table get NaN in the count/share columns.
df_ts_full = df_topic_ts.merge(
    df_topic_sources,
    how="left",
    on=["topic_id", "topic_name", "date"],
)

df_ts_full.head()


In [None]:
# === Cell 6: small helper – encode a time series with an integer time axis t ===
def prepare_time_series(group_df: pd.DataFrame, value_col: str, min_points: int = 4):
    """
    Turn one topic's rows into regression inputs (t, y).

    Parameters
    ----------
    group_df : pd.DataFrame
        Rows of a single topic; must contain ``date`` and ``value_col``.
        Row order does not matter, the rows are sorted by date here.
    value_col : str
        Column to model (e.g. 'trl_mean', 'n_docs', 'share_paper').
    min_points : int, default 4
        Minimum number of rows with both a date and a value; below this no
        series is returned.

    Returns
    -------
    (t, y)
        t : int array of shape (n, 1) -- calendar months elapsed since the first
            usable observation. For gap-free monthly data this is 0, 1, 2, ...;
            a missing month leaves a gap in t instead of being squeezed out, so
            a fitted slope is always "per month".
        y : float array of shape (n,) -- the values of ``value_col``.
        ``(None, None)`` when fewer than ``min_points`` usable rows exist.
    """
    g = group_df[["date", value_col]].dropna().sort_values("date")

    if g.empty or len(g) < min_points:
        return None, None

    # Absolute month number (year * 12 + month), shifted so the first row is 0.
    dates = pd.to_datetime(g["date"])
    month_no = (
        dates.dt.year.to_numpy().astype(int) * 12
        + dates.dt.month.to_numpy().astype(int)
    )
    t = (month_no - month_no[0]).reshape(-1, 1)
    y = g[value_col].to_numpy().astype(float)
    return t, y


In [None]:
# === Cell 7: simple linear-regression forecast function ===
def linear_forecast(group_df: pd.DataFrame, value_col: str, horizon: int = 12):
    """
    Fit a straight line to one topic's ``value_col`` and extrapolate it.

    Parameters
    ----------
    group_df : pd.DataFrame
        Rows of a single topic; must contain ``date`` and ``value_col``.
    value_col : str
        Column to forecast.
    horizon : int, default 12
        Number of future month-start dates to forecast.

    Returns
    -------
    (actual_df, forecast_df)
        actual_df   : ``group_df`` sorted by date (all rows, including rows
                      where ``value_col`` is missing).
        forecast_df : columns ``date`` and ``forecast_<value_col>`` with
                      ``horizon`` rows, or ``None`` when ``prepare_time_series``
                      reports too little data.
    """
    g = group_df.sort_values("date")
    t, y = prepare_time_series(g, value_col)
    if t is None:
        return g, None  # not enough data -> no forecast

    model = LinearRegression()
    model.fit(t, y)

    # Continue the fitted time axis one step at a time past its last value.
    last_t = t[-1, 0]
    future_t = np.arange(last_t + 1, last_t + 1 + horizon).reshape(-1, 1)
    y_pred = model.predict(future_t)

    # Anchor the forecast dates on the last row that actually carries a value:
    # that is where the model's time axis ends. Using the last date of the whole
    # group would shift the forecast whenever trailing rows have a missing value.
    observed = g[["date", value_col]].dropna()
    last_date = observed["date"].max()
    future_dates = pd.date_range(
        start=last_date + pd.offsets.MonthBegin(1),
        periods=horizon,
        freq="MS"
    )

    forecast_df = pd.DataFrame({
        "date": future_dates,
        f"forecast_{value_col}": y_pred
    })
    return g, forecast_df


In [None]:
# === Cell 8: example forecast & plot for a single topic ===
sample_topic = df_ts_full["topic_id"].dropna().unique()[0]
tmp = df_ts_full[df_ts_full["topic_id"] == sample_topic].copy()

# TRL forecast
actual_trl, forecast_trl = linear_forecast(tmp, "trl_mean", horizon=12)

# volume forecast (n_docs)
actual_vol, forecast_vol = linear_forecast(tmp, "n_docs", horizon=12)

# One panel per forecasted column: (column, legend label, actual frame, forecast frame).
panels = [
    ("trl_mean", "TRL", actual_trl, forecast_trl),
    ("n_docs", "volume", actual_vol, forecast_vol),
]

fig, axes = plt.subplots(1, len(panels), figsize=(14, 4))
for ax, (col, label, actual, forecast) in zip(axes, panels):
    ax.plot(actual["date"], actual[col], label=f"actual {label}")
    # linear_forecast returns None when the topic has too few observations.
    if forecast is not None:
        ax.plot(
            forecast["date"],
            forecast[f"forecast_{col}"],
            linestyle="--",
            label=f"forecast {label}",
        )
    ax.set(title=f"Topic {sample_topic}: {col}", xlabel="date", ylabel=col)
    ax.legend()
    ax.tick_params(axis="x", rotation=45)
fig.tight_layout()
plt.show()

# Guard the preview too: calling .head() on a missing forecast would raise.
actual_trl.tail(), (forecast_trl.head() if forecast_trl is not None else None)


In [None]:
# === Cell 9: maturity forecast – which topics will rise above TRL 6/7? ===
def forecast_topic_maturity(df_ts_full: pd.DataFrame, horizon: int = 12, trl_threshold: float = 6.0):
    """
    Forecast ``trl_mean`` per topic and flag topics whose forecast reaches a threshold.

    Parameters
    ----------
    df_ts_full : pd.DataFrame
        Must contain ``topic_id``, ``topic_name``, ``date`` and ``trl_mean``.
    horizon : int, default 12
        Forecast length passed to ``linear_forecast``.
    trl_threshold : float, default 6.0
        A topic is flagged when its maximum forecasted TRL is >= this value.
        The flag is also True for a topic that is already at or above the
        threshold, as long as the forecast stays there.

    Returns
    -------
    pd.DataFrame
        One row per topic that could be forecast, with the columns
        topic_id, topic_name, last_date, last_trl_mean,
        forecast_max_trl_next_horizon, likely_to_cross_threshold.
        The columns are present even when no topic could be forecast.
    """
    columns = [
        "topic_id",
        "topic_name",
        "last_date",
        "last_trl_mean",
        "forecast_max_trl_next_horizon",
        "likely_to_cross_threshold",
    ]
    rows = []

    for tid, g in df_ts_full.groupby("topic_id"):
        topic_name = g["topic_name"].iloc[0]
        actual, fore = linear_forecast(g, "trl_mean", horizon=horizon)
        if fore is None:
            continue

        # `actual` still contains rows without a TRL value; report the last row
        # that has one so last_date / last_trl_mean are never NaN.
        observed = actual.dropna(subset=["date", "trl_mean"])

        # maximum forecasted TRL inside the horizon
        max_trl = fore["forecast_trl_mean"].max()
        last_trl = observed["trl_mean"].iloc[-1]

        rows.append({
            "topic_id": tid,
            "topic_name": topic_name,
            "last_date": observed["date"].max(),
            "last_trl_mean": last_trl,
            "forecast_max_trl_next_horizon": max_trl,
            "likely_to_cross_threshold": bool(max_trl >= trl_threshold)
        })

    # Explicit columns keep the schema stable for an empty result, so later
    # sort_values / to_parquet calls still work.
    return pd.DataFrame(rows, columns=columns)

# Run the maturity forecast for every topic and show the 20 highest forecasted TRL maxima.
df_maturity_forecast = forecast_topic_maturity(df_ts_full, horizon=12, trl_threshold=6.0)
df_maturity_forecast.sort_values("forecast_max_trl_next_horizon", ascending=False).head(20)


In [None]:
# === Cell 10: fastest-growing topics – ranking by volume slope ===
def estimate_topic_growth(df_ts_full: pd.DataFrame, horizon: int = 12):
    """
    Rank topics by the slope of a linear fit to their document volume.

    Simple measure:
    - fit a linear model to ``n_docs`` over the time index returned by
      ``prepare_time_series`` and take its slope
    - topics with a large slope are the "fastest growing"

    Parameters
    ----------
    df_ts_full : pd.DataFrame
        Must contain ``topic_id``, ``topic_name``, ``date`` and ``n_docs``.
    horizon : int, default 12
        Currently unused; kept so existing calls that pass it keep working.

    Returns
    -------
    pd.DataFrame
        One row per topic with enough data, with the columns
        topic_id, topic_name, growth_slope_n_docs, n_docs_last.
        The columns are present even when no topic qualifies.
    """
    columns = ["topic_id", "topic_name", "growth_slope_n_docs", "n_docs_last"]
    rows = []

    for tid, g in df_ts_full.groupby("topic_id"):
        topic_name = g["topic_name"].iloc[0]
        t, y = prepare_time_series(g, "n_docs")
        if t is None:
            continue  # too few observations for this topic

        model = LinearRegression()
        model.fit(t, y)
        slope = float(model.coef_[0])

        rows.append({
            "topic_id": tid,
            "topic_name": topic_name,
            "growth_slope_n_docs": slope,
            "n_docs_last": y[-1]  # last value of the fitted series
        })

    # Explicit columns keep the schema stable for an empty result, so later
    # sort_values / to_parquet calls still work.
    return pd.DataFrame(rows, columns=columns)

# Estimate the volume slope for every topic and show the 20 steepest ones.
df_growth = estimate_topic_growth(df_ts_full)
df_growth.sort_values("growth_slope_n_docs", ascending=False).head(20)


In [None]:
# === Cell 11: academy → application transition index forecast ===
def compute_transition_index(df_ts_full: pd.DataFrame, alpha: float = 1.0):
    """
    Return a copy of the frame with a ``transition_index`` column added.

    TransitionIndex = alpha * (share_patent + share_news + share_nasa) - share_paper

    alpha scales the combined patent/news/nasa side before share_paper is
    subtracted. A share column that is absent from the frame counts as 0.0.
    The input frame itself is left untouched.
    """
    def share(column_name):
        # Series when the column exists, scalar 0.0 otherwise.
        return df_ts_full.get(column_name, 0.0)

    application_side = share("share_patent") + share("share_news") + share("share_nasa")
    index_values = application_side * alpha - share("share_paper")

    # assign() builds a new frame, so the caller's object is not mutated.
    return df_ts_full.assign(transition_index=index_values)

# Rebind df_ts_full to the copy that carries transition_index. Re-running this cell
# is harmless: the column is recomputed from the share columns each time.
df_ts_full = compute_transition_index(df_ts_full)
df_ts_full[["topic_id", "topic_name", "date", "transition_index"]].head()


In [None]:
# === Cell 12: transition forecast – topics expected to shift towards application ===
def forecast_topic_transition(df_ts_full: pd.DataFrame, horizon: int = 12):
    """
    Forecast ``transition_index`` per topic and flag topics moving towards application.

    Parameters
    ----------
    df_ts_full : pd.DataFrame
        Must contain ``topic_id``, ``topic_name``, ``date`` and ``transition_index``.
    horizon : int, default 12
        Forecast length passed to ``linear_forecast``.

    Returns
    -------
    pd.DataFrame
        One row per topic that could be forecast, with the columns
        topic_id, topic_name, last_transition_index,
        forecast_transition_index_mean, likely_to_shift_to_application.
        The flag is True when the mean forecast is positive and above the last
        observed value. The columns are present even when no topic qualifies.
    """
    columns = [
        "topic_id",
        "topic_name",
        "last_transition_index",
        "forecast_transition_index_mean",
        "likely_to_shift_to_application",
    ]
    rows = []

    for tid, g in df_ts_full.groupby("topic_id"):
        topic_name = g["topic_name"].iloc[0]
        actual, fore = linear_forecast(g, "transition_index", horizon=horizon)
        if fore is None:
            continue

        # `actual` still contains rows with a missing index; compare against the
        # last row that has a value. A NaN here would make the
        # `future_mean > last_val` test silently False.
        observed = actual.dropna(subset=["date", "transition_index"])
        last_val = observed["transition_index"].iloc[-1]
        future_mean = fore["forecast_transition_index"].mean()

        rows.append({
            "topic_id": tid,
            "topic_name": topic_name,
            "last_transition_index": last_val,
            "forecast_transition_index_mean": future_mean,
            "likely_to_shift_to_application": bool(future_mean > 0 and future_mean > last_val)
        })

    # Explicit columns keep the schema stable for an empty result, so later
    # sort_values / to_parquet calls still work.
    return pd.DataFrame(rows, columns=columns)

# Run the transition forecast for every topic and show the 20 highest mean forecasts.
df_transition_forecast = forecast_topic_transition(df_ts_full, horizon=12)
df_transition_forecast.sort_values("forecast_transition_index_mean", ascending=False).head(20)


In [None]:
# === Cell 13: save summary tables ===
# (frame, file name) pairs, in the order they are written and reported.
summary_tables = [
    (df_maturity_forecast, "forecast_topic_maturity.parquet"),
    (df_growth, "forecast_topic_growth.parquet"),
    (df_transition_forecast, "forecast_topic_transition.parquet"),
]

# Write every table first; the report below is only printed once all writes succeeded.
for frame, filename in summary_tables:
    frame.to_parquet(DATA_DIR / filename, index=False)

print("Saved:")
for _, filename in summary_tables:
    print(DATA_DIR / filename)