## TEMPORAL EDA & DATA PREPARATION FOR STREAMLIT.APP 

This notebook performs exploratory data analysis (EDA) by generating basic time-series visualizations using the cleaned dataset. In addition, it prepares the base data structures required for visualizations and analytics to be used in the Streamlit application.

In [None]:
# === Cell 1: imports & paths ===
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Relative folder that holds the predictive-model datasets.
DATA_DIR = Path("../../01_data").joinpath("predictive_model")
# Parquet corpus file located inside DATA_DIR.
CORPUS_PATH = DATA_DIR.joinpath("df_auto_corpus_tech_free.parquet")



In [None]:
# === Cell 2: load + basic filters (paper/patent + date) ===

# Load the corpus, keep only papers and patents, drop rows that lack a
# year or month, and cast both date parts to integers in a single chain.
df = (
    pd.read_parquet(CORPUS_PATH)
    .loc[lambda frame: frame["source_type"].isin(["paper", "patent"])]
    .dropna(subset=["year", "month"])
    .astype({"year": int, "month": int})
)

# Every document is stamped with the first day of its (year, month).
df["date"] = pd.to_datetime(
    {"year": df["year"], "month": df["month"], "day": 1}
)

df.head()


In [None]:
# === Cell 3: Data preparation and rough EDA ===

# Monthly document counts: one row per month start, one column per source type.
m = (df.set_index("date")
       .groupby("source_type")
       .resample("MS")
       .size()
       .unstack(0, fill_value=0)
       .sort_index())

# resample() runs per source type, so it only fills empty months inside each
# type's own first-to-last range; the combined index can still skip months.
# rolling() and pct_change() below count rows, not calendar time, so force a
# contiguous month-start index (missing months count as 0 documents).
m = m.asfreq("MS", fill_value=0)

# Guarantee both count columns exist even if one source type is absent.
for c in ["paper", "patent"]:
    if c not in m.columns:
        m[c] = 0

m["total"] = m["paper"] + m["patent"]

# Trailing ROLL-month mean; min_periods=1 keeps the first months instead of NaN.
ROLL = 9
m["total_roll"]  = m["total"].rolling(ROLL, min_periods=1).mean()
m["patent_roll"] = m["patent"].rolling(ROLL, min_periods=1).mean()

# Activity growth = YoY change in total activity, in percent.
# The first 12 rows are NaN; the value is inf/NaN wherever the smoothed total
# 12 months earlier was 0.
m["activity_growth_yoy"] = m["total_roll"].pct_change(12) * 100

# =========================
# Plot
# =========================
fig, ax1 = plt.subplots(figsize=(13.5, 5))

# --- Total activity volume (shaded area plus outline)
ax1.fill_between(
    m.index, m["total_roll"],
    alpha=0.18, color="#4C72B0"
)
line_total, = ax1.plot(
    m.index, m["total_roll"],
    linewidth=2.4, color="#4C72B0",
    label="Total activity (papers + patents)"
)

# --- Patent count
line_patent, = ax1.plot(
    m.index, m["patent_roll"],
    linewidth=2.3, color="#55A868",
    label="Patent count"
)

ax1.set_ylabel("Documents per month (rolling)")
ax1.set_xlabel("Date")
ax1.set_title("Research & IP Activity Over Time")

# --- Activity growth (YoY) - less faded
# NOTE(review): this series is a percentage but is drawn on the same y-axis as
# the document counts; consider a secondary axis if the scales diverge.
line_growth, = ax1.plot(
    m.index, m["activity_growth_yoy"],
    linestyle="--",
    linewidth=1.6,
    color="#7F7F7F",
    alpha=0.65,
    label="Activity growth rate (YoY)"
)

# =========================
# Legend INSIDE
# =========================
handles = [line_total, line_patent, line_growth]
# Reuse the labels already attached to each line so the text is defined once.
labels = [h.get_label() for h in handles]

ax1.legend(
    handles, labels,
    loc="upper left",
    frameon=True,
    framealpha=0.9
)

ax1.grid(alpha=0.12)
plt.tight_layout()
plt.show()



In [None]:
# === Cell 4: Data Preparation for area-tech-date level time series (df_ts) in Streamlit ===

# Keys that identify one row of the output time series.
ts_keys = ["auto_focus_area", "auto_tech_cluster", "date"]

# Long format: document count "n" per (area, tech cluster, date, source type).
g = df.groupby(ts_keys + ["source_type"]).size().reset_index(name="n")

# Wide format: one count column per source type; absent combinations become 0.
pivot = g.pivot_table(
    index=ts_keys,
    columns="source_type",
    values="n",
    fill_value=0,
).reset_index()

# Drop the "source_type" label that pivot_table leaves on the column axis.
pivot.columns.name = None

# Both count columns must exist even when a source type never occurs.
for kind in ("paper", "patent"):
    if kind not in pivot:
        pivot[kind] = 0

n_total = pivot["paper"] + pivot["patent"]
pivot["n_total"] = n_total

# Shares of each source type; rows with a zero total get 0.0.
has_docs = n_total > 0
pivot["share_paper"] = np.where(has_docs, pivot["paper"] / n_total, 0.0)
pivot["share_patent"] = np.where(has_docs, pivot["patent"] / n_total, 0.0)

df_ts = pivot.copy()
df_ts.head()


In [None]:
# === Cell 13: save outputs ===

# Build the destination once, write the time series without its index,
# then echo the path that was written.
ts_output_path = DATA_DIR.joinpath("area_tech_timeseries.parquet")
df_ts.to_parquet(ts_output_path, index=False)

print(ts_output_path)

