In [1]:
import pandas as pd

In [2]:
# Load the raw patent export and preview its first rows.
df_patent = pd.read_csv("../functions/patent_ai_all_2015_2025.csv")
df_patent.head()

Unnamed: 0,doc_id,title,text,date,year,month,source_type,tech_field,trl_true,citation_count
0,US-8966465-B2,[{'text': 'Customization creation and update f...,[{'text': 'Embodiments of the present inventio...,2015-02-24,2015,2,patent,"[{'code': 'G06F8/51', 'inventive': True, 'firs...",,0
1,US-8977584-B2,"[{'text': 'Apparatuses, methods and systems fo...","[{'text': 'The APPARATUSES, METHODS AND SYSTEM...",2015-03-10,2015,3,patent,"[{'code': 'G10L25/27', 'inventive': True, 'fir...",,0
2,US-9001118-B2,[{'text': 'Avatar construction using depth cam...,[{'text': 'A method for constructing an avatar...,2015-04-07,2015,4,patent,"[{'code': 'G06V10/426', 'inventive': True, 'fi...",,0
3,US-9031824-B2,[{'text': 'Real-time predictive systems for in...,[{'text': 'A system for intelligent monitoring...,2015-05-12,2015,5,patent,"[{'code': 'G05B13/048', 'inventive': True, 'fi...",,0
4,US-9039681-B2,[{'text': 'Minimally invasive surgical trainin...,[{'text': 'A medical system that allows a ment...,2015-05-26,2015,5,patent,"[{'code': 'A61B2019/2292', 'inventive': False,...",,0


In [3]:
import pandas as pd
import ast
import re

# 1) Read the raw patent CSV (rebinds df_patent from the file on disk)
df_patent = pd.read_csv("../functions/patent_ai_all_2015_2025.csv")

# -------------------------------
# Helper functions
# -------------------------------

def _is_missing(value):
    """Return True for the scalar 'missing' markers: None, pd.NA, or a float NaN."""
    if value is None or value is pd.NA:
        return True
    # Only floats are passed to pd.isna: calling it on a list/dict would
    # return an array (or fail) instead of a single bool.
    return isinstance(value, float) and pd.isna(value)


def extract_text_listdict(cell):
    """
    Flatten a text-bearing cell into one plain string.

    Accepted inputs:
      * the string repr of a list of dicts, e.g. "[{'text': '...'}]"
      * an actual list (dicts with a 'text' key, or any other values)
      * a single dict with a 'text' key
      * a plain string (or any other scalar, which is passed through str())

    Returns:
      The 'text' values joined with single spaces and stripped.
      "" when the cell itself is missing (None / float NaN / pd.NA).
      The stripped input when a list-looking string cannot be parsed.

    Missing entries inside a list or dict (None / NaN / pd.NA) are skipped
    instead of being rendered as the literal strings "None" / "nan" / "<NA>".
    """
    if _is_missing(cell):
        return ""

    parsed = cell

    # Only strings that look like a list repr mentioning 'text' are parsed;
    # everything else is treated as already-structured or plain text.
    if isinstance(cell, str):
        stripped = cell.strip()
        if stripped.startswith("[") and "text" in cell:
            try:
                # Parse the stripped form so leading whitespace cannot turn
                # an otherwise valid literal into a syntax error.
                parsed = ast.literal_eval(stripped)
            except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
                # ast.literal_eval documents all of these for malformed
                # input; fall back to the raw text rather than aborting.
                return stripped

    # list of dicts (or a list of arbitrary values)
    if isinstance(parsed, list):
        parts = []
        for item in parsed:
            value = item["text"] if isinstance(item, dict) and "text" in item else item
            if _is_missing(value):
                continue
            parts.append(str(value))
        return " ".join(p for p in parts if p).strip()

    # single dict
    if isinstance(parsed, dict) and "text" in parsed:
        value = parsed["text"]
        return "" if _is_missing(value) else str(value).strip()

    # fallback: plain string or any other scalar
    return str(parsed).strip()




def extract_main_cpc(cell):
    """
    Return the first 'code' value found in a classification cell.

    The cell is expected to look like
    "[{'code': 'G06F8/51', 'inventive': True, ...}, ...]".
    Non-string input is converted with str() before searching, so an actual
    list of dicts works too. Keys and values may be single- or double-quoted
    (Python repr or JSON style).

    Returns:
      The first code, stripped of surrounding whitespace, or None when the
      cell is missing (None / float NaN), contains no 'code' entry, or the
      first code is blank.
    """
    if cell is None or (isinstance(cell, float) and pd.isna(cell)):
        return None

    text = cell if isinstance(cell, str) else str(cell)

    # \1 / \2 force the closing quote to match the opening one, so for
    # single-quoted input this behaves exactly like 'code'\s*:\s*'([^']+)'.
    match = re.search(
        r"""(['"])code\1\s*:\s*(['"])((?:(?!\2).)+)\2""",
        text,
        flags=re.DOTALL,
    )
    if match is None:
        return None

    # A whitespace-only code counts as "no code": return None, not "".
    code = match.group(3).strip()
    return code or None


# -------------------------------
# 2) Build the cleaned columns
# -------------------------------

# Each raw column is flattened with its helper and then empty strings are
# replaced by None, one column at a time.
title_clean = df_patent["title"].apply(extract_text_listdict)
title_clean = title_clean.where(title_clean.ne(""), None)

abstract_clean = df_patent["text"].apply(extract_text_listdict)
abstract_clean = abstract_clean.where(abstract_clean.ne(""), None)

# Missing values stay missing; empty strings become None.
tech_main = df_patent["tech_field"].apply(extract_main_cpc)
tech_main = tech_main.where(tech_main.notna() & tech_main.ne(""), None)

# detailed_text = abstract + " " + title, trimmed; None when both are empty
detailed_text = abstract_clean.fillna("") + " " + title_clean.fillna("")
detailed_text = detailed_text.str.strip()
detailed_text = detailed_text.where(detailed_text.ne(""), None)

# -------------------------------
# 3) Assemble df_patent_corpus
# -------------------------------

# Build the frame in one step; the scalar "patent" is broadcast to every row.
df_patent_corpus = pd.DataFrame(
    {
        "title": title_clean,
        "abstract": abstract_clean,
        "detailed_text": detailed_text,
        # Nullable integer dtype so missing year/month do not force floats.
        "year": df_patent["year"].astype("Int64"),
        "month": df_patent["month"].astype("Int64"),
        "source_type": "patent",
        "tech_field": tech_main,
        "trl": df_patent["trl_true"],
    }
)

df_patent_corpus.head()


Unnamed: 0,title,abstract,detailed_text,year,month,source_type,tech_field,trl
0,Customization creation and update for multi-la...,Embodiments of the present invention provide t...,Embodiments of the present invention provide t...,2015,2,patent,G06F8/51,
1,"Apparatuses, methods and systems for a digital...","The APPARATUSES, METHODS AND SYSTEMS FOR A DIG...","The APPARATUSES, METHODS AND SYSTEMS FOR A DIG...",2015,3,patent,G10L25/27,
2,Avatar construction using depth camera,A method for constructing an avatar of a human...,A method for constructing an avatar of a human...,2015,4,patent,G06V10/426,
3,Real-time predictive systems for intelligent e...,A system for intelligent monitoring and manage...,A system for intelligent monitoring and manage...,2015,5,patent,G05B13/048,
4,Minimally invasive surgical training using rob...,A medical system that allows a mentor to teach...,A medical system that allows a mentor to teach...,2015,5,patent,A61B2019/2292,


In [4]:
# Write the assembled corpus to CSV; index=False leaves the row index out of the file.
out_path = "../data/patent_ai_corpus_2015_2025.csv"
df_patent_corpus.to_csv(out_path, index=False)