# GPU Patent Data Pipeline v1

This notebook fetches patent data from the lens.org API, filtered by GPU-related CPC codes, GPU-related keywords in the title or abstract, and US jurisdiction.

## Setup
1. Install required packages: `pip install requests pandas pyarrow`
2. Set your lens.org API token in the environment variable `LENS_API_TOKEN`

## Pipeline Steps
1. Configure CPC codes and search parameters
2. Fetch raw data from lens.org API
3. Save raw compressed data
4. Parse and normalize schema
5. Clean text fields
6. Generate embeddings (optional)
7. Log all operations

In [None]:
import os
import json
import gzip
import requests
import pandas as pd
from datetime import datetime
from pathlib import Path

## Configuration

In [None]:
# API Configuration
# The token is read from the LENS_API_TOKEN environment variable; the fallback
# string is only a placeholder value, not a usable credential.
API_TOKEN = os.getenv('LENS_API_TOKEN', 'your-api-token-here')
API_URL = 'https://api.lens.org/patent/search'

# Data paths: one sub-directory per pipeline stage, all rooted at BASE_PATH
BASE_PATH = Path('../data/patents/v1_gpu_kw_cpc')
RAW_PATH = BASE_PATH / 'raw'
PARSED_PATH = BASE_PATH / 'parsed'
TEXT_CLEAN_PATH = BASE_PATH / 'text_clean'
EMBEDDINGS_PATH = BASE_PATH / 'embeddings'
LOGS_PATH = BASE_PATH / 'logs'

# GPU-related CPC codes (examples - adjust based on your research needs)
# G06F = Electric digital data processing
# G06T = Image data processing or generation
# G09G = Control of indicating/display devices
# H01L = Semiconductor devices (no H01L code is currently listed below)
# NOTE(review): the per-code descriptions are paraphrased CPC titles; confirm
# them against the current CPC scheme before relying on them.
GPU_CPC_CODES = [
    'G06F3/14',    # Digital output to display device
    'G06T1/20',    # Processor architectures / configuration, e.g. pipelining
    'G06T1/60',    # Memory management
    'G09G5/36',    # Display of a graphic pattern, e.g. bit-mapped memory
]

# Search parameters
JURISDICTION = 'US'  # United States patents only
# Used as the "size" of the search request built by build_query
MAX_RESULTS = 100    # Adjust based on your needs (lens.org has rate limits)

## 1. Fetch Raw Data from lens.org API

In [None]:
def build_query(cpc_codes, jurisdiction, max_results=None):
    """
    Build lens.org API query for GPU patents.

    A patent matches when ALL of the following hold:
      * it carries at least one of the given CPC codes,
      * at least one GPU-related keyword clause matches its title or abstract,
      * its jurisdiction equals ``jurisdiction``.

    Args:
        cpc_codes: List of CPC classification codes
        jurisdiction: Patent jurisdiction (e.g., 'US')
        max_results: Value for the request's "size" field. When omitted
            (None), the module-level MAX_RESULTS is used.

    Returns:
        Query dictionary for lens.org API
    """
    if max_results is None:
        max_results = MAX_RESULTS

    # Build CPC code filter: any one of the codes is enough
    # NOTE(review): the field names used in this query
    # ("classification_cpc.classification_id", "title", ...) should be
    # verified against the current Lens patent API schema.
    cpc_filter = {
        "bool": {
            "should": [
                {"term": {"classification_cpc.classification_id": code}}
                for code in cpc_codes
            ],
            "minimum_should_match": 1
        }
    }

    # Add GPU-related keywords for better filtering: any one clause is enough
    keyword_filter = {
        "bool": {
            "should": [
                {"match": {"title": "GPU"}},
                {"match": {"title": "graphics processing"}},
                {"match": {"title": "parallel processing"}},
                {"match": {"abstract": "GPU"}},
                {"match": {"abstract": "graphics processing unit"}}
            ],
            "minimum_should_match": 1
        }
    }

    query = {
        "query": {
            "bool": {
                # All three clauses must hold simultaneously
                "must": [
                    cpc_filter,
                    keyword_filter,
                    {"term": {"jurisdiction": jurisdiction}}
                ]
            }
        },
        "size": max_results,
        # Fields requested back from the API
        "include": [
            "lens_id",
            "title",
            "abstract",
            "description",
            "claims",
            "date_published",
            "jurisdiction",
            "applicants",
            "inventors",
            "classification_cpc",
            "biblio"
        ]
    }

    return query

def fetch_patents(api_token, query, timeout=30):
    """
    Fetch patents from lens.org API.

    Sends ``query`` as a JSON POST body to API_URL with bearer-token
    authentication and returns the decoded JSON response.

    Args:
        api_token: lens.org API token
        query: Query dictionary
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        API response data

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    headers = {
        'Authorization': f'Bearer {api_token}',
        'Content-Type': 'application/json'
    }

    response = requests.post(
        API_URL, json=query, headers=headers, timeout=timeout
    )
    # Turn HTTP error statuses into exceptions instead of parsing an error body
    response.raise_for_status()

    return response.json()

# Assemble the search request and echo it so the run is reproducible
query = build_query(GPU_CPC_CODES, JURISDICTION)
print("Query configuration:", json.dumps(query, indent=2), sep="\n")

print("\nFetching patents from lens.org...")
try:
    raw_data = fetch_patents(API_TOKEN, query)
    fetched_count = len(raw_data.get('data', []))
    print(f"✓ Successfully fetched {fetched_count} patents")
    total_available = raw_data.get('total', 0)
    print(f"  Total results available: {total_available}")
except Exception as fetch_error:
    # Later cells test `raw_data` to decide whether to run, so mark the
    # failure explicitly instead of leaving the name undefined.
    raw_data = None
    print(f"✗ Error fetching data: {fetch_error}")

## 2. Save Raw Compressed Data

In [None]:
if raw_data:
    # One timestamp per run; later cells reuse it so that all artifacts of a
    # run share the same suffix.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # The output directory is not guaranteed to exist on a fresh checkout;
    # gzip.open would fail with FileNotFoundError otherwise.
    RAW_PATH.mkdir(parents=True, exist_ok=True)
    raw_file = RAW_PATH / f'patents_{timestamp}.json.gz'

    # Save compressed raw data
    with gzip.open(raw_file, 'wt', encoding='utf-8') as f:
        json.dump(raw_data, f, indent=2)

    print(f"✓ Raw data saved to: {raw_file}")
    print(f"  File size: {raw_file.stat().st_size / 1024:.2f} KB")

## 3. Parse and Normalize Schema

In [None]:
def parse_patents(raw_data):
    """
    Parse raw patent data into normalized schema.

    Each entry of ``raw_data['data']`` becomes one row. List-valued fields
    (applicants, inventors, CPC codes) are stored as JSON strings so the frame
    stays flat. Fields that are missing OR explicitly null in the response are
    treated the same way (empty list / None), so a null ``claims`` or
    ``classification_cpc`` no longer raises TypeError.

    Args:
        raw_data: Raw API response

    Returns:
        DataFrame with normalized patent data. The column set is fixed, so an
        empty response still yields a frame with the full schema.
    """
    columns = [
        'lens_id',
        'title',
        'abstract',
        'description',
        'date_published',
        'jurisdiction',
        'applicants',
        'inventors',
        'cpc_codes',
        'claims_count',
        'first_claim',
    ]

    parsed_records = []
    # `or []` covers both a missing key and an explicit null value
    for patent in raw_data.get('data') or []:
        cpc_entries = patent.get('classification_cpc') or []
        claims = patent.get('claims') or []

        # Only read claim_text when the first claim is actually a mapping
        first_entry = claims[0] if claims else None
        first_claim = (
            first_entry.get('claim_text')
            if isinstance(first_entry, dict)
            else None
        )

        parsed_records.append({
            'lens_id': patent.get('lens_id'),
            'title': patent.get('title'),
            'abstract': patent.get('abstract'),
            'description': patent.get('description'),
            'date_published': patent.get('date_published'),
            'jurisdiction': patent.get('jurisdiction'),
            'applicants': json.dumps(patent.get('applicants') or []),
            'inventors': json.dumps(patent.get('inventors') or []),
            'cpc_codes': json.dumps(
                [c.get('classification_id') for c in cpc_entries]
            ),
            'claims_count': len(claims),
            'first_claim': first_claim,
        })

    return pd.DataFrame(parsed_records, columns=columns)

if raw_data:
    df_parsed = parse_patents(raw_data)
    print(f"✓ Parsed {len(df_parsed)} patent records")
    print("\nSchema:")
    print(df_parsed.dtypes)
    print("\nSample record:")
    print(df_parsed.iloc[0] if len(df_parsed) > 0 else "No records")

    # Save as parquet. The target directory may not exist yet on a fresh
    # checkout, so create it first instead of failing inside to_parquet.
    # `timestamp` comes from the raw-save cell so all files of a run match.
    PARSED_PATH.mkdir(parents=True, exist_ok=True)
    parsed_file = PARSED_PATH / f'patents_{timestamp}.parquet'
    df_parsed.to_parquet(parsed_file, index=False)
    print(f"\n✓ Parsed data saved to: {parsed_file}")

## 4. Clean Text Fields

In [None]:
import re

def clean_text(text):
    """
    Clean text field for embedding.

    Missing values (None, NaN, pd.NA) become an empty string. Everything else
    is converted with ``str()``, stripped of characters outside
    ``a-z A-Z 0-9 whitespace , . ; : ( ) -``, and has runs of whitespace
    collapsed to a single space.

    Args:
        text: Raw text

    Returns:
        Cleaned text
    """
    if text is None:
        return ""
    # pd.isna on a list/array returns an array whose truth value is ambiguous,
    # so only ask it about scalars (this also covers NaN and pd.NA safely).
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ""

    # Convert to string if not already
    text = str(text)

    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^a-zA-Z0-9\s,.;:()-]', '', text)

    # Collapse whitespace AFTER removing characters: deleting a symbol that
    # sat between two spaces would otherwise leave a double space behind.
    text = re.sub(r'\s+', ' ', text)

    # Strip leading/trailing whitespace
    return text.strip()

if raw_data and not df_parsed.empty:
    # Create cleaned version of key text fields. Each source column is
    # cleaned exactly once; the combined text is built from those results
    # rather than re-running clean_text on the raw values.
    df_clean = pd.DataFrame({'lens_id': df_parsed['lens_id']})
    for source_col in ('title', 'abstract', 'description', 'first_claim'):
        df_clean[f'{source_col}_clean'] = df_parsed[source_col].apply(clean_text)

    # Combined text for embedding: title + abstract + first claim. Empty
    # parts are skipped so no leading/double spaces end up in the text.
    df_clean['combined_text'] = df_clean[
        ['title_clean', 'abstract_clean', 'first_claim_clean']
    ].apply(lambda row: ' '.join(part for part in row if part), axis=1)

    print(f"✓ Cleaned {len(df_clean)} patent text records")
    print("\nSample cleaned record:")
    print(df_clean.iloc[0] if len(df_clean) > 0 else "No records")

    # Save cleaned text. Create the target directory first; it may not exist
    # on a fresh checkout.
    TEXT_CLEAN_PATH.mkdir(parents=True, exist_ok=True)
    clean_file = TEXT_CLEAN_PATH / f'patents_clean_{timestamp}.parquet'
    df_clean.to_parquet(clean_file, index=False)
    print(f"\n✓ Cleaned text saved to: {clean_file}")

## 5. Generate Embeddings (Optional)

This section demonstrates how to generate embeddings from the cleaned text.
It requires the `sentence-transformers` library, which provides the embedding model.

```bash
pip install sentence-transformers
```

In [None]:
# Uncomment to generate embeddings
# The disabled code below encodes df_clean['combined_text'] and writes two
# files that share the run timestamp: the embedding matrix (.npy) and the
# lens_id values, one per line, in the same row order as the matrix.
# from sentence_transformers import SentenceTransformer
# import numpy as np

# if raw_data and not df_clean.empty:
#     print("Loading embedding model...")
#     model = SentenceTransformer('all-MiniLM-L6-v2')
#     
#     print("Generating embeddings...")
#     embeddings = model.encode(df_clean['combined_text'].tolist(), show_progress_bar=True)
#     
#     # Save embeddings
#     # NOTE(review): EMBEDDINGS_PATH is not created anywhere in this notebook;
#     # when enabling this cell, create it first, e.g.
#     # EMBEDDINGS_PATH.mkdir(parents=True, exist_ok=True)
#     emb_file = EMBEDDINGS_PATH / f'embeddings_{timestamp}.npy'
#     np.save(emb_file, embeddings)
#     
#     # Save IDs separately
#     ids_file = EMBEDDINGS_PATH / f'ids_{timestamp}.txt'
#     with open(ids_file, 'w') as f:
#         f.write('\n'.join(df_clean['lens_id'].tolist()))
#     
#     print(f"✓ Embeddings saved to: {emb_file}")
#     print(f"  Shape: {embeddings.shape}")
#     print(f"✓ IDs saved to: {ids_file}")

print("Embedding generation is optional. Uncomment the code above to enable.")

## 6. Log Pipeline Execution

In [None]:
if raw_data:
    # The cleaning cell only runs (and only defines df_clean / clean_file)
    # when at least one record was parsed, so guard those names with the
    # same condition.
    has_clean_output = not df_parsed.empty

    # Create execution log
    log = {
        'timestamp': timestamp,
        'execution_time': datetime.now().isoformat(),
        'query_spec': query,
        'results': {
            'total_available': raw_data.get('total', 0),
            'fetched': len(raw_data.get('data', [])),
            'parsed': len(df_parsed),
            'cleaned': len(df_clean) if has_clean_output else 0
        },
        'files_created': {
            'raw': raw_file.name,
            'parsed': parsed_file.name,
            'cleaned': clean_file.name if has_clean_output else None
        },
        'cpc_codes': GPU_CPC_CODES,
        'jurisdiction': JURISDICTION
    }

    # Save log. Create the directory first; it may not exist on a fresh
    # checkout and open() would otherwise raise FileNotFoundError.
    LOGS_PATH.mkdir(parents=True, exist_ok=True)
    log_file = LOGS_PATH / f'run_{timestamp}.json'
    with open(log_file, 'w', encoding='utf-8') as f:
        json.dump(log, f, indent=2)

    print(f"✓ Execution log saved to: {log_file}")
    print("\nPipeline Summary:")
    print(f"  Total patents available: {log['results']['total_available']}")
    print(f"  Fetched: {log['results']['fetched']}")
    print(f"  Parsed: {log['results']['parsed']}")
    print(f"  Cleaned: {log['results']['cleaned']}")
else:
    print("⚠ No data fetched. Check your API token and connection.")

## Next Steps

1. **Analyze parsed data**: Load the parquet files and explore patent characteristics
2. **Generate embeddings**: Uncomment section 5 to create vector embeddings
3. **Similarity search**: Use embeddings to find similar patents
4. **Time series analysis**: Analyze patent trends over time
5. **Network analysis**: Explore relationships between applicants, inventors, and technologies