In [1]:
pip install torch transformers sentence-transformers keybert scikit-learn nltk


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import json
import re
import os
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora required by the cleaning step (no-op when already present)
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Lazily-filled cache so the stop-word list and lemmatizer are built once,
# not once per article.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the (stop_words, lemmatizer) pair, creating it on first use."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def clean_and_normalize(text):
    """Turn raw (possibly HTML) article text into a normalized token string.

    Steps, in order: strip HTML tags, replace every character that is not an
    ASCII letter, digit or whitespace with a space, lowercase, drop English
    stop words, and lemmatize the remaining words.

    Args:
        text: Raw article text. ``None`` or an empty string yields ``''``.

    Returns:
        The surviving lemmatized words joined by single spaces.
    """
    if not text:
        # Nothing to clean; also avoids handing None to BeautifulSoup.
        return ''

    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    # Remove special characters and extra whitespace
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    # Lowercase before the stop-word lookup (the NLTK list is lowercase)
    text = text.lower()

    # Remove stopwords and lemmatize
    stop_words, lemmatizer = _get_nlp_resources()
    cleaned_words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(cleaned_words)

def process_articles(json_path):
    """Load articles from a JSON file and return cleaned copies of them.

    For each article the ``content`` and ``pdf_content`` fields are merged,
    passed through ``clean_and_normalize`` and stored as ``cleaned_text``.
    All other keys are copied over unchanged; ``content`` and ``pdf_content``
    themselves are dropped from the result.

    Args:
        json_path: Path to a UTF-8 JSON file containing a list of article
            dicts.

    Returns:
        A new list of dicts, one per input article.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        articles = json.load(f)

    cleaned_articles = []
    for article in articles:
        # `or ''` (rather than a .get default) also covers keys that are
        # present but null in the JSON, which would otherwise arrive as None.
        content = article.get('content') or ''
        pdf_content = article.get('pdf_content') or ''
        # Merge content and pdf_content if available
        merged_text = ' '.join(part for part in (content, pdf_content) if part)
        cleaned_text = clean_and_normalize(merged_text)

        # Preserve all original metadata except content/pdf_content, add cleaned_text
        cleaned_article = {
            k: v for k, v in article.items()
            if k not in ('content', 'pdf_content')
        }
        cleaned_article['cleaned_text'] = cleaned_text
        cleaned_articles.append(cleaned_article)
    return cleaned_articles

# 1. Process and clean articles (KPMG first, then PwC)
cleaned_kpmg, cleaned_pwc = (
    process_articles(source_path)
    for source_path in ('kpmg_articles.json', 'pwc_articles.json')
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Judge\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Judge\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Industry list with keywords
INDUSTRY_KEYWORDS = {
    "Artificial Intelligence": ["ai", "artificial intelligence", "machine learning", "deep learning", "neural network", "computer vision"],
    "Supply Chain": ["supply chain", "logistics", "inventory", "procurement", "distribution network"],
    "Energy and Renewables": ["energy", "renewable", "solar", "wind power", "hydroelectric", "carbon footprint"],
    "Cyber Security": ["cyber security", "ransomware", "phishing", "data breach", "encryption", "firewall"],
    "Economy and Growth": ["gdp", "economic growth", "inflation", "market trends", "fiscal policy", "monetary policy"],
    "ESG": ["esg", "environmental social governance", "sustainability", "carbon neutral", "social responsibility"],
    "Technology": ["technology", "digital transformation", "cloud computing", "iot", "blockchain", "5g"],
    "Risk and Regulation": ["risk management", "compliance", "regulation", "legal framework", "audit"],
    "Workforce": ["workforce", "employee engagement", "talent acquisition", "skills gap", "remote work"],
    "Transformation": ["digital transformation", "business transformation", "change management", "operational excellence"],
    "Global Capability Centres": ["gcc", "global capability centres", "shared services", "offshoring", "captive centers"],
    "Assurance": ["assurance", "quality control", "audit", "compliance", "risk assessment"],
    "Trade and Tariffs": ["trade", "tariff", "import export", "customs", "free trade agreement"],
    "Financial Services": ["finance", "banking", "insurance", "investment", "asset management", "fintech"],
    "Healthcare": ["healthcare", "pharmaceutical", "medical devices", "patient care", "telemedicine"],
    "Manufacturing": ["manufacturing", "industry 4.0", "smart factory", "production line", "lean manufacturing"]
}


def _normalize_for_matching(value):
    """Lowercase `value`, collapse every non-alphanumeric run to one space, pad with spaces.

    The surrounding spaces let a plain substring test act as a whole-word
    (or whole-phrase) match: " ai " is found in " ai strategy " but not in
    " he said ".
    """
    collapsed = re.sub(r'[^a-z0-9]+', ' ', value.lower()).strip()
    return f" {collapsed} "


def classify_industry(text, threshold=2):
    """
    Classify text into an industry based on keyword matches.

    Each keyword of an industry that occurs in the text as a whole word or
    whole phrase adds one point to that industry (repeat occurrences of the
    same keyword do not add more). Text and keywords are normalized the same
    way (lowercase, punctuation treated as spaces), so a keyword such as
    "industry 4.0" also matches text whose punctuation was already removed.

    Args:
        text: Text to classify; ``None`` is treated as empty text.
        threshold: Minimum number of distinct matching keywords required.

    Returns:
        The industry with the highest score (the first one listed in
        INDUSTRY_KEYWORDS on a tie), or "Other" if that score is below
        `threshold`.
    """
    haystack = _normalize_for_matching(text or "")
    scores = {industry: 0 for industry in INDUSTRY_KEYWORDS}

    for industry, keywords in INDUSTRY_KEYWORDS.items():
        for keyword in keywords:
            needle = _normalize_for_matching(keyword)
            # Skip keywords that normalize to nothing; they would match any text.
            if needle.strip() and needle in haystack:
                scores[industry] += 1

    # Get industry with maximum score
    max_industry = max(scores, key=scores.get)

    # Only return if meets minimum threshold, else 'Other'
    return max_industry if scores[max_industry] >= threshold else "Other"

def add_industry_classification(articles):
    """Label every article in place with an 'industry' key and return the same list.

    The label is whatever classify_industry returns for the article's
    'cleaned_text' value.
    """
    for entry in articles:
        label = classify_industry(entry['cleaned_text'])
        entry['industry'] = label
    return articles

# 2. Add industry classification
classified_kpmg = add_industry_classification(cleaned_kpmg)
classified_pwc = add_industry_classification(cleaned_pwc)

# 3. Save results (one JSON file per firm)
for out_path, records in (
    ('classified_kpmg.json', classified_kpmg),
    ('classified_pwc.json', classified_pwc),
):
    with open(out_path, 'w', encoding='utf-8') as out_file:
        json.dump(records, out_file, indent=2)

In [14]:
pip install google-generativeai

Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Collecting google-generativeai
  Downloading google_generativeai-0.8.5-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-core (from google-generativeai)
  Downloading google_api_core-2.24.2-py3-none-any.whl.metadata (3.0 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.167.0-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting google-auth>=2.15.0 (from google-generativeai)
  Downloading google_auth-2.39.0-py2.py3-none-any.whl.metadata (6.2 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-ai-generativelanguage==0.6.15->google-generativeai)
  Downloading proto_plus-1.26.1-py3-none-any.whl.metadata (2.2 kB)
Collecting go

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.16.2 requires keras>=3.0.0, which is not installed.
tensorflow-intel 2.16.2 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 5.29.4 which is incompatible.


In [4]:
import os
import json
import google.generativeai as genai

# The Gemini API key is read from the GEMINI_API_KEY environment variable;
# it must be set before this cell runs.
api_key = os.environ.get("GEMINI_API_KEY")
if api_key:
    genai.configure(api_key=api_key)
else:
    raise ValueError("Please set your GEMINI_API_KEY environment variable.")

# Model handle shared by the extraction functions below
model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-04-17")

def build_prompt(article):
    """
    Builds the prompt for Gemini 2.5 Flash to extract themes and keywords
    following the LLM-TAKE framework.

    Args:
        article: Dict with a required 'cleaned_text' key and an optional
            'title' key (a missing or empty title is rendered as "").

    Returns:
        The prompt string: instructions, a JSON response template with the
        title filled in, and the article text.
    """
    # json.dumps emits the surrounding quotes and escapes quotes/backslashes
    # inside the title, so the JSON template the model is asked to reproduce
    # stays valid. ensure_ascii=False keeps non-ASCII titles readable.
    title_json = json.dumps(str(article.get('title') or ''), ensure_ascii=False)
    return f"""
You are an expert analyst using the LLM-TAKE framework.

Given the following article text, perform the following:

1. Break the text into semantic sections if needed.
2. Extract 3–7 main themes or topics as concise phrases (not single words).
3. List 5–10 significant keywords (single words or short phrases).
4. For each theme, provide a supporting quote or sentence from the article.
5. Avoid generic or irrelevant keywords (e.g., "article", "information", "read").
6. Remove any sensitive or irrelevant terms.
7. Ensure diversity and coverage of the article’s content.

Return the result in this JSON format:
{{
  "title": {title_json},
  "themes": [
    {{"theme": "...", "evidence": "..."}}
  ],
  "keywords": ["...", "...", "..."]
}}

Article text:
{article['cleaned_text']}
"""

def _parse_model_json(raw_text):
    """Decode the JSON object contained in a model reply.

    The whole reply is tried first. If that fails, the span from the first
    "{" to the last "}" is tried, which discards Markdown code fences or
    prose wrapped around the object. Raises json.JSONDecodeError when no
    JSON object can be decoded.
    """
    candidate = raw_text.strip()
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        start, end = candidate.find("{"), candidate.rfind("}")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(candidate[start:end + 1])
    if not isinstance(parsed, dict):
        raise json.JSONDecodeError("Expected a JSON object", candidate, 0)
    return parsed


def extract_themes_keywords(article):
    """
    Calls the Gemini model with the article prompt and parses the JSON reply.

    No retry is attempted. If the reply has no usable text or does not
    contain a JSON object, a warning is printed and a minimal result with
    empty 'themes' and 'keywords' lists is returned. Errors raised by the
    API call itself are not caught and propagate to the caller.

    Args:
        article: Article dict accepted by build_prompt.

    Returns:
        A dict decoded from the model reply, or the minimal fallback dict
        {"title": ..., "themes": [], "keywords": []}.
    """
    prompt = build_prompt(article)
    title = article.get("title", "")
    response = model.generate_content(
        prompt,
        generation_config={"temperature": 0.3, "max_output_tokens": 512}
    )
    try:
        # The google-generativeai `text` accessor raises ValueError when the
        # reply carries no text part (e.g. a blocked or empty response);
        # json.JSONDecodeError is a ValueError subclass, so both land here.
        return _parse_model_json(response.text)
    except ValueError:
        print(f"Warning: JSON parsing failed for article '{title}'. Returning minimal output.")
        return {
            "title": title,
            "themes": [],
            "keywords": []
        }

def main():
    """Extract themes and keywords for every classified article and save them.

    Reads classified_kpmg.json and classified_pwc.json, runs
    extract_themes_keywords on each article (KPMG articles first), and
    writes the list of results to themed_articles.json.
    """
    # Inputs produced by the classification step
    with open("classified_kpmg.json", "r", encoding="utf-8") as kpmg_file:
        kpmg_articles = json.load(kpmg_file)
    with open("classified_pwc.json", "r", encoding="utf-8") as pwc_file:
        pwc_articles = json.load(pwc_file)

    all_articles = kpmg_articles + pwc_articles
    total = len(all_articles)

    print(f"Processing {total} articles for theme and keyword extraction...")

    themed_articles = []
    for position in range(total):
        current = all_articles[position]
        heading = current.get('title', 'No Title')
        print(f"[{position + 1}/{total}] Processing article: {heading}")
        themed_articles.append(extract_themes_keywords(current))

    # Persist the extraction results
    with open("themed_articles.json", "w", encoding="utf-8") as out_file:
        json.dump(themed_articles, out_file, indent=2)

    print("Theme and keyword extraction completed. Results saved to themed_articles.json")


if __name__ == "__main__":
    main()


ValueError: Please set your GEMINI_API_KEY environment variable.

In [None]:
# SECURITY: an API key was previously stored in this cell in plain text.
# Treat that key as compromised and rotate it; never save credentials in a notebook.
# Run this cell BEFORE the Gemini configuration cell. The key is typed at a hidden
# prompt and only lives in this kernel's environment.
import os
from getpass import getpass

if not os.getenv("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass("Gemini API key: ")
