In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory.
# Running this cell (Run button or Shift+Enter) lists every file under the input directory.

import os

# Walk the input tree and print the full path of each file that is found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/kpmgpwc/new.json
/kaggle/input/kpmgpwc/classified_kpmg.json
/kaggle/input/kpmgpwc/classified_pwc.json


In [2]:
pip install google-genai

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import json
import time
import re
import google.generativeai as genai
from kaggle_secrets import UserSecretsClient
try:
    from google.api_core.exceptions import ResourceExhausted
except ImportError:
    ResourceExhausted = None 
# --- Configuration: API key and file paths ---

# 1. Get API Key from Kaggle Secrets
# gemini_api_key doubles as the "ready" flag: it is reset to None on any failure,
# and the main-execution section checks it before doing any work.
try:
    user_secrets = UserSecretsClient()
    gemini_api_key = user_secrets.get_secret("GEMINI_API_KEY")
    genai.configure(api_key=gemini_api_key)
    print("Gemini API Key loaded successfully.")
except Exception as e:
    print(f"Error loading Gemini API Key from Kaggle Secrets: {e}")
    print("Please ensure you have added your key as a secret named 'GEMINI_API_KEY' and attached it to this notebook.")
    gemini_api_key = None

# 2. Define Input File Paths
# Both classified article files are read from the attached dataset directory;
# the combined analysis is written to the working directory.
input_dir = "/kaggle/input/kpmgpwc" 
pwc_file_path = os.path.join(input_dir, "classified_pwc.json")
kpmg_file_path = os.path.join(input_dir, "classified_kpmg.json")
output_file_path = "/kaggle/working/analysis_results_debug.json" 


# 3. Define the Prompt
# Instruction text sent to the model. analyze_article() appends the article body
# directly after the final "Here is the article text:" line, so the trailing
# blank line inside the literal is what separates instructions from article.
prompt = """
Analyze the following article text and extract the following information:

1.  **The main theme of the article (1-2 sentences).** Capture the overarching message or argument.
2.  **The primary topic (a short phrase or sentence).** Identify the core subject matter.
3.  **5-10 significant keywords or key phrases that best represent the article’s content.** These should be terms that quickly convey the main concepts.
4.  **For each keyword or phrase, provide a similarity score (a float between 0.0 and 1.0) indicating how closely it matches the article’s central content.** A score of 1.0 means a perfect match, while 0.0 means no relevance.

Return the output *only* in the following JSON format. Do not include any text before or after the JSON structure:

json
{
  "theme": "string",
  "topic": "string",
  "keywords": [
    {"keyword": "string", "similarity_score": 0.0},
    {"keyword": "string", "similarity_score": 0.0},
    ...
  ]
}

Here is the article text:

"""

# 4. Configure the Gemini Model
# Sampling options for every request; the reply MIME type asks for a JSON body.
generation_config = dict(
    temperature=0.2,
    top_p=1,
    top_k=1,
    max_output_tokens=2048,
    response_mime_type="application/json",
)

# One entry per harm category, all with the same blocking threshold.
safety_settings = [
    {"category": category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
    for category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

# Shared model handle used by analyze_article().
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash-latest",
    generation_config=generation_config,
    safety_settings=safety_settings,
)

# --- Helper Functions  ---

def load_json_data(file_path):
    """Read a UTF-8 JSON file and return the decoded object.

    Returns None, after printing a DEBUG line, when the file is missing, does
    not contain valid JSON, or any other error occurs while reading it.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
        print(f"Successfully loaded data from {file_path}")
        return parsed
    except FileNotFoundError:
        failure = f"DEBUG: File not found at {file_path}"
    except json.JSONDecodeError:
        failure = f"DEBUG: Could not decode JSON from {file_path}"
    except Exception as e:
        failure = f"DEBUG: An unexpected error occurred loading {file_path}: {e}"

    # Only reached on failure: report it and signal "no data" to the caller.
    print(failure)
    return None

# --- UPDATED Analysis Function with Rate Limit Handling & Debug Prints ---

def analyze_article(article_text, article_index, total_articles):
    """
    Ask Gemini to analyse a single article and return the decoded JSON reply.

    Uses the module-level ``prompt`` (prepended to the article text), ``model``
    and ``ResourceExhausted`` names.

    Args:
        article_text: Article body. Anything that is not a string with at least
            50 characters after stripping is skipped without calling the API.
        article_index: Zero-based position of the article (printed one-based).
        total_articles: Total article count, used only in progress output.

    Returns:
        The object decoded from the model's JSON reply, or None when the article
        is skipped, the reply is blocked, empty or not valid JSON, a
        non-rate-limit error occurs, or the rate-limit retries run out.

    Only rate-limit errors are retried (4 attempts in total). The wait is the
    retry_delay read from the error message plus 3 seconds, or, when none can be
    read, 10 seconds doubled on each successive attempt.
    """
    text_is_usable = isinstance(article_text, str) and len(article_text.strip()) >= 50
    if not text_is_usable:
        print(f"DEBUG: Skipping analysis for article {article_index+1} due to empty or short article text.")
        return None

    full_prompt = prompt + article_text
    max_retries = 4
    base_wait_time = 10
    attempt = 0
    response = None  # stays None until a call to the model returns

    for attempt in range(1, max_retries + 1):
        print(f"    Attempt {attempt}/{max_retries} for article {article_index+1}/{total_articles}...")
        try:
            response = model.generate_content(full_prompt)

            # A reply without candidates was blocked; it is not retried.
            if not response.candidates:
                print(f"    DEBUG: Response blocked (no candidates) for article {article_index+1} on attempt {attempt}. Likely safety/policy.")
                if response.prompt_feedback:
                    print(f"    DEBUG: Prompt Feedback: {response.prompt_feedback}")
                else:
                    print("    DEBUG: No prompt feedback available for blocked response.")
                return None

            # An empty or near-empty body is treated as a failure, not retried.
            reply_text = response.text
            if not reply_text or len(reply_text.strip()) < 10:
                print(f"    DEBUG: Empty or very short response text received for article {article_index+1} on attempt {attempt}.")
                return None

            analysis_json = json.loads(reply_text)
            print(f"    DEBUG: Successfully parsed JSON for article {article_index+1} on attempt {attempt}.")
            return analysis_json

        except json.JSONDecodeError as json_e:
            # Malformed JSON is reported together with the raw reply; no retry.
            print(f"    DEBUG: Failed to parse JSON for article {article_index+1} on attempt {attempt}. Error: {json_e}")
            print("    ----- Gemini Response Text -----")
            try:
                print(response.text if response else "No response object available")
            except Exception as e_print:
                print(f"    (Could not print response text: {e_print})")
            print("    -------------------------------")
            return None

        except Exception as e:
            error_message = str(e)
            print(f"    DEBUG: API call error for article {article_index+1} on attempt {attempt}: {error_message}")

            quota_in_message = "429" in error_message and "exceeded your current quota" in error_message.lower()
            resource_exhausted = ResourceExhausted and isinstance(e, ResourceExhausted)

            if not (quota_in_message or resource_exhausted):
                # Anything other than a rate limit ends the attempt loop.
                print(f"    -> Non-rate-limit error encountered. Not retrying article {article_index+1}.")
                try:
                    if response and response.prompt_feedback:
                        print(f"    DEBUG: Prompt Feedback on other error: {response.prompt_feedback}")
                except AttributeError:
                    print("   DEBUG: No prompt feedback attribute on response object.")
                except Exception as feedback_e:
                    print(f"   DEBUG: Error accessing prompt feedback: {feedback_e}")
                return None

            print("    -> Rate limit exceeded.")
            match = re.search(r"retry_delay {\s*seconds: (\d+)\s*}", error_message)
            if match is None:
                # No hint in the error text: 10s, 20s, 40s, ...
                wait_time = base_wait_time * (2 ** (attempt - 1))
                print(f"    -> No specific retry_delay found. Using backoff.")
            else:
                delay_seconds = int(match.group(1))
                wait_time = delay_seconds + 3  # small buffer on top of the suggested delay
                print(f"    -> API suggests waiting {delay_seconds}s.")

            if attempt >= max_retries:
                print(f"    -> Max retries ({max_retries}) reached after rate limit. Giving up on article {article_index+1}.")
                return None

            print(f"    -> Waiting for {wait_time}s before retry {attempt+1}...")
            time.sleep(wait_time)

    # Fallback for the case where the loop ends without returning.
    print(f"    DEBUG: Reached end of analyze_article for article {article_index+1} after {attempt} attempts without success.")
    return None


# --- Main Execution ---

# Run the per-article analysis only when the API key was loaded; otherwise
# report that nothing can be done.
if gemini_api_key: 
    print("DEBUG: Starting data loading...")
    pwc_data = load_json_data(pwc_file_path)
    kpmg_data = load_json_data(kpmg_file_path)

    # Merge both sources into one list. A source that failed to load, is empty,
    # or is not a JSON array is reported and left out.
    all_articles = []
    if pwc_data and isinstance(pwc_data, list): 
        all_articles.extend(pwc_data)
        print(f"DEBUG: Extended with {len(pwc_data)} PWC articles.")
    else:
        print(f"DEBUG: PWC data not loaded or not a list.")

    if kpmg_data and isinstance(kpmg_data, list): 
        all_articles.extend(kpmg_data)
        print(f"DEBUG: Extended with {len(kpmg_data)} KPMG articles.")
    else:
        print(f"DEBUG: KPMG data not loaded or not a list.")


    if not all_articles:
        print("DEBUG: No article data loaded or combined list is empty. Exiting.")
    else:
        print(f"DEBUG: Loaded a total of {len(all_articles)} articles for analysis.")
        results = []
        total_articles = len(all_articles)

        for i, article in enumerate(all_articles):
            # Work on a copy so the 'analysis' key is added without touching the loaded record.
            # NOTE(review): assumes every article is a dict (.copy()/.get are called on it) - confirm against the input files.
            article_copy = article.copy()
            title = article_copy.get('title', f'Article {i+1} (No Title)')
            print(f"\nProcessing article {i+1}/{total_articles}: '{title}'")
            text_to_analyze = article_copy.get('cleaned_text')

            # Every article ends up in results; 'analysis' is None when the text is
            # missing or when analyze_article() returned nothing usable.
            if text_to_analyze:
                analysis_result = analyze_article(text_to_analyze, i, total_articles) 
                if analysis_result:
                    article_copy['analysis'] = analysis_result 
                    print(f"-> Analysis successful for article {i+1}.")
                else:
                    article_copy['analysis'] = None 
                    print(f"-> Analysis failed/skipped for article {i+1}.")
            else:
                print(f"-> Skipping analysis for article {i+1}: No 'cleaned_text' found.")
                article_copy['analysis'] = None

            results.append(article_copy) # Append the processed copy
            print(f"DEBUG: Appended article {i+1}. Current results count: {len(results)}")


        # --- DEBUG: Check results list before saving ---
        print(f"\nDEBUG: Finished processing loop. Total items in results list: {len(results)}")
        if results:
             print(f"DEBUG: First item in results (showing keys): {list(results[0].keys())}")
             if 'analysis' in results[0]:
                  print(f"DEBUG: Analysis field in first item: {results[0]['analysis'] is not None}")
        else:
             print("DEBUG: Results list is empty before saving!")
        # --- END DEBUG ---

        # Save results to a new JSON file (written once, after the whole loop has finished)
        print(f"\nAttempting to save analysis results to {output_file_path}...")
        try:
            with open(output_file_path, 'w', encoding='utf-8') as f:
                # ensure_ascii=False keeps non-ASCII characters readable in the output file
                json.dump(results, f, indent=4, ensure_ascii=False)
            print("Results saved successfully.")
            # --- DEBUG: Verify file exists and has size ---
            if os.path.exists(output_file_path):
                file_size = os.path.getsize(output_file_path)
                print(f"DEBUG: Output file '{output_file_path}' created with size: {file_size} bytes.")
            else:
                print(f"DEBUG: Output file '{output_file_path}' was NOT created.")
            # --- END DEBUG ---
        except Exception as e:
            print(f"DEBUG: Error saving results to file: {e}")

        print("\nProcessing complete.")
else:
    print("DEBUG: Gemini API Key not configured. Cannot proceed.")

Gemini API Key loaded successfully.
DEBUG: Starting data loading...
Successfully loaded data from /kaggle/input/kpmgpwc/classified_pwc.json
Successfully loaded data from /kaggle/input/kpmgpwc/classified_kpmg.json
DEBUG: Extended with 5 PWC articles.
DEBUG: Extended with 34 KPMG articles.
DEBUG: Loaded a total of 39 articles for analysis.

Processing article 1/39: 'Sustainable packaging in the FMCG and retail sector'
    Attempt 1/4 for article 1/39...
    DEBUG: Successfully parsed JSON for article 1 on attempt 1.
-> Analysis successful for article 1.
DEBUG: Appended article 1. Current results count: 1

Processing article 2/39: 'Electronics Component Manufacturing Scheme'
    Attempt 1/4 for article 2/39...
    DEBUG: Successfully parsed JSON for article 2 on attempt 1.
-> Analysis successful for article 2.
DEBUG: Appended article 2. Current results count: 2

Processing article 3/39: 'Agentic AI in the human capital management (HCM) industry'
    Attempt 1/4 for article 3/39...
    DEB

In [None]:
import os
import json
import time
import re
import google.generativeai as genai
from kaggle_secrets import UserSecretsClient
try:
    from google.api_core.exceptions import ResourceExhausted
except ImportError:
    ResourceExhausted = None

# --- Add clustering imports ---
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm



# --- Configuration ---
# 1. Get API Key

# gemini_api_key doubles as the "ready" flag for the clustering phase: it is
# reset to None when the secret cannot be loaded or the client cannot be configured.
try:
    user_secrets = UserSecretsClient()
    gemini_api_key = user_secrets.get_secret("GEMINI_API_KEY")
    genai.configure(api_key=gemini_api_key)
    print("Gemini API Key loaded successfully.")
except Exception as e:
    print(f"Error loading Gemini API Key from Kaggle Secrets: {e}")
    gemini_api_key = None

# 2. Define Paths 
# Input is the per-article analysis JSON in the working directory; output is the
# cluster -> articles mapping written by this cell.
input_analysis_file = "/kaggle/working/analysis_results_debug.json" 
output_clustered_file = "/kaggle/working/clustered_analysis_results.json" 


# --- Helper Functions ---

def load_json_data(file_path):
    """Read a UTF-8 JSON file and return the decoded object.

    Returns None, after printing an explanatory message, when the file is
    missing, does not contain valid JSON, or any other error occurs.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
        print(f"Successfully loaded data from {file_path}")
        return parsed
    except FileNotFoundError:
        failure = f"Error: File not found at {file_path}"
    except json.JSONDecodeError:
        failure = f"Error: Could not decode JSON from {file_path}"
    except Exception as e:
        failure = f"An unexpected error occurred loading {file_path}: {e}"

    # Only reached on failure: report it and signal "no data" to the caller.
    print(failure)
    return None


def generate_embeddings(texts, model_name="models/embedding-001",
                        task_type="SEMANTIC_SIMILARITY", delay_seconds=0.1):
    """Embed each text with the Gemini embedding endpoint, one request per text.

    Args:
        texts: Sequence of texts to embed (its length is printed up front).
        model_name: Embedding model passed to ``genai.embed_content``.
        task_type: Task type forwarded to the API, e.g. "SEMANTIC_SIMILARITY"
            (the default, as before) or "CLUSTERING".
        delay_seconds: Pause after each successful request to help with rate
            limits on the embedding endpoint; values <= 0 disable the pause.

    Returns:
        A list aligned with ``texts``: the embedding for each text, or None
        where the request failed (a warning is printed for each failure).
    """
    print(f"Generating embeddings for {len(texts)} texts using {model_name}...")
    embeddings = []
    # tqdm shows progress while the requests are issued one by one
    for text in tqdm(texts, desc="Generating Embeddings"):
        try:
            result = genai.embed_content(model=model_name, content=text, task_type=task_type)
            embeddings.append(result['embedding'])
            if delay_seconds > 0:
                time.sleep(delay_seconds)
        except Exception as e:
            # str() keeps the warning itself from raising when an entry is not a
            # string, so a bad entry yields None instead of aborting the run.
            preview = str(text)[:50]
            print(f"Warning: Could not generate embedding for text chunk: '{preview}...'. Error: {e}. Appending None.")
            embeddings.append(None)
    print("Embedding generation complete.")
    return embeddings

def find_optimal_k_silhouette(embeddings_array, max_k=15, min_k=2):
    """Pick the number of KMeans clusters with the highest silhouette score.

    Args:
        embeddings_array: Array of samples, one row per sample (``shape[0]`` is
            used as the sample count and the array is passed to KMeans).
        max_k: Largest k to try; lowered to ``n_samples - 1`` when the number of
            samples does not exceed it.
        min_k: Smallest k to try; must be at least 2.

    Returns:
        The k in ``[min_k, max_k]`` with the highest score (the smallest such k
        on ties), 1 when the inputs do not allow clustering, or 3 as a fallback
        when no score could be computed for any k.
    """
    if embeddings_array.shape[0] < min_k:
        print(f"Not enough samples ({embeddings_array.shape[0]}) for clustering (min_k={min_k}). Returning k=1.")
        return 1
    if embeddings_array.shape[0] <= max_k:
        # Keep k strictly below the number of samples.
        max_k = embeddings_array.shape[0] - 1
        print(f"Adjusted max_k to {max_k} due to low number of samples.")
    if min_k > max_k:
        print(f"min_k ({min_k}) cannot be greater than adjusted max_k ({max_k}). Returning k=1.")
        return 1
    if min_k < 2:
        print("min_k must be at least 2 for silhouette score. Returning k=1.")
        return 1

    print(f"Calculating Silhouette Scores for k={min_k} to {max_k}...")
    silhouette_scores = []
    k_range = range(min_k, max_k + 1)

    for k in tqdm(k_range, desc="Finding Optimal K"):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(embeddings_array)
        # Score only when more than one cluster was formed and there are more samples than clusters
        if len(set(cluster_labels)) > 1 and len(cluster_labels) > k:
            score = silhouette_score(embeddings_array, cluster_labels)
            silhouette_scores.append(score)
            print(f"  k={k}, Silhouette Score: {score:.4f}")
        else:
            # -1 keeps the list aligned with k_range while never winning the argmax
            print(f"  k={k}, Cannot calculate Silhouette Score (likely not enough distinct clusters formed). Appending -1.")
            silhouette_scores.append(-1)

    if not silhouette_scores or max(silhouette_scores) <= -1:
        print("Warning: Could not determine optimal k using Silhouette Score. Defaulting to k=3.")
        return 3

    # silhouette_scores[i] belongs to k_range[i]
    optimal_k = k_range[np.argmax(silhouette_scores)]
    print(f"\nOptimal k based on Silhouette Score: {optimal_k}")
    return optimal_k

# --- Main Execution: Clustering Phase ---

# Clustering phase: embed each article's extracted theme, choose k by silhouette
# score, run KMeans, and write a cluster -> articles mapping to JSON.
# Runs only when the API key was loaded (embeddings need the Gemini API).
if gemini_api_key: 
    print("--- Starting Clustering Phase ---")
    # 1. Load the previously analyzed data
    analyzed_articles = load_json_data(input_analysis_file)

    if not analyzed_articles:
        print("Could not load analyzed articles. Exiting clustering phase.")
    else:
        # 2. Prepare data: Extract themes and keep track of original articles
        # themes_to_cluster and article_references are appended to together, so
        # position j in one corresponds to position j in the other.
        themes_to_cluster = []
        article_references = [] # Store index or title/url to link back
        valid_indices = [] # Original indices of articles with valid themes (not read again in this cell)

        print("Extracting valid themes for clustering...")
        for i, article in enumerate(analyzed_articles):
            analysis = article.get('analysis')
            if analysis and isinstance(analysis, dict) and analysis.get('theme'):
                # NOTE(review): assumes 'theme' is a string (.strip() is called on it) - confirm against the analysis output.
                theme_text = analysis['theme'].strip()
                if len(theme_text) > 10: # Themes of 10 characters or fewer are skipped
                    themes_to_cluster.append(theme_text)
                    # Store the original index and title/url for reference
                    article_references.append({
                        "original_index": i,
                        "title": article.get('title', f"Article_{i}"),
                        "url": article.get('url', 'N/A')
                    })
                    valid_indices.append(i)
                else:
                    print(f"  Skipping article {i+1} (Title: {article.get('title', 'N/A')}) due to short/empty theme.")
            else:
                 print(f"  Skipping article {i+1} (Title: {article.get('title', 'N/A')}) due to missing or invalid analysis/theme.")

        if not themes_to_cluster:
            print("No valid themes found to cluster. Exiting.")
        else:
            print(f"Extracted {len(themes_to_cluster)} valid themes.")

            # 3. Generate Embeddings for the themes
            embeddings = generate_embeddings(themes_to_cluster)

            # Filter out any None embeddings that might have resulted from API errors
            valid_embeddings = [emb for emb in embeddings if emb is not None]
            # Apply the same filter to themes and references so all three lists stay aligned
            valid_themes = [themes_to_cluster[i] for i, emb in enumerate(embeddings) if emb is not None]
            valid_references = [article_references[i] for i, emb in enumerate(embeddings) if emb is not None]

            if not valid_embeddings:
                print("Error: Embedding generation failed for all themes. Cannot proceed with clustering.")
            else:
                print(f"Successfully generated {len(valid_embeddings)} embeddings.")
                embeddings_array = np.array(valid_embeddings) # Convert to NumPy array for scikit-learn

                # 4. Find the optimal number of clusters (k)
                num_samples = embeddings_array.shape[0]
                potential_max_k = min(15, num_samples -1 if num_samples > 1 else 1) # Cap k at 15 and at samples - 1 (1 for a single sample)
                optimal_k = find_optimal_k_silhouette(embeddings_array, max_k=potential_max_k)

                if optimal_k < 2:
                    # find_optimal_k_silhouette returns 1 when clustering is not possible;
                    # in that case all themes are saved under a single "unclustered_themes" key.
                    print("Optimal k is less than 2, clustering is not meaningful. Saving unclustered results.")
                    clustered_results = {"unclustered_themes": []}
                    for i in range(len(valid_themes)):
                        clustered_results["unclustered_themes"].append({
                            "title": valid_references[i]["title"],
                            "url": valid_references[i]["url"],
                            "theme": valid_themes[i],
                        })
                else:
                    # 5. Perform KMeans clustering
                    # Same random_state/n_init as used while searching for k.
                    print(f"\nPerforming KMeans clustering with k={optimal_k}...")
                    kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
                    cluster_labels = kmeans.fit_predict(embeddings_array)
                    print("Clustering complete.")

                    # 6. Organize results by cluster
                    print("Organizing results by cluster...")
                    clustered_results = {}
                    # Pre-create every "cluster_<n>" key so each label has a list to append to
                    for i in range(optimal_k):
                        clustered_results[f"cluster_{i}"] = []

                    # cluster_labels[i] is the cluster assigned to valid_themes[i]
                    for i, label in enumerate(cluster_labels):
                        cluster_name = f"cluster_{label}"
                        clustered_results[cluster_name].append({
                            "title": valid_references[i]["title"],
                            "url": valid_references[i]["url"],
                            "theme": valid_themes[i],
                            
                        })
                    print("Results organized.")


                # 7. Save clustered results
                print(f"\nSaving clustered results to {output_clustered_file}...")
                try:
                    with open(output_clustered_file, 'w', encoding='utf-8') as f:
                        # ensure_ascii=False keeps non-ASCII characters readable in the output file
                        json.dump(clustered_results, f, indent=4, ensure_ascii=False)
                    print("Clustered results saved successfully.")
                    # --- DEBUG: Verify file exists and has size ---
                    if os.path.exists(output_clustered_file):
                        file_size = os.path.getsize(output_clustered_file)
                        print(f"DEBUG: Output file '{output_clustered_file}' created with size: {file_size} bytes.")
                    else:
                        print(f"DEBUG: Output file '{output_clustered_file}' was NOT created.")
                    # --- END DEBUG ---
                except Exception as e:
                    print(f"Error saving clustered results to file: {e}")

else:
    print("Gemini API Key not configured. Cannot proceed with embedding/clustering.")

# Printed on every path, including when the phase was skipped.
print("\nClustering script finished.")

Gemini API Key loaded successfully.
--- Starting Clustering Phase ---
Successfully loaded data from /kaggle/working/analysis_results_debug.json
Extracting valid themes for clustering...
Extracted 39 valid themes.
Generating embeddings for 39 texts using models/embedding-001...


Generating Embeddings:   0%|          | 0/39 [00:00<?, ?it/s]

Embedding generation complete.
Successfully generated 39 embeddings.
Calculating Silhouette Scores for k=2 to 15...


Finding Optimal K:   0%|          | 0/14 [00:00<?, ?it/s]

  k=2, Silhouette Score: 0.0671
  k=3, Silhouette Score: 0.0649
  k=4, Silhouette Score: 0.0691
  k=5, Silhouette Score: 0.0399
  k=6, Silhouette Score: 0.0616
  k=7, Silhouette Score: 0.0439
  k=8, Silhouette Score: 0.0692
  k=9, Silhouette Score: 0.0504
  k=10, Silhouette Score: 0.0542
  k=11, Silhouette Score: 0.0504
  k=12, Silhouette Score: 0.0589
  k=13, Silhouette Score: 0.0569
  k=14, Silhouette Score: 0.0571
  k=15, Silhouette Score: 0.0589

Optimal k based on Silhouette Score: 8

Performing KMeans clustering with k=8...
Clustering complete.
Organizing results by cluster...
Results organized.

Saving clustered results to /kaggle/working/clustered_analysis_results.json...
Clustered results saved successfully.
DEBUG: Output file '/kaggle/working/clustered_analysis_results.json' created with size: 19415 bytes.

Clustering script finished.
