Source: https://claude.ai/chat/ea276ca5-a568-457f-a70b-94febb7e4b6c

In [12]:
import pandas as pd
import numpy as np
from anthropic import Anthropic
from dotenv import load_dotenv
import os
import io
import base64

class ClaudeAnalysis:
    """Convenience wrapper around the Anthropic Messages API for data-analysis tasks.

    Every public method sends one single-turn request (a task-specific system
    prompt plus one user message) and returns the text of the first content
    block of the reply.
    """

    def __init__(self, model="claude-3-sonnet-20240229", max_tokens=1000):
        """
        Initialize the ClaudeAnalysis class.

        The API key is read from the ``ANTHROPIC_KEY`` environment variable
        after ``load_dotenv()`` has been called.

        Args:
            model (str): The Claude model to use. Options:
                        - "claude-3-haiku-20240307" (fastest)
                        - "claude-3-sonnet-20240229" (balanced)
                        - "claude-3-opus-20240229" (most capable)
            max_tokens (int): Value sent as ``max_tokens`` with every request.
                        Defaults to 1000, the value previously hard-coded in
                        each method.
        """
        load_dotenv()
        self.client = Anthropic(api_key=os.environ.get("ANTHROPIC_KEY"))
        self.model = model
        self.max_tokens = max_tokens

    def _ask(self, user_content, system=None):
        """Send one user message and return the text of the first reply block.

        Args:
            user_content: A string, or a list of content-block dicts, used as
                the ``content`` of the single user message.
            system (str | None): Optional system prompt. The ``system`` keyword
                is left out of the request entirely when this is None.

        Returns:
            str: ``message.content[0].text`` from the API response.
        """
        request = {
            "model": self.model,
            "max_tokens": self.max_tokens,
            "messages": [{"role": "user", "content": user_content}],
        }
        # Only include a system prompt when the caller supplied one.
        if system is not None:
            request["system"] = system
        message = self.client.messages.create(**request)
        return message.content[0].text

    def generate_sql_query(self, description):
        """Generates SQL queries from natural language descriptions.

        Args:
            description (str): Plain-language description of the desired query.

        Returns:
            str: The model's reply text.
        """
        return self._ask(
            f"Generate a SQL query for: {description}",
            system="You are a SQL expert. Generate SQL queries based on natural language descriptions. Include comments explaining the query.",
        )

    def explain_code(self, code):
        """Explains complex data analysis code.

        Args:
            code (str): Source code to be explained.

        Returns:
            str: The model's reply text.
        """
        return self._ask(
            f"Explain this code:\n{code}",
            system="You are a Python data analysis expert. Explain code in detail, including best practices and potential improvements.",
        )

    def suggest_visualizations(self, df_info):
        """Suggests appropriate visualizations based on data characteristics.

        Args:
            df_info: Description of the dataset; it is interpolated into the
                prompt with an f-string.

        Returns:
            str: The model's reply text.
        """
        return self._ask(
            f"Suggest visualizations for this dataset:\n{df_info}",
            system="You are a data visualization expert. Suggest appropriate charts and plots based on data types and analysis goals.",
        )

    def generate_data_cleaning_code(self, df_head, issues):
        """Generates code for data cleaning based on identified issues.

        Args:
            df_head: Sample of the data; interpolated into the prompt.
            issues: Description of the data-quality problems; interpolated
                into the prompt.

        Returns:
            str: The model's reply text.
        """
        return self._ask(
            f"Generate code to clean this data with these issues:\nData:\n{df_head}\nIssues:\n{issues}",
            system="You are a data cleaning expert. Generate Python code for data cleaning, including error handling and validation.",
        )

    def interpret_statistical_results(self, results):
        """Interprets statistical analysis results in plain language.

        Args:
            results: The statistical output to interpret; interpolated into
                the prompt.

        Returns:
            str: The model's reply text.
        """
        return self._ask(
            f"Interpret these statistical results:\n{results}",
            system="You are a statistics expert. Interpret statistical results in plain language, highlighting key findings and implications.",
        )

    def generate_eda_code(self, df_info):
        """Generates exploratory data analysis code.

        Args:
            df_info: Description of the dataset; interpolated into the prompt.

        Returns:
            str: The model's reply text.
        """
        return self._ask(
            f"Generate EDA code for this dataset:\n{df_info}",
            system="You are a Python data analysis expert. Generate comprehensive Python code for exploratory data analysis.",
        )

    def analyze_visualization(self, plt_figure, specific_questions=None):
        """Analyzes a visualization and provides insights.

        Args:
            plt_figure: Object with a ``savefig(file_obj, format=...)`` method
                (e.g. a matplotlib figure); it is rendered to PNG in memory.
            specific_questions (str | None): Question to ask about the image.
                A generic "main insights" question is used when None.

        Returns:
            str: The model's reply text.
        """
        if specific_questions is None:
            specific_questions = "What are the main insights from this visualization? Include specific numbers and patterns."

        # The context manager releases the buffer even if savefig raises; the
        # buffer is no longer needed once the PNG bytes are base64-encoded.
        with io.BytesIO() as buffer:
            plt_figure.savefig(buffer, format='png')
            image_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')

        # No system prompt is sent for image analysis.
        return self._ask([
            {
                "type": "text",
                "text": specific_questions
            },
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": image_base64
                }
            }
        ])

In [13]:
# Example usage
# Create one analyzer with the class's default model; the example cells below
# call its methods. Construction reads ANTHROPIC_KEY from the environment/.env.
analyzer = ClaudeAnalysis()

In [7]:
# Example 1: turn a plain-language request into a SQL query
sql_description = "Find the average sales by product category for the last quarter"
sql_query = analyzer.generate_sql_query(sql_description)
# A single print with a newline separator emits the header line, then the query.
print("\nGenerated SQL Query:", sql_query, sep="\n")


Generated SQL Query:
-- Find the average sales by product category for the last quarter
SELECT 
    pc.CategoryName,
    AVG(od.Quantity * od.UnitPrice) AS AvgSales
FROM
    OrderDetails od
    JOIN Orders o ON od.OrderID = o.OrderID
    JOIN Products p ON od.ProductID = p.ProductID
    JOIN ProductCategories pc ON p.CategoryID = pc.CategoryID
WHERE
    o.OrderDate >= DATE_SUB(NOW(), INTERVAL 3 MONTH) -- Orders from the last quarter
GROUP BY
    pc.CategoryName;

-- Explanation:
-- 1. The query joins multiple tables (OrderDetails, Orders, Products, and ProductCategories) to access the required data.
-- 2. The WHERE clause filters orders from the last quarter using the DATE_SUB function.
-- 3. The AVG aggregate function calculates the average sales (Quantity * UnitPrice) for each product category.
-- 4. The GROUP BY clause groups the results by product category name.


In [14]:
# Example 2: ask for chart suggestions given only a column/dtype summary
df_info = """
columns:
- sales (float64)
- date (datetime64)
- category (object)
- customer_id (int64)
- satisfaction_score (int64)
"""
viz_suggestions = analyzer.suggest_visualizations(df_info)
# A single print with a newline separator emits the header line, then the reply.
print("\nVisualization Suggestions:", viz_suggestions, sep="\n")


Visualization Suggestions:
Based on the provided dataset, here are some appropriate data visualization suggestions:

1. **Line Chart or Area Chart**: To visualize the trend of sales over time, a line chart or an area chart could be used, with the x-axis representing the date and the y-axis representing the sales.

2. **Bar Chart or Column Chart**: To compare sales across different categories, a bar chart or a column chart could be used, with the x-axis representing the categories and the y-axis representing the sales.

3. **Scatter Plot**: To analyze the relationship between sales and satisfaction_score, a scatter plot could be used, with the x-axis representing the satisfaction_score and the y-axis representing the sales. You could also color-code or use different markers to distinguish different categories.

4. **Box Plot or Violin Plot**: To examine the distribution of sales or satisfaction_score across different categories, a box plot or a violin plot could be used, with the x-axi

In [15]:
# Example 3: request cleaning code for a small text sample plus a list of issues
sample_data = """
   age  income education missing_values
0  25   50000  Bachelor         NaN
1  NaN  60000  Master          Yes
2  35   NaN    PhD             No
"""
issues = "Contains missing values in age and income columns, needs to handle NaN values"
cleaning_code = analyzer.generate_data_cleaning_code(sample_data, issues)
# A single print with a newline separator emits the header line, then the reply.
print("\nData Cleaning Code:", cleaning_code, sep="\n")


Data Cleaning Code:
Here's some Python code to clean the given data with error handling and validation:

```python
import pandas as pd

# Load the data
data = pd.DataFrame({
    'age': [25, None, 35],
    'income': [50000, 60000, None],
    'education': ['Bachelor', 'Master', 'PhD'],
    'missing_values': ['NaN', 'Yes', 'No']
})

# Function to handle missing values
def handle_missing_values(df):
    # Drop rows with missing values in 'age' and 'income' columns
    df.dropna(subset=['age', 'income'], inplace=True)
    
    # Fill NaN values in 'missing_values' column with 'Unknown'
    df['missing_values'].fillna('Unknown', inplace=True)
    
    return df

# Function to validate age and income values
def validate_values(df):
    # Check for valid age range
    age_mask = (df['age'] >= 18) & (df['age'] <= 100)
    if not age_mask.all():
    
    # Check for valid income range
    income_mask = (df['income'] >= 0)
    if not income_mask.all():
    
    return df

# Clean the data
cleane

In [6]:
# Example 4: Process PDFs with Claude
# Documentation: https://docs.anthropic.com/en/docs/build-with-claude/pdf-support
import anthropic
from dotenv import load_dotenv
import os
load_dotenv()

# Ask Claude to summarise a PDF that the API fetches from a public URL.
client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_KEY"))
message = client.messages.create(
    model="claude-3-7-sonnet-20250219",
    max_tokens=1024,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    # The document is referenced by URL, not uploaded as bytes.
                    "type": "document",
                    "source": {
                        "type": "url",
                        "url": "https://assets.anthropic.com/m/1cd9d098ac3e6467/original/Claude-3-Model-Card-October-Addendum.pdf"
                    }
                },
                {
                    "type": "text",
                    "text": "What are the key findings in this document?"
                }
            ]
        }
    ],
)

# Print the reply text rather than the raw list of content blocks, matching
# how the local-PDF example prints its result.
print(message.content[0].text)

RateLimitError: Error code: 429 - {'type': 'error', 'error': {'type': 'rate_limit_error', 'message': 'This request would exceed the rate limit for your organization (c9f8a04e-97ae-403e-92cc-3ca67ff25e6e) of 20,000 input tokens per minute. For details, refer to: https://docs.anthropic.com/en/api/rate-limits. You can see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase.'}}

We get a rate-limit error (input tokens per minute) when we use the PDF from the official documentation (the Claude model card addendum), so we will try a simpler local document instead.

In [10]:
# Example 4: Process PDFs with Claude
# Documentation: https://docs.anthropic.com/en/docs/build-with-claude/pdf-support
import anthropic
from dotenv import load_dotenv
import os
import base64

load_dotenv()

def read_pdf_as_base64(file_path):
    """Read the file at ``file_path`` in binary mode and return it base64-encoded.

    Args:
        file_path: Path of the file to read (passed straight to ``open``).

    Returns:
        str: The file's bytes, base64-encoded and decoded to a UTF-8 string.
    """
    with open(file_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

# Encode the local PDF once; the encoded string is embedded in the request below.
pdf_path = "Statistik Penduduk dan Ketenagakerjaan Kota Palu 2025.pdf"
pdf_base64 = read_pdf_as_base64(pdf_path)

client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_KEY"))
message = client.messages.create(
    model="claude-3-7-sonnet-20250219",
    max_tokens=1024,
    messages=[{
        "role": "user",
        # Two content parts: the PDF as inline base64 data, then the question.
        "content": [
            {
                "type": "document",
                "source": {"type": "base64", "media_type": "application/pdf", "data": pdf_base64},
            },
            {"type": "text", "text": "What are the key findings in this document?"},
        ],
    }],
)

# Show only the text of the first content block of the reply.
print(message.content[0].text)

Based on the document, here are the key findings about population and employment in Palu Municipality:

**Population Data:**
- The total population of Palu City is 389,959 people (from Table 3.1.1)
- Mantikulore District has the highest percentage of population at 21.03%
- The population density is 993.54 people per km² (or about 994 people per km²)
- The sex ratio is 100.44, indicating a slightly higher male population
- The annual population growth rate for Palu City is 0.88% (2020-2024)

**Employment Data:**
- From the 2024 National Labor Force Survey, the working age population (15+ years) is divided into:
  - Labor force: 203,852 people
  - Non-labor force: 85,285 people
- Of the labor force:
  - Employed: 192,383 people
  - Unemployed: 11,469 people
- Employment status breakdown shows most people (100,258) work as employees/staff, followed by self-employed workers (43,273)
- The Labor Force Participation Rate is 70.50%
- Most workers (147,850) work 35+ hours per week

The documen

> The BPS population and employment publication is about 19 pages long and consists mostly of images and tables, so the model needs to extract the data from them. It is probably treated as a normal document read since we are using "type": "document" (I haven't figured this out completely). The most important thing to note is that running this cost me $0.13, or around IDR 2,145.

> I also validated the model's summary output, and it is surprisingly accurate: it is not only able to perform lookups but also does calculations on its own based on the data.

> As a further improvement, if we want to use this, we need to implement several additional components, most importantly with LangChain, so that we can store the document content in a vector database and do RAG.

In [None]:
# Example 5: Web search capabilities
# Documentation: https://docs.anthropic.com/en/docs/build-with-claude/tool-use/web-search-tool#localization
import anthropic
import os
from dotenv import load_dotenv
from pprint import pprint

load_dotenv()

client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_KEY"))

# The question is asked in Indonesian; the web search tool is enabled for it.
response = client.messages.create(
    model="claude-3-7-sonnet-latest",
    max_tokens=1024,
    messages=[
        {"role": "user", "content": "Siapa presiden terpilih Amerika Serikat pada tahun 2024?"},
    ],
    tools=[
        {
            "type": "web_search_20250305",
            "name": "web_search",
            # Optional: limit the number of searches per request
            "max_uses": 3,
        },
    ],
)
# Pretty-print every content block of the reply, not just the text.
pprint(response.content)

[TextBlock(citations=None, text='Saya perlu mencari informasi terkini tentang presiden terpilih Amerika Serikat pada tahun 2024. Biar saya cari informasinya untuk Anda.', type='text'),
 TextBlock(citations=None, text=None, type='server_tool_use', id='srvtoolu_0118PYBfqM7YR7qwsBEp2PeB', name='web_search', input={'query': 'who won US presidential election 2024 president elect'}),
 TextBlock(citations=None, text=None, type='web_search_tool_result', tool_use_id='srvtoolu_0118PYBfqM7YR7qwsBEp2PeB', content=[{'type': 'web_search_result', 'title': 'Presidential election results 2024 | CNN Politics', 'url': 'https://www.cnn.com/election/2024/results/president', 'encrypted_content': 'Eu0JCioIAxgCIiRjOWY4YTA0ZS05N2FlLTQwM2UtOTJjYy0zY2E2N2ZmMjVlNmUSDECEREPnBdHuyLxjUxoMrwzK5hF2q5Dtwp9QIjDxwn8m3KiZ52qrMyqR9oX2mxnMvPXOz+DFBioAtIQA+QC4l3mJN2rr+5G3tkpZBoEq8AjWG2s064/9LCLRiAWn/nyBDWRvIhHjZzqI+CQqNlTuErojWLxnpGDY/sYWecemLG7MXWAiV7n+8BBEEKk1LUrYO5UCBTSIKI/0nubZuPQIaQusP/bw7kVEzqNY/9e/4/JyZTK+fbFCXy6vT8xz

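We can see that the model's response is quite accurate here, with several websites returned in the search results even though max_uses is only set to 3 (max_uses limits the number of searches per request, and a single search can return several results).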