In [None]:
import pandas as pd
import numpy as np
import os
import json
import re
import sys
from typing import Any
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from typing import Any


repo_root = os.path.dirname(os.path.abspath(os.getcwd()))
print(repo_root)
if repo_root not in sys.path:
    sys.path.append(repo_root)

from scripts.news_llm import Factory
from vertexai.generative_models import GenerativeModel

In [None]:
def parse_llm_response(response):
    try:
        result = response.text
        # Clean up the response if needed (remove markdown code blocks if present)
        json_match = re.search(r"```json\s*(\{.*?\})\s*```", result, re.DOTALL)
        if json_match:
            json_str = json_match.group(1)
            return json.loads(json_str)
        if result.startswith("```json"):
            result = result.strip("```json").strip("```").strip()  # noqa: B005
            return json.loads(result)
        raise ValueError("No valid JSON block found in the response.")
    except json.JSONDecodeError as e:
        raise ValueError(
            f"Failed to parse model response: {e!s}\nResponse: {response.text}"
        )

In [None]:
def get_response(
    data: str,
    keys: list[str] = ["sectors", "stocks"],
    model: str = "gemini-1.5-pro-002",
    config: dict = {"temperature": 0.01, "top_k": 1, "max_output_tokens": 1000},
):
    # Initialize processor and create the prompt
    factory = Factory()

    # Assemble the prompt
    prompt = factory.assemble_prompt(keys)

    # Get image parts. do we need to encode text to base64?
    parts = [data]

    # Initiate model
    model = GenerativeModel(model)

    # Invoke model
    response = model.generate_content(
        contents=[prompt, *parts], generation_config=config
    )

    print(response.text)

    # parse json response
    json_result = parse_llm_response(response)

    return json_result

In [None]:
def process_article(row: pd.Series, keys: list[str], model: str, config: dict) -> dict[str, Any]:
    content = row["Content"]
    date = row["date"]
    
    # Call the LLM function to extract data
    try:
        result = get_response(data=content, keys=keys, model=model, config=config)
        return {
            "date": date,
            "sectors": result.get("sectors", []),
            "stocks": result.get("stocks", [])
        }
    except Exception as e:
        print(f"Error processing article on {date}: {e}")
        return {
            "date": date,
            "sectors": [],
            "stocks": []
        }

In [5]:
def process_dataset(df: pd.DataFrame) -> pd.DataFrame:
    keys = ["sectors", "stocks"]
    model = "gemini-1.5-pro-002"
    config = {"temperature": 0.01, "top_k": 1, "max_output_tokens": 1000}
    
    # Process each row and collect results
    results = df.apply(lambda row: process_article(row, keys, model, config), axis=1)
    
    # Flatten results into a DataFrame
    processed_data = pd.DataFrame(results.tolist())
    
    # Expand sectors and stocks into separate rows for time series analysis
    expanded_data = processed_data.explode("sectors").explode("stocks")
    return expanded_data


In [None]:
def process_article_parallel(row: pd.Series, keys: list[str], model: str, config: dict) -> dict[str, Any]:
    content = row["Content"]
    date = row["date"]
    
    try:
        result = get_json(data=content, keys=keys, model=model, config=config)
        return {
            "date": date,
            "sectors": result.get("sectors", []),
            "stocks": result.get("stocks", [])
        }
    except Exception as e:
        print(f"Error processing article on {date}: {e}")
        return {
            "date": date,
            "sectors": [],
            "stocks": []
        }

In [None]:
def process_dataset_parallel(df: pd.DataFrame) -> list[dict[str, Any]]:
    keys = ["sectors", "stocks"]
    model = "gemini-1.5-pro-002"
    config = {"temperature": 0.01, "top_k": 1, "max_output_tokens": 1000}
    
    # Use ThreadPoolExecutor for parallel processing
    with ThreadPoolExecutor() as executor:
        # Submit tasks for each row
        futures = [executor.submit(process_article_parallel, row, keys, model, config) for _, row in df.iterrows()]
        
        # Collect results as they complete
        results = [future.result() for future in futures]
    
    return results

In [None]:
def generate_time_series_tables(df: pd.DataFrame):
    # Process the sectors table
    sectors_table = df[["date", "sectors"]].copy()
    sectors_table["sector_name"] = sectors_table["sectors"].apply(lambda x: x.get("sector_name") if isinstance(x, dict) else None)
    sectors_table["sentiment_score"] = sectors_table["sectors"].apply(lambda x: x.get("sentiment_score") if isinstance(x, dict) else None)
    sectors_table = sectors_table.drop(columns=["sectors"])  # Drop the original sectors column
    sectors_table = sectors_table.dropna(subset=["sector_name", "sentiment_score"])  # Drop rows with NaN values
    sectors_table = sectors_table.rename(columns={"sector_name": "sector"})  # Rename column for clarity

    # Process the stocks table
    stocks_table = df[["date", "stocks"]].copy()
    stocks_table["stock_id"] = stocks_table["stocks"].apply(lambda x: x.get("stock_id") if isinstance(x, dict) else None)
    stocks_table["sentiment_score"] = stocks_table["stocks"].apply(lambda x: x.get("sentiment_score") if isinstance(x, dict) else None)
    stocks_table = stocks_table.drop(columns=["stocks"])  # Drop the original stocks column
    stocks_table = stocks_table.dropna(subset=["stock_id", "sentiment_score"])  # Drop rows with NaN values
    stocks_table = stocks_table.rename(columns={"stock_id": "stock"})  # Rename column for clarity

    return sectors_table, stocks_table

In [None]:
df = pd.read_csv("FinSen_US_Categorized_Timestamp.csv")
df.rename(columns={"Time": "date"}, inplace=True)

results = process_dataset_parallel(df)

processed_data = pd.DataFrame(results)

expanded_data = processed_data.explode("sectors").explode("stocks")

sectors_table, stocks_table = generate_time_series_tables(expanded_data)

```json
{
  "sectors": [],
  "stocks": []
}
```

```json
{
  "sectors": [],
  "stocks": []
}
```

```json
{
  "sectors": [
    {
      "sector_name": "communication",
      "sentiment_score": -1
    }
  ],
  "stocks": [
    {
      "stock_id": null,
      "sentiment_score": -1
    }
  ]
}
```
```json
{
  "sectors": [
    {
      "sector_name": "financials",
      "sentiment_score": 0
    }
  ],
  "stocks": []
}
```

```json
{
  "sectors": [
    {
      "sector_name": "financials",
      "sentiment_score": 0
    }
  ],
  "stocks": []
}
```
```json
{
  "sectors": [
    {
      "sector_name": "information_technology",
      "sentiment_score": 1
    }
  ],
  "stocks": [
    {
      "stock_id": "MSFT",
      "sentiment_score": 1
    }
  ]
}
```
```json
{
  "sectors": [
    {
      "sector_name": "information_technology",
      "sentiment_score": -1
    }
  ],
  "stocks": [
    {
      "stock_id": "CSCO",
      "sentiment_score": -1
    }
  ]
}
```
```json
{
  "sectors": [
    {
      "secto

In [15]:
results

[{'date': '16/07/2023',
  'sectors': [{'sector_name': 'energy', 'sentiment_score': -1},
   {'sector_name': 'information_technology', 'sentiment_score': -1},
   {'sector_name': 'financials', 'sentiment_score': 1}],
  'stocks': [{'stock_id': None, 'sentiment_score': None}]},
 {'date': '15/07/2023',
  'sectors': [{'sector_name': 'health_care', 'sentiment_score': 1}],
  'stocks': [{'stock_id': 'UNH', 'sentiment_score': 1}]},
 {'date': '15/07/2023',
  'sectors': [{'sector_name': 'information_technology',
    'sentiment_score': -1}],
  'stocks': [{'stock_id': 'CSCO', 'sentiment_score': -1}]},
 {'date': '15/07/2023',
  'sectors': [{'sector_name': 'communication', 'sentiment_score': -1}],
  'stocks': [{'stock_id': None, 'sentiment_score': -1}]},
 {'date': '15/07/2023',
  'sectors': [{'sector_name': 'information_technology', 'sentiment_score': 1}],
  'stocks': [{'stock_id': 'MSFT', 'sentiment_score': 1}]},
 {'date': '15/07/2023',
  'sectors': [{'sector_name': 'financials', 'sentiment_score': 1}

In [None]:
# Display the first few rows of each table
print("Sectors Table:")
print(sectors_table.head())

print("\nStocks Table:")
print(stocks_table.head())

Sectors Table:
         date                  sector  sentiment_score
0  16/07/2023                  energy             -1.0
0  16/07/2023  information_technology             -1.0
0  16/07/2023              financials              1.0
1  15/07/2023             health_care              1.0
2  15/07/2023  information_technology             -1.0

Stocks Table:
         date stock  sentiment_score
1  15/07/2023   UNH              1.0
2  15/07/2023  CSCO             -1.0
4  15/07/2023  MSFT              1.0
5  15/07/2023   JPM              1.0
7  15/07/2023     C              1.0


In [None]:
# save results

sectors_table.to_csv("sectors_global_sentiment_scores.csv", index=False)
stocks_table.to_csv("stocks_sentiment_scores.csv", index=False)