In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from transformers.modeling_outputs import SequenceClassifierOutput
import numpy as np

class CustomSentimentClassifierWithWeights(nn.Module):
    """Encoder + dropout + linear head, trained with class-weighted cross-entropy.

    The sub-module attribute names (``base_model``, ``dropout``, ``classifier``,
    ``loss_fct``) determine the keys of the state dict, so they must not be
    renamed if previously saved checkpoints are to remain loadable.

    Args:
        base_model: Encoder called as ``base_model(input_ids=..., attention_mask=...)``
            whose output exposes ``last_hidden_state``.
        class_weights: Per-class weight tensor passed to ``nn.CrossEntropyLoss``.
        num_labels: Number of output classes.
        dropout_rate: Dropout probability applied before the linear head.
        hidden_size: Width of the encoder's hidden state. When ``None`` it is
            read from ``base_model.config.hidden_size``; if that is not
            available the historical default of 768 is used.
    """

    # Width used when the encoder does not advertise its hidden size.
    _DEFAULT_HIDDEN_SIZE = 768

    def __init__(self, base_model, class_weights, num_labels=3, dropout_rate=0.1,
                 hidden_size=None):
        super().__init__()
        self.num_labels = num_labels
        self.base_model = base_model
        self.dropout = nn.Dropout(dropout_rate)

        if hidden_size is None:
            # Size the head from the encoder itself so checkpoints other than
            # 768-wide ones work without editing this class.
            encoder_config = getattr(base_model, "config", None)
            hidden_size = (
                getattr(encoder_config, "hidden_size", None)
                or self._DEFAULT_HIDDEN_SIZE
            )

        self.classifier = nn.Linear(hidden_size, num_labels)
        self.loss_fct = nn.CrossEntropyLoss(weight=class_weights)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and classification head.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.
            labels: Optional class-index tensor; when given, the weighted
                cross-entropy loss is computed as well.

        Returns:
            SequenceClassifierOutput with ``logits`` and, if ``labels`` was
            supplied, ``loss`` (otherwise ``loss`` is ``None``).
        """
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)

        # Use the hidden state at sequence position 0 as the pooled representation.
        cls_output = outputs.last_hidden_state[:, 0]
        logits = self.classifier(self.dropout(cls_output))

        loss = None
        if labels is not None:
            loss = self.loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            # Tolerate encoder outputs that do not carry these optional fields.
            hidden_states=getattr(outputs, "hidden_states", None),
            attentions=getattr(outputs, "attentions", None),
        )

def load_model_and_tokenizer(model_path="sentiment_model/article_sentiment_analysis.pth", base_checkpoint="distilbert-base-uncased"):
    """
    Build the sentiment classifier, restore its trained weights and pick a device.

    The tokenizer and encoder are created from ``base_checkpoint``; the
    classifier's state dict is then read from ``model_path``.

    Returns a ``(model, tokenizer, device)`` tuple with the model in eval mode,
    or ``(None, None, None)`` if the weights could not be loaded.
    """
    print("Loading sentiment model and tokenizer...")
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    print(f"Using device: {device}")

    tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
    encoder = AutoModel.from_pretrained(base_checkpoint).to(device)

    # Placeholder loss weights: only needed to construct the module.
    placeholder_weights = torch.ones(3).to(device)
    model = CustomSentimentClassifierWithWeights(
        base_model=encoder,
        class_weights=placeholder_weights,
        num_labels=3,
    )

    load_failed = (None, None, None)
    try:
        trained_state = torch.load(model_path, map_location=device)
        model.load_state_dict(trained_state)
    except FileNotFoundError:
        print(f"ERROR: Model file not found at {model_path}")
        print("Please make sure the file is in the same directory.")
        return load_failed
    except Exception as e:
        print(f"Error loading model state_dict: {e}")
        return load_failed

    model.to(device)
    model.eval()

    print("sentiment model loaded successfully.")
    return model, tokenizer, device

def get_sentiment(text, model, tokenizer, device):
    """
    Score one text with the loaded classifier.

    The text is tokenized (padded/truncated to 128 tokens), run through the
    model without gradient tracking, and the softmax probability of class
    index 0 is subtracted from that of class index 2, giving a value in
    [-1, 1]. Anything that is not a ``str`` scores 0.0.
    """
    if not isinstance(text, str):
        return 0.0

    encoded = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    ids_on_device = encoded["input_ids"].to(device)
    mask_on_device = encoded["attention_mask"].to(device)

    with torch.no_grad():
        logits = model(input_ids=ids_on_device, attention_mask=mask_on_device).logits

    # Single-item batch: take row 0 of the class probabilities.
    class_probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
    low_class_prob = class_probs[0]
    high_class_prob = class_probs[2]

    return float(high_class_prob - low_class_prob)

In [None]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.cloud.exceptions import NotFound
from dateutil.parser import parse as _parse 


# Destination project and dataset shared by every BigQuery helper below.
GCP_PROJECT_ID = 'pivotal-glider-472219-r7'
BIGQUERY_DATASET = 'market_data'

# Create the client and make sure the dataset exists, creating it on first use.
print("Initializing BigQuery client...")
try:
    bq_client = bigquery.Client(project=GCP_PROJECT_ID)
    dataset_ref = bq_client.dataset(BIGQUERY_DATASET)
    bq_client.get_dataset(dataset_ref)
except NotFound:
    print(f"BigQuery dataset {BIGQUERY_DATASET} not found. Creating...")
    try:
        bq_client.create_dataset(dataset_ref, timeout=30)
    except Exception as e:
        print(f"ERROR: Could not create dataset. {e}")
        exit()
    else:
        print(f"Dataset {BIGQUERY_DATASET} created.")
except Exception as e:
    # Anything else at this point is treated as a client/auth problem.
    print(f"ERROR: Could not initialize BigQuery client. {e}")
    print("Please ensure you have authenticated with 'gcloud auth application-default login'")
    exit()
else:
    print(f"BigQuery dataset {BIGQUERY_DATASET} already exists.")



def create_table_if_not_exists(table_name, schema, partition_field):
    """
    Ensure ``table_name`` exists in the configured dataset.

    If the table lookup reports NotFound, the table is created with the given
    schema and day-level time partitioning on ``partition_field``. Errors
    raised while creating the table are printed and then re-raised.
    """
    table_id = "{}.{}.{}".format(GCP_PROJECT_ID, BIGQUERY_DATASET, table_name)

    try:
        bq_client.get_table(table_id)
    except NotFound:
        # Expected on first use: fall through and create the table.
        pass
    else:
        return

    print(f"Table {table_id} not found. Creating...")
    try:
        day_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field=partition_field,
        )
        new_table = bigquery.Table(table_id, schema=schema)
        new_table.time_partitioning = day_partitioning
        bq_client.create_table(new_table)
        print(f"  Success. Table {table_id} created with partitioning on '{partition_field}'.")
    except Exception as e:
        print(f"  --- ERROR creating table {table_name} ---")
        print(f"  {e}")
        raise

def _coerce_frame_to_schema(df, schema, partition_field):
    """Return a copy of *df* with the partition column and typed columns normalised."""
    frame = df.copy()

    # Dates are sent as ISO strings, which is what a DATE column accepts in JSON.
    frame[partition_field] = pd.to_datetime(frame[partition_field]).dt.strftime('%Y-%m-%d')

    for field in schema:
        column = field.name
        if column not in frame.columns:
            continue
        if field.field_type == 'NUMERIC':
            frame[column] = pd.to_numeric(frame[column]).round(4)
        elif field.field_type == 'INTEGER':
            # Go through float first so NaN survives, then use the nullable integer dtype.
            frame[column] = frame[column].astype(float).astype('Int64')

    return frame


def _json_safe(value):
    """Map pandas/NumPy missing markers to None and NumPy scalars to Python scalars."""
    if isinstance(value, np.generic):
        value = value.item()
    if value is None or value is pd.NA or value is pd.NaT:
        return None
    if isinstance(value, float) and value != value:  # NaN is the only float != itself
        return None
    return value


def _frame_to_json_rows(frame):
    """Convert *frame* to a list of JSON-serialisable row dicts."""
    return [
        {column: _json_safe(value) for column, value in record.items()}
        for record in frame.to_dict('records')
    ]


def upload_to_bigquery(df, table_name, schema, partition_field='date'):
    """
    Uploads a DataFrame to BigQuery idempotently using a DELETE/INSERT pattern.

    Steps:
      1. Ensure the destination table exists (created with day partitioning).
      2. Normalise the frame: partition column to 'YYYY-MM-DD' strings,
         NUMERIC columns rounded to 4 decimals, INTEGER columns to nullable ints.
      3. DELETE every existing row whose partition date lies between the
         smallest and largest date in the frame.
      4. Append the new rows with a batch load job.

    The rows are appended with a load job rather than a streaming insert:
    BigQuery does not allow DML statements to modify rows that were recently
    written through the streaming API, so streaming here would make the DELETE
    of the *next* run fail if it happens shortly after this one.

    NOTE: steps 3 and 4 are separate operations; if the load fails, the rows
    removed by the DELETE are not restored.

    Args:
        df: DataFrame to upload. ``None`` or an empty frame is skipped.
        table_name: Table name inside the configured project/dataset.
        schema: Sequence of objects exposing ``name`` and ``field_type``
            (e.g. ``bigquery.SchemaField``), used for table creation and
            column type coercion.
        partition_field: Name of the date column used for partitioning and
            for the DELETE range.

    Returns:
        None. Progress and errors are reported via ``print``.
    """
    table_id = f"{GCP_PROJECT_ID}.{BIGQUERY_DATASET}.{table_name}"

    try:
        create_table_if_not_exists(table_name, schema, partition_field)
    except Exception:
        print(f"Aborting upload to {table_name} due to table creation error.")
        return

    if df is None or df.empty:
        print(f"No data provided for {table_name}, skipping upload.")
        return

    if partition_field not in df.columns:
        print(f"ERROR: Partition field '{partition_field}' not in DataFrame columns: {df.columns}")
        return

    df_upload = _coerce_frame_to_schema(df, schema, partition_field)

    # --- Delete existing data in the date range covered by this upload ---
    # ISO date strings sort chronologically, so min()/max() give the range bounds.
    valid_dates = df_upload[partition_field].dropna()
    if valid_dates.empty:
        print(f"  No valid dates found in data for {table_name}, skipping delete and insert.")
        return
    min_date = valid_dates.min()
    max_date = valid_dates.max()

    print(f"\nMaking upload idempotent: Deleting existing data in {table_name} between {min_date} and {max_date}...")

    try:
        query_params = [
            bigquery.ScalarQueryParameter("min_date", "DATE", min_date),
            bigquery.ScalarQueryParameter("max_date", "DATE", max_date),
        ]
        job_config = bigquery.QueryJobConfig(query_parameters=query_params)

        # Identifiers cannot be bound as query parameters; the column name is
        # back-quoted, the date values are passed as parameters.
        delete_query = f"""
            DELETE FROM `{table_id}`
            WHERE `{partition_field}` BETWEEN @min_date AND @max_date
        """

        delete_job = bq_client.query(delete_query, job_config=job_config)
        delete_job.result()

        print(f"  Success. Deleted {delete_job.num_dml_affected_rows} existing rows.")

    except Exception as e:
        print(f"  --- ERROR during DELETE operation for {table_name} ---")
        print(f"  {e}")
        print(f"  Aborting upload for this table.")
        return

    # --- Append the new rows ---
    rows_to_insert = _frame_to_json_rows(df_upload)

    if not rows_to_insert:
        print(f"  No data to upload for {table_name} after processing.")
        return

    print(f"  Uploading {len(rows_to_insert)} new rows to {table_id} (using a batch load job)...")

    load_job = None
    try:
        # Load against the table's actual schema so column types always match.
        load_config = bigquery.LoadJobConfig(
            schema=bq_client.get_table(table_id).schema,
            write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
        )
        load_job = bq_client.load_table_from_json(
            rows_to_insert, table_id, job_config=load_config
        )
        load_job.result()
        print(f"  Success. Loaded {load_job.output_rows} rows.")

    except Exception as e:
        print(f"  --- FATAL ERROR uploading {table_name} ---")
        print(f"  {e}")

        job_errors = (getattr(load_job, "errors", None) or []) if load_job is not None else []
        if job_errors:
            print("  Errors reported by the load job:")
            for job_error in job_errors[:20]:
                print(f"  - {job_error}")
            if len(job_errors) > 20:
                print(f"  ... and {len(job_errors) - 20} more errors.")

        print("  DataFrame Info:")
        df_upload.info()
        print("\n  DataFrame Head:")
        print(df_upload.head())


In [None]:
import pandas as pd
import time
from google.cloud import bigquery
# Load the classifier once; every article below is scored with this instance.
model, tokenizer, device = load_model_and_tokenizer()

# load_model_and_tokenizer() returns (None, None, None) when the weights cannot be loaded.
if model is None:
    print("Failed to load model, exiting pipeline.")
    exit()

# Category name -> keyword phrases that mark an article as belonging to it.
# Phrases are compared against the lowercased article text, so every entry
# here must be lowercase. An article may match several categories.
# NOTE(review): some phrases look like n-grams with stop words/punctuation
# removed (e.g. "bank america", "bureau labor statistics", "u k"); those will
# not occur verbatim in raw article text — confirm whether the text is
# pre-processed the same way before matching.
ASPECT_MAP = {
    "interest_rates": [
        "interest rate", "interest rates", "rate cuts", "rate cut", "fed rate cuts",
        "federal reserve", "central bank", "chair jerome powell", "fed", "rate",
        "federal open market", "open market committee", "lower interest rates"
    ],
    "inflation": [
        "inflation", "consumer price index", "cpi", "producer price index", "ppi",
        "personal consumption expenditures", "rising prices", "cost of living",
        "price", "prices", "consumer", "spending"
    ],
    "unemployment": [
        "unemployment", "labor market", "bureau labor statistics", "labor department report",
        "nonfarm payrolls", "jobless claims", "job growth", "layoffs", "job", "jobs", "labor"
    ],
    "stock_market": [
        "stock market", "wall street", "dow jones", "dow jones industrial", "jones industrial average",
        "nasdaq composite", "stock exchange", "new york stock", "market cap", "per share",
        "price target", "earnings per share", "stock", "stocks", "shares", "market"
    ],
    "recession": [
        "recession", "economic downturn", "negative growth", "gdp", "economy", "growth",
        "great financial crisis"
    ],
    "tech_ai": [
        "artificial intelligence", "ai", "meta", "nvidia", "google", "apple", "microsoft",
        "meta ray ban", "ray ban", "smart glasses", "meta smart glasses", "ceo mark zuckerberg",
        "machine learning", "data center", "data centers", "ai models", "scale ai", "openai",
        "meta superintelligence labs", "large language models", "nvidia ceo jensen"
    ],
    "politics_regulation": [
        "donald trump", "president donald trump", "white house", "trump administration",
        "federal trade commission", "securities exchange commission", "house oversight committee",
        "government", "tariffs", "china", "chinese", "u k", "trade"
    ],
    "finance_banking": [
        "goldman sachs", "morgan stanley", "bank america", "jpmorgan chase", "ceo jamie dimon",
        "private equity", "financial", "investment", "investors", "fund", "funds"
    ]
}

# Category names in declaration order; used later to build one output column per category.
CATEGORIES = list(ASPECT_MAP.keys())

import re

print("\nStarting Category-Based Sentiment Analysis with custom model...")
try:
    df_articles = pd.read_json('articles_export.json')
    print(f"Loaded {len(df_articles)} articles.")
except FileNotFoundError:
    print("ERROR: 'articles_export.json' not found.")
    exit()

# Calendar date of publication; used as the grouping key for daily aggregation.
df_articles['date'] = pd.to_datetime(df_articles['published_at']).dt.date

# One compiled pattern per category that matches any of its keywords as a
# whole word/phrase. A plain substring test would let short keywords fire
# inside unrelated words ("ai" in "said", "rate" in "corporate", "fed" in
# "offered"), tagging almost every article with almost every category.
CATEGORY_PATTERNS = {
    category: re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(keyword) for keyword in keywords) + r")(?!\w)"
    )
    for category, keywords in ASPECT_MAP.items()
}

# One record per (article, matched category); aggregated by day afterwards.
all_category_sentiments = []
total_articles = len(df_articles)
start_time = time.time()

# `position` counts rows seen so far; it is independent of the DataFrame's
# index labels, which are only used as a fallback article id.
for position, (index, article) in enumerate(df_articles.iterrows(), start=1):
    article_content = article['article_content']

    # Rows without textual content contribute no sentiment records.
    if isinstance(article_content, str):
        article_content_lower = article_content.lower()
        matched_categories = [
            category
            for category, pattern in CATEGORY_PATTERNS.items()
            if pattern.search(article_content_lower)
        ]

        # The score is only used for matched categories, so skip the model
        # call entirely when nothing matched.
        if matched_categories:
            article_date = article['date']
            article_id = article.get('id', index)
            article_sentiment_score = get_sentiment(article_content, model, tokenizer, device)

            # The same article-level score is recorded under every matched category.
            for category in matched_categories:
                all_category_sentiments.append({
                    "date": article_date,
                    "article_id": article_id,
                    "category": category,
                    "sentiment_score": article_sentiment_score
                })

    if position % 100 == 0:
        processed_count = position
        elapsed_time = time.time() - start_time
        articles_per_sec = processed_count / elapsed_time if elapsed_time > 0 else 0
        print(f"  ...processed {processed_count} / {total_articles} articles ({articles_per_sec:.2f} art/sec)")

end_time = time.time()
print(f"Finished processing all articles in {end_time - start_time:.2f} seconds.")

# Nothing to aggregate if no article matched any category.
if not all_category_sentiments:
    print("No categories were found in any articles. Exiting.")
    exit()

print("\nAggregating sentiment scores by day...")
df_category_scores = pd.DataFrame(all_category_sentiments)

# Mean score per (date, category), then one column per category.
daily_category_sentiment = (
    df_category_scores
    .groupby(['date', 'category'])['sentiment_score']
    .mean()
)
df_final_sentiment = daily_category_sentiment.unstack(level='category')
df_final_sentiment.columns = [
    "sentiment_{}".format(col) for col in df_final_sentiment.columns
]

# Article count per day, joined on the date index.
news_volume = df_articles.groupby('date').size().to_frame('news_volume')

# Days without a score for some category get 0; 'date' becomes a regular column.
df_final_sentiment = (
    df_final_sentiment
    .join(news_volume)
    .fillna(0)
    .reset_index()
)

print("Daily Category-Based Sentiment is complete:")
print(df_final_sentiment.head())

print("\nPreparing schema and data for BigQuery...")

# One sentiment column per category, in ASPECT_MAP declaration order.
sentiment_columns = [f"sentiment_{category}" for category in CATEGORIES]

# Categories that never matched have no column yet; add them filled with 0.0
# so the frame always has the full set of schema columns.
for category, col_name in zip(CATEGORIES, sentiment_columns):
    if col_name in df_final_sentiment.columns:
        continue
    print(f"Warning: Category '{category}' not found in any articles. Adding empty column.")
    df_final_sentiment[col_name] = 0.0

absa_sentiment_schema = [
    bigquery.SchemaField('date', 'DATE'),
    bigquery.SchemaField('news_volume', 'INTEGER'),
    *(bigquery.SchemaField(col_name, 'NUMERIC') for col_name in sentiment_columns),
]

# Restrict and order the frame's columns to exactly the schema's columns.
schema_names = [field.name for field in absa_sentiment_schema]
try:
    df_final_sentiment = df_final_sentiment[schema_names]
except KeyError as e:
    print(f"ERROR: Mismatch between schema and DataFrame columns. Missing: {e}")
    print(f"DataFrame columns: {df_final_sentiment.columns.tolist()}")
    print(f"Expected schema columns: {schema_names}")
    exit()


print("Uploading to BigQuery...")
upload_to_bigquery(
    df_final_sentiment,
    'daily_sentiment_absa',
    absa_sentiment_schema,
    partition_field='date',
)

print("\n\nSuccessfully uploaded sentiment data")