In [3]:
import json
import pandas as pd
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import sys

def load_jsonl(file_path):
    """Load a JSON Lines file into a pandas DataFrame.

    Every non-blank line is parsed as one JSON document. Loading is
    best-effort: lines that are not valid JSON are skipped, but they are
    counted and reported so data loss is visible instead of silent.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A DataFrame built from the successfully parsed records (empty when
        nothing could be parsed).
    """
    print(f"Loading data from {file_path}...")
    data = []
    skipped = 0
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) are not errors.
            if not line.strip():
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                skipped += 1
    df = pd.DataFrame(data)
    if skipped:
        print(f"Warning: skipped {skipped} malformed line(s)")
    print(f"Loaded {len(df)} articles")
    return df

def load_finbert_model():
    """Load the ProsusAI FinBERT tokenizer and classifier for inference.

    The model is moved to CUDA when ``torch.cuda.is_available()`` is true,
    otherwise to the CPU, and switched to eval mode.

    Returns:
        A ``(tokenizer, model, device)`` tuple.

    Note:
        Any exception raised while loading is printed and the process is
        terminated via ``sys.exit(1)``.
    """
    print("Loading FinBERT model...")
    checkpoint = "ProsusAI/finbert"
    try:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")

        model.to(device)
        model.eval()
        print(f"Model loaded successfully on {device}")
    except Exception as e:
        print(f"Error loading model: {e}")
        sys.exit(1)
    return tokenizer, model, device

def analyze_sentiment(text, tokenizer, model, device, max_length=512):
    """Classify one piece of text with FinBERT.

    Args:
        text: The text to score.
        tokenizer: Callable that encodes ``text`` into a mapping of tensors.
        model: Callable returning an object with a ``logits`` tensor.
        device: Torch device the encoded tensors are moved to.
        max_length: Token limit passed to the tokenizer; longer inputs are
            truncated.

    Returns:
        ``(label, probs)`` where ``label`` is 'positive', 'negative' or
        'neutral' and ``probs`` is a NumPy array of softmax probabilities
        in the order [positive, negative, neutral]. If anything fails the
        error is printed and ``('neutral', [0.33, 0.33, 0.34])`` is returned.
    """
    # FinBERT outputs: [positive, negative, neutral]
    labels = {0: 'positive', 1: 'negative', 2: 'neutral'}
    try:
        encoded = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
            padding=True,
        )
        on_device = {}
        for name, tensor in encoded.items():
            on_device[name] = tensor.to(device)

        # Inference only: no gradients needed.
        with torch.no_grad():
            logits = model(**on_device).logits
            distribution = torch.nn.functional.softmax(logits, dim=-1)

        probs = distribution[0].cpu().numpy()
        return labels[np.argmax(probs)], probs
    except Exception as e:
        print(f"Error analyzing sentiment: {e}")
        return 'neutral', np.array([0.33, 0.33, 0.34])

def generate_monthly_signals(df, tokenizer, model, device, start_date='2020-01', end_date='2024-12'):
    """Generate monthly buy/hold/sell signals for each company.

    Articles are filtered to the requested date range, each one is scored
    with ``analyze_sentiment`` (title plus text), and the scores are
    averaged per ``symbol`` and calendar month. An average score above 0.1
    yields 'buy', below -0.1 yields 'sell', anything else 'hold'.

    Args:
        df: Articles with the columns 'Publishdate' and 'symbol', and
            optionally 'Title', 'Text' and 'company'. It is not modified.
        tokenizer: Tokenizer forwarded to ``analyze_sentiment``.
        model: Model forwarded to ``analyze_sentiment``.
        device: Torch device forwarded to ``analyze_sentiment``.
        start_date: First date to include (parsed by ``pd.to_datetime``).
        end_date: Articles are kept through the end of this date's month.

    Returns:
        ``(signals_df, analyzed_df)``. ``signals_df`` has one row per
        (symbol, month) and always carries its columns, even when empty.
        ``analyzed_df`` is the filtered article frame with 'year_month',
        'sentiment' and 'sentiment_score' added.
    """
    print("\nProcessing dates...")
    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    df = df.assign(Publishdate=pd.to_datetime(df['Publishdate'], errors='coerce'))

    # Remove rows with invalid dates
    df = df.dropna(subset=['Publishdate'])

    # Filter date range. The upper bound is exclusive and sits at midnight
    # *after* the last day of end_date's month; comparing against the last
    # day itself (at 00:00) would drop articles published later that day.
    start = pd.to_datetime(start_date)
    end_exclusive = (
        (pd.to_datetime(end_date) + pd.offsets.MonthEnd(0)).normalize()
        + pd.Timedelta(days=1)
    )
    in_range = (df['Publishdate'] >= start) & (df['Publishdate'] < end_exclusive)
    df = df[in_range].copy()

    print(f"Filtered to {len(df)} articles between {start_date} and {end_date}")

    # Add year-month column
    df['year_month'] = df['Publishdate'].dt.to_period('M')

    # Analyze sentiment for each article
    print("\nAnalyzing sentiment for articles...")
    sentiments = []
    sentiment_scores = []

    total = len(df)
    for _, row in df.iterrows():
        if len(sentiments) % 100 == 0:
            print(f"Progress: {len(sentiments)}/{total} articles processed")

        # Combine title and text; a missing title must not become "nan".
        title = row.get('Title')
        text = str(title) if pd.notna(title) else ''
        if pd.notna(row.get('Text')):
            text = text + " " + str(row['Text'])

        # Nothing to score: treat as neutral without calling the model.
        if not text.strip():
            sentiments.append('neutral')
            sentiment_scores.append(0.0)
            continue

        sentiment, probs = analyze_sentiment(text, tokenizer, model, device)
        sentiments.append(sentiment)

        # Calculate sentiment score: positive - negative
        sentiment_scores.append(float(probs[0] - probs[1]))

    df['sentiment'] = sentiments
    df['sentiment_score'] = sentiment_scores

    # Generate monthly signals per company
    print("\nGenerating monthly signals...")
    monthly_signals = []
    has_company = 'company' in df.columns

    for (symbol, year_month), group in df.groupby(['symbol', 'year_month']):
        avg_score = group['sentiment_score'].mean()

        # Positive threshold: > 0.1, Negative threshold: < -0.1
        if avg_score > 0.1:
            signal = 'buy'
        elif avg_score < -0.1:
            signal = 'sell'
        else:
            signal = 'hold'

        monthly_signals.append({
            'symbol': symbol,
            'company': group['company'].iloc[0] if has_company else '',
            'year_month': str(year_month),
            'signal': signal,
            'avg_sentiment_score': round(avg_score, 4),
            'article_count': len(group),
            'positive_count': int((group['sentiment'] == 'positive').sum()),
            'negative_count': int((group['sentiment'] == 'negative').sum()),
            'neutral_count': int((group['sentiment'] == 'neutral').sum()),
        })

    # Explicit columns keep the schema stable when no article matched, so
    # callers can still access e.g. signals_df['signal'].
    signal_columns = [
        'symbol', 'company', 'year_month', 'signal', 'avg_sentiment_score',
        'article_count', 'positive_count', 'negative_count', 'neutral_count',
    ]
    signals_df = pd.DataFrame(monthly_signals, columns=signal_columns)
    return signals_df, df

def main():
    """Run the whole pipeline from a local JSONL file.

    Loads the articles, prints basic dataset information, loads FinBERT,
    generates the monthly signals, writes 'monthly_signals.csv' and
    'analyzed_articles.csv' to the working directory and prints a summary.
    Returns early (after printing an error) when no data could be loaded.
    """
    # Configuration
    file_path = "stock_data_articles.jsonl"  # Update with your file path

    # Load the dataset
    df = load_jsonl(file_path)

    if df.empty:
        print("Error: No data loaded")
        return

    print(f"\nDataset info:")
    print(f"  Columns: {list(df.columns)}")
    # The date column is spelled 'Publishdate' in this file's pipeline, so an
    # exact check for 'publishdate' never matched; look it up ignoring case.
    date_col = next(
        (col for col in df.columns if str(col).lower() == 'publishdate'), None)
    if date_col is not None:
        # Parse first so min/max compare dates, not raw strings or NaNs.
        dates = pd.to_datetime(df[date_col], errors='coerce')
        print(f"  Date range: {dates.min()} to {dates.max()}")
    if 'symbol' in df.columns:
        print(f"  Number of companies: {df['symbol'].nunique()}")

    # Load model
    tokenizer, model, device = load_finbert_model()

    # Generate signals
    signals_df, analyzed_df = generate_monthly_signals(df, tokenizer, model, device)

    # Save results
    print("\nSaving results...")
    signals_df.to_csv('monthly_signals.csv', index=False)
    analyzed_df.to_csv('analyzed_articles.csv', index=False)

    print("\n" + "="*50)
    print("SUMMARY")
    print("="*50)
    print(f"Total monthly signals generated: {len(signals_df)}")
    # With no signals the frame may have no 'signal' column at all, so skip
    # the breakdown instead of failing with a KeyError.
    if not signals_df.empty:
        print(f"\nSignal distribution:")
        print(signals_df['signal'].value_counts().to_string())

        print("\n" + "="*50)
        print("SAMPLE SIGNALS")
        print("="*50)
        print(signals_df.head(10).to_string())

    print(f"\nResults saved to:")
    print(f"  - monthly_signals.csv")
    print(f"  - analyzed_articles.csv")

ERROR! Session/line number was not unique in database. History logging moved to new session 389


In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os

# Use os.environ to set the Hugging Face access token (e.g. the HF_TOKEN variable) before downloading models.

In [5]:
import pandas as pd

# Read the articles file directly from the Hugging Face Hub dataset repo into
# a DataFrame; lines=True treats the file as JSON Lines (one record per line).
# NOTE(review): the hf:// URL scheme presumably needs huggingface_hub
# installed for pandas/fsspec to resolve it — confirm in the environment.
df = pd.read_json("hf://datasets/KrossKinetic/SP500-Financial-News-Articles-Time-Series/stock_data_articles.jsonl", lines=True)


In [5]:
import pandas as pd
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import os

# Turn off Hugging Face progress bars (they caused context errors here) and
# limit transformers logging to errors.
# NOTE(review): these are set after `transformers` is imported above; settings
# read at import time may not pick them up — confirm they take effect.
os.environ.update({
    'HF_HUB_DISABLE_PROGRESS_BARS': '1',
    'TRANSFORMERS_VERBOSITY': 'error',
})

def load_finbert_model():
    """Load the ProsusAI FinBERT tokenizer and model in eval mode.

    Returns:
        ``(tokenizer, model, device)``; ``device`` is CUDA when available,
        otherwise CPU, and the model has already been moved onto it.

    Raises:
        Exception: Whatever the loading step raised; it is printed first and
            then re-raised unchanged.
    """
    print("Loading FinBERT model...")
    try:
        name = "ProsusAI/finbert"
        tok = AutoTokenizer.from_pretrained(name)
        clf = AutoModelForSequenceClassification.from_pretrained(name)

        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        target = torch.device(device_name)

        clf.to(target)
        clf.eval()
        print(f"Model loaded successfully on {target}")
        return tok, clf, target
    except Exception as e:
        print(f"Error loading model: {e}")
        raise

def analyze_sentiment(text, tokenizer, model, device, max_length=512):
    """Score a text with FinBERT and return its label and probabilities.

    The text is tokenized (truncated to ``max_length`` tokens), run through
    ``model`` without gradients, and the logits are turned into softmax
    probabilities ordered [positive, negative, neutral].

    Returns:
        ``(label, probs)``: the most probable label and the probabilities
        as a NumPy array. On any exception the error is printed and the
        fallback ``('neutral', [0.33, 0.33, 0.34])`` is returned.
    """
    try:
        # Truncate text if too long
        batch = tokenizer(text, return_tensors="pt", truncation=True,
                          max_length=max_length, padding=True)
        batch = dict((key, value.to(device)) for key, value in batch.items())

        with torch.no_grad():
            result = model(**batch)
            softmaxed = torch.nn.functional.softmax(result.logits, dim=-1)

        probs = softmaxed[0].cpu().numpy()
        winner = np.argmax(probs)
        # FinBERT outputs: [positive, negative, neutral]
        return {0: 'positive', 1: 'negative', 2: 'neutral'}[winner], probs
    except Exception as e:
        print(f"Error analyzing sentiment: {e}")
        return 'neutral', np.array([0.33, 0.33, 0.34])

def generate_monthly_signals(df, tokenizer, model, device, start_date='2020-01', end_date='2024-12',
                             buy_threshold=0.2, sell_threshold=-0.2):
    """Generate monthly buy/hold/sell signals for each company.

    Column names are matched case-insensitively ('publishdate', 'symbol',
    'title', 'text', 'company'). Articles inside the date range are scored
    with ``analyze_sentiment`` (title plus text) and the scores are averaged
    per symbol and calendar month.

    Args:
        df: Article DataFrame; a copy is processed, the input is not modified.
        tokenizer: Tokenizer forwarded to ``analyze_sentiment``.
        model: Model forwarded to ``analyze_sentiment``.
        device: Torch device forwarded to ``analyze_sentiment``.
        start_date: First date to include (parsed by ``pd.to_datetime``).
        end_date: Articles are kept through the end of this date's month.
        buy_threshold: Average score strictly above this yields 'buy'.
        sell_threshold: Average score strictly below this yields 'sell'.

    Returns:
        ``(signals_df, analyzed_df)``. ``signals_df`` has one row per
        (symbol, month) and keeps its columns even when empty.
        ``analyzed_df`` is the filtered copy with 'publishdate' (parsed),
        'year_month', 'sentiment' and 'sentiment_score' columns.

    Raises:
        ValueError: If no 'publishdate' or no 'symbol' column is found.
    """
    # Make a copy to avoid modifying the original
    df = df.copy()

    def find_column(wanted):
        """Return the first column whose lower-cased name equals ``wanted``."""
        return next((col for col in df.columns if col.lower() == wanted), None)

    # Convert publishdate to datetime - handle different column name cases
    date_col = find_column('publishdate')
    if date_col is None:
        raise ValueError("Could not find 'publishdate' column in dataframe")

    print(f"\nProcessing dates from column '{date_col}'...")
    df['publishdate'] = pd.to_datetime(df[date_col], errors='coerce')

    # Remove rows with invalid dates
    df = df.dropna(subset=['publishdate'])

    # Filter date range. The upper bound is exclusive and sits at midnight
    # *after* the last day of end_date's month; comparing against the last
    # day itself (at 00:00) would drop articles published later that day.
    start = pd.to_datetime(start_date)
    end_exclusive = (
        (pd.to_datetime(end_date) + pd.offsets.MonthEnd(0)).normalize()
        + pd.Timedelta(days=1)
    )
    in_range = (df['publishdate'] >= start) & (df['publishdate'] < end_exclusive)
    df = df[in_range].copy()

    print(f"Filtered to {len(df)} articles between {start_date} and {end_date}")

    # Add year-month column
    df['year_month'] = df['publishdate'].dt.to_period('M')

    # Find title and text columns (case-insensitive)
    title_col = find_column('title')
    text_col = find_column('text')
    symbol_col = find_column('symbol')
    company_col = find_column('company')

    if symbol_col is None:
        raise ValueError("Could not find 'symbol' column in dataframe")

    # Analyze sentiment for each article
    print("\nAnalyzing sentiment for articles...")
    sentiments = []
    sentiment_scores = []

    total = len(df)
    for _, row in df.iterrows():
        if len(sentiments) % 100 == 0:
            print(f"Progress: {len(sentiments)}/{total} articles processed")

        # Combine title and text
        text = str(row[title_col]) if title_col and pd.notna(row.get(title_col)) else ''
        if text_col and pd.notna(row.get(text_col)):
            text = text + " " + str(row[text_col])

        # Nothing to score: treat as neutral without calling the model.
        if not text.strip():
            sentiments.append('neutral')
            sentiment_scores.append(0.0)
            continue

        sentiment, probs = analyze_sentiment(text, tokenizer, model, device)
        sentiments.append(sentiment)

        # Calculate sentiment score: positive - negative
        sentiment_scores.append(float(probs[0] - probs[1]))

    df['sentiment'] = sentiments
    df['sentiment_score'] = sentiment_scores

    # Generate monthly signals per company
    print("\nGenerating monthly signals...")
    monthly_signals = []

    for (symbol, year_month), group in df.groupby([symbol_col, 'year_month']):
        avg_score = group['sentiment_score'].mean()

        # Signal from the average score: above buy_threshold -> buy,
        # below sell_threshold -> sell, otherwise hold (defaults: +/-0.2).
        if avg_score > buy_threshold:
            signal = 'buy'
        elif avg_score < sell_threshold:
            signal = 'sell'
        else:
            signal = 'hold'

        monthly_signals.append({
            'symbol': symbol,
            'company': group[company_col].iloc[0] if company_col else '',
            'year_month': str(year_month),
            'signal': signal,
            'avg_sentiment_score': round(avg_score, 4),
            'article_count': len(group),
            'positive_count': int((group['sentiment'] == 'positive').sum()),
            'negative_count': int((group['sentiment'] == 'negative').sum()),
            'neutral_count': int((group['sentiment'] == 'neutral').sum()),
        })

    # Explicit columns keep the schema stable when no article matched, so
    # callers can still access e.g. signals_df['signal'].
    signal_columns = [
        'symbol', 'company', 'year_month', 'signal', 'avg_sentiment_score',
        'article_count', 'positive_count', 'negative_count', 'neutral_count',
    ]
    signals_df = pd.DataFrame(monthly_signals, columns=signal_columns)
    return signals_df, df

# Main execution: relies on the `df` already loaded earlier in this session.
print("Dataset info:")
print(f"  Shape: {df.shape}")
print(f"  Columns: {list(df.columns)}")

# Load FinBERT once, then score every article and aggregate per month.
tokenizer, model, device = load_finbert_model()
signals_df, analyzed_df = generate_monthly_signals(df, tokenizer, model, device)

# Persist both the aggregated signals and the per-article results.
print("\nSaving results...")
signals_df.to_csv('monthly_signals.csv', index=False)
analyzed_df.to_csv('analyzed_articles.csv', index=False)

# Summary banner followed by the signal counts.
print("\n" + "=" * 50, "SUMMARY", "=" * 50, sep="\n")
print(f"Total monthly signals generated: {len(signals_df)}")
print("\nSignal distribution:")
print(signals_df['signal'].value_counts())

# First ten signals as a quick sanity check.
print("\n" + "=" * 50, "SAMPLE SIGNALS", "=" * 50, sep="\n")
print(signals_df.head(10))

print(
    "\nResults saved to:",
    "  - monthly_signals.csv",
    "  - analyzed_articles.csv",
    sep="\n",
)

Dataset info:
  Shape: (4589, 7)
  Columns: ['id_', 'links', 'symbol', 'company', 'Title', 'Text', 'Publishdate']
Loading FinBERT model...



KeyboardInterrupt



KeyboardInterrupt: 