In [112]:
# External libraries for text similarity, NLP, date parsing, and sentiment analysis
from thefuzz import fuzz, process  # thefuzz (formerly fuzzywuzzy) for fuzzy string matching
import spacy                       # NLP library for Named Entity Recognition
from datetime import datetime, timedelta  # Standard library for working with date and time
import dateparser                  # Human-friendly natural language date parser
import re                          # Regular expressions for text cleaning
import nltk                        # Natural Language Toolkit for stopwords and lemmatization
from nltk.corpus import stopwords  # Stopwords list
from transformers import AutoTokenizer  # Tokenizer for model input formatting
from collections import defaultdict      # Dictionary subclass for convenient default values
from transformers import pipeline        # Prebuilt Hugging Face pipelines for NLP tasks
import threading                   # Threading module for thread-safe operations


class Model:
    """
    AI Model class for financial text analysis.

    This class:
    - Extracts organization names using Named Entity Recognition.
    - Matches them against a provided Fortune 500 dictionary.
    - Detects timeframes from plain text.
    - Runs sentiment analysis using a FinBERT model.
    - Updates prediction scores in a thread-safe manner.
    """

    # Download NLTK resources (if not already present)
    nltk.download('stopwords')
    nltk.download('wordnet')

    def __init__(self, dict_fortune500, model="ProsusAI/finbert", tokenizer="ProsusAI/finbert"):
        """
        Initializes the AI pipeline components.

        Parameters:
        - dict_fortune500 (dict): A dictionary mapping company names to their stock tickers.
        - model (str): HuggingFace model name for sentiment analysis.
        - tokenizer (str): HuggingFace tokenizer name.
        """
        # Normalize all company names in the input dictionary for fuzzy matching
        self.dict_fortune500 = {self.__normalize_text__(k): v for k, v in dict_fortune500.items()}

        # Stores prediction scores for each matched organization
        self.predictions = defaultdict(float)

        # Load spaCy's small English model for NER
        self.nlp_ner = spacy.load("en_core_web_sm")

        # Load tokenizer for input preprocessing
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)

        # Create a text-classification pipeline for sentiment prediction using FinBERT
        self.pipe = pipeline("text-classification", model=model)

        # Lock to ensure thread-safe updates of prediction scores
        self.lock = threading.Lock()

    def __normalize_text__(self, text):
        """
        Normalize input text by lowercasing and stripping special characters.

        Parameters:
        - text (str): Input string to normalize.

        Returns:
        - str: Cleaned and normalized string.
        """
        return ''.join(e for e in text.lower() if e.isalnum() or e.isspace()).strip()

    def __match_organization__(self, extracted_org, threshold=80):
        """
        Matches an extracted organization name to the closest Fortune 500 company name.

        Parameters:
        - extracted_org (str): The name detected by NER.
        - threshold (int): Minimum fuzzy match score to consider a valid match.

        Returns:
        - str | None: The matching company's ticker symbol, or None if no match exceeds threshold.
        """
        extracted_org_norm = self.__normalize_text__(extracted_org)

        # Use fuzzy string matching to find the closest company name
        best_match, score = process.extractOne(
            extracted_org_norm,
            list(self.dict_fortune500.keys()),
            scorer=fuzz.token_sort_ratio
        )

        # Return ticker symbol if the match is strong enough
        if score >= threshold:
            return self.dict_fortune500[best_match]
        else:
            return None

    def __extract_entities__(self, text):
        """
        Extracts organization names and estimates a time frame from the input text.

        Uses:
        - spaCy's NER model for organizations.
        - dateparser to extract and classify any temporal reference.

        Parameters:
        - text (str): The input text to analyze.

        Returns:
        - orgs (list): List of unique organization names detected.
        - time_frame (str): One of ['Short-term', 'Medium-term', 'Long-term', 'Uncertain'].
        """
        doc = self.nlp_ner(text)  # Apply NER model
        orgs = set()              # Set to store unique organization names
        time_frame = "Uncertain"  # Default classification

        for ent in doc.ents:
            if ent.label_ == "ORG":
                orgs.add(ent.text)

        parsed_date = dateparser.parse(text)

        if parsed_date:
            current_date = datetime.today()

            if parsed_date < current_date + timedelta(days=7):
                time_frame = "Short-term"
            elif parsed_date < current_date + timedelta(days=365):
                time_frame = "Medium-term"
            else:
                time_frame = "Long-term"

        return list(orgs), time_frame

    def __preprocess_text__(self, text, max_tokens=512):
        """
        Cleans and tokenizes the input text, preparing it for model input.

        Parameters:
        - text (str): Raw text to preprocess.
        - max_tokens (int): Maximum number of tokens allowed by the model.

        Returns:
        - str: Preprocessed and truncated text.
        """
        if not text:
            return ""

        # Remove URLs and extra spaces
        text = re.sub(r'\s+', ' ', text).strip()
        text = re.sub(r'http\S+', '', text)

        # Tokenize and truncate
        tokens = self.tokenizer.tokenize(text)
        tokens = tokens[:max_tokens]

        return self.tokenizer.convert_tokens_to_string(tokens)

    def __update_predictions__(self, orgs, result):
        """
        Safely updates the prediction scores for the matched organizations.

        Parameters:
        - orgs (list): List of matched organization ticker symbols.
        - result (dict): Sentiment prediction result from FinBERT.
        """
        with self.lock:
            if result['label'] == 'positive':
                for org in orgs:
                    self.predictions[org] += result['score']
            elif result['label'] == 'negative':
                for org in orgs:
                    self.predictions[org] -= result['score']

    def predict(self, posts):
        """
        Main prediction pipeline:
        - Preprocess text.
        - Extract organizations and time frame.
        - Match organizations to Fortune 500 tickers.
        - Predict sentiment using FinBERT.
        - Update prediction scores.

        Parameters:
        - posts (str): Input text content.
        """
        if len(posts) < 10:
            return  # Skip very short texts

        cleaned_text = self.__preprocess_text__(posts)
        entities, timeframe = self.__extract_entities__(cleaned_text)

        if len(entities) < 1:
            return  # No organization detected

        orgs = []
        for i in entities:
            matched_ticker = self.__match_organization__(i, 70)
            if matched_ticker:
                orgs.append(matched_ticker)

        if len(orgs) > 0:
            # Ensure text length is valid for the model
            if "Token indices sequence length is longer than the specified maximum sequence length for this model" in cleaned_text:
                return  # Invalid input for the model

            results = self.pipe(cleaned_text[:512])
            self.__update_predictions__(orgs, results[0])  # Thread-safe score update


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/paulbabu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/paulbabu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [113]:
import praw
from datetime import datetime, timedelta, timezone
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests

class Predictor:
    """
    Predictor class to fetch financial-related Reddit posts and news articles,
    and predict sentiment or relevance using a machine learning model.
    """

    def __init__(self, companies={'Apple Inc.': 'AAPL'}, model="ProsusAI/finbert", tokenizer="ProsusAI/finbert"):
        """
        Initialize the Predictor object.

        Parameters:
        - companies (dict): Mapping of company names to their stock tickers.
        - model (str): Pretrained model identifier.
        - tokenizer (str): Pretrained tokenizer identifier.
        """
        self.companies = companies
        self.model = model
        self.tokenizer = tokenizer

    def __create_Model__(self):
        """
        Helper method to create a new instance of the prediction Model.

        Returns:
        - Model: An instance of the prediction model.
        """
        return Model(self.companies, self.model, self.tokenizer)

    def get_reddit_predictions(
        self,
        min_upvotes=5,
        subreddits=['wallstreetbets', 'stocks', 'investing', 'options'],
        start_date=None,
        end_date=None,
        CLIENT_ID="DrxLYbhcf1pudFwcNsV2Hw",
        CLIENT_SECRET="xVAig1oKsm1ET7NkNxiglGjpDFy6_g",
        USER_AGENT="my_reddit_stock_scraper v1.0 (by u/Funny_Low1871)"
    ):
        """
        Fetch Reddit posts from specified subreddits within a date range,
        filter by minimum upvotes, and run predictions on their text content.

        Parameters:
        - min_upvotes (int): Minimum number of upvotes a post must have to be considered.
        - subreddits (list): List of subreddit names to search.
        - start_date (str): Start date in 'YYYY-MM-DD' format (default is yesterday).
        - end_date (str): End date in 'YYYY-MM-DD' format (default is now).
        - CLIENT_ID (str): Reddit API client ID (required).
        - CLIENT_SECRET (str): Reddit API client secret (required).
        - USER_AGENT (str): Reddit API user agent (required).

        Returns:
        - dict: Predictions made by the model.
        """
        redditModel = self.__create_Model__()

        # Handle default start and end dates
        if start_date is None:
            start_dt = datetime.utcnow().date() - timedelta(days=1)
            start_dt = datetime.combine(start_dt, datetime.min.time())
        else:
            start_dt = datetime.strptime(start_date, "%Y-%m-%d")

        if end_date is None:
            end_dt = datetime.utcnow()
        else:
            end_dt = datetime.strptime(end_date, "%Y-%m-%d") + timedelta(days=1)

        start_time = int(start_dt.timestamp())
        end_time = int(end_dt.timestamp())

        # Reddit API credentials (replace with your own if needed)
        CLIENT_ID = CLIENT_ID
        CLIENT_SECRET = CLIENT_SECRET
        USER_AGENT = USER_AGENT

        # Initialize Reddit API client using PRAW
        reddit = praw.Reddit(
            client_id=CLIENT_ID,
            client_secret=CLIENT_SECRET,
            user_agent=USER_AGENT
        )

        def process_subreddit(name):
            """
            Fetches and processes posts from a single subreddit.

            Parameters:
            - name (str): Name of the subreddit.
            """
            count = 0
            subreddit = reddit.subreddit(name)

            for post in subreddit.new(limit=1000):
                post_time = int(post.created_utc)

                if post_time < start_time:
                    break  # Posts older than start_time are skipped

                count += 1

                # Apply date and upvote filter
                if start_time <= post_time <= end_time and post.score >= min_upvotes:
                    post_data = {"text": f"{post.title} {post.selftext}"}
                    redditModel.predict(post_data["text"])  # Predict sentiment or relevance

            print(f"Processed {count} posts from subreddit: {name}")

        # Multi-threaded processing of subreddits
        with ThreadPoolExecutor(max_workers=len(subreddits)) as executor:
            future_to_subreddit = {executor.submit(process_subreddit, name): name for name in subreddits}

        for future in as_completed(future_to_subreddit):
            subreddit = future_to_subreddit[future]
            try:
                future.result()
            except Exception as e:
                print(f"A thread raised an exception in subreddit '{subreddit}': {e}")

        return redditModel.predictions

    def predict_from_news(
            self,
            api_key="80a4b4a81d11446d8640c1d38ccb7051",
            query="stocks",
            from_date=None,
            to_date=None
    ):
        """
        Fetch news articles using NewsAPI and predict sentiment or relevance.

        Parameters:
        - api_key (str): Your NewsAPI authentication key.
        - query (str): Search keyword for news articles.
        - from_date (str): Start date in 'YYYY-MM-DD' format (default: 24 hours ago).
        - to_date (str): End date in 'YYYY-MM-DD' format (default: today).

        Returns:
        - dict: Predictions made by the model.
        """
        newsModel = self.__create_Model__()

        # Set default date range: last 24 hours
        if to_date is None:
            to_date = datetime.utcnow().date().isoformat()

        if from_date is None:
            from_date = (datetime.utcnow() - timedelta(hours=24)).date().isoformat()

        url = "https://newsapi.org/v2/everything"
        params = {
            'q': query,
            'from': from_date,
            'to': to_date,
            'sortBy': 'publishedAt',
            'language': 'en',
            'apiKey': api_key
        }

        response = requests.get(url, params=params)

        if response.status_code == 200:
            articles = response.json().get("articles", [])
            if not articles:
                print(f"No news articles found for query '{query}' between {from_date} and {to_date}.")
            else:
                for i, article in enumerate(articles, 1):
                    newsModel.predict(article["title"])  # Predict sentiment or relevance
        else:
            print(f"Error {response.status_code}: {response.json().get('message')}")

        return newsModel.predictions


In [114]:
import yfinance as yf
from datetime import datetime, timedelta

class StockEvaluator:
    
    def fetch_stock_data(self, ticker: str, date: str):
        """
        Fetches the stock data (open, close, and price change) for a given stock ticker on a specific date.

        Args:
            ticker (str): Stock ticker symbol (e.g., 'AAPL', 'TSLA').
            date (str): Date in 'YYYY-MM-DD' format.

        Returns:
            dict: Contains open price, close price, and the change in price.
                  Returns an error message if no data is found.
        """
        # Create a Ticker object using the yfinance API
        stock = yf.Ticker(ticker)
        
        # Prepare the date range to query the Yahoo Finance API
        start_date = date
        end_date = (datetime.strptime(date, "%Y-%m-%d") + timedelta(days=1)).strftime("%Y-%m-%d")
        
        # Fetch historical stock data from Yahoo Finance
        data = stock.history(start=start_date, end=end_date)

        # Check if no data was found (e.g., market may be closed)
        if data.empty:
            return {"error": f"No data found for {ticker} on {date} (market may have been closed)."}

        # Extract open, close, and price change values
        open_price = data['Open'].iloc[0]
        close_price = data['Close'].iloc[0]
        price_change = close_price - open_price

        # Return the stock data
        return {
            "ticker": ticker,
            "date": date,
            "open": round(open_price, 2),
            "close": round(close_price, 2),
            "change": round(price_change, 2)
        }
    
    def evaluate_predictions(self, predictions: dict, date: str):
        """
        Evaluates stock predictions by fetching actual stock data for the given date 
        and compares it to the predicted sentiment scores.

        Args:
            predictions (dict): A dictionary where keys are stock tickers and values are predicted scores.
            date (str): The date in 'YYYY-MM-DD' format.

        Prints the actual stock data and compares it to the predicted score.
        """
        for ticker, prediction in predictions.items():
            # Get actual stock data for the ticker on the specified date
            result = self.fetch_stock_data(ticker, date)
            
            # Print predicted score and the actual stock data
            print(f"\nTicker: {ticker}")
            print(f"Predicted Score: {prediction}")
            print(f"Stock Data: {result}")
    
    def evaluate_sentiment_accuracy(self, predictions: dict, date: str):
        """
        Evaluates the sentiment classification predictions against actual stock price movements 
        and calculates accuracy, precision, recall, and F1 score.

        Args:
            predictions (dict): A dictionary where keys are stock tickers and values are predicted sentiment scores.
            date (str): The date in 'YYYY-MM-DD' format.

        Returns:
            dict: A dictionary containing accuracy, precision, recall, and F1 score.
        """
        # Initialize counters for the confusion matrix
        TP = TN = FP = FN = 0

        # Iterate through the predictions and calculate the confusion matrix
        for ticker, prediction in predictions.items():
            result = self.fetch_stock_data(ticker, date)
            
            # Ensure the result contains valid price change data
            if result and 'change' in result:
                predicted_up = prediction >= 0  # Predicted sentiment (up or down)
                actual_up = result['change'] >= 0  # Actual stock movement (up or down)

                # Update the confusion matrix
                if predicted_up and actual_up:
                    TP += 1  # True positive
                elif predicted_up and not actual_up:
                    FP += 1  # False positive
                elif not predicted_up and actual_up:
                    FN += 1  # False negative
                else:
                    TN += 1  # True negative

                # Print prediction result for each ticker
                status = "✅ Correct" if predicted_up == actual_up else "❌ Wrong"
                print(f"{ticker}: Predicted={prediction:.2f}, Actual Change={result['change']:.2f} --> {status}")

        # Calculate performance metrics
        total = TP + TN + FP + FN
        if total == 0:
            print("\nNo valid stock data to evaluate.")
            return

        accuracy = (TP + TN) / total
        precision = TP / (TP + FP) if TP + FP else 0
        recall = TP / (TP + FN) if TP + FN else 0
        f1_score = 2 * precision * recall / (precision + recall) if precision + recall else 0

        # Return the evaluation metrics
        return {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1_score
        }


In [115]:
import matplotlib.pyplot as plt

class Plotter:
    def __init__(self):
        self.dates = []
        self.predictions = []
        self.actual_changes = []

    def update_data(self, date: str, prediction: float, actual_change: float):
        self.dates.append(date)
        self.predictions.append(prediction)
        self.actual_changes.append(actual_change)

    def plot_comparison(self):
        """
        Plots a comparison between the predicted stock sentiment and actual stock price changes.
        """
        plt.figure(figsize=(10, 6))
        plt.plot(self.dates, self.predictions, label='Predicted Sentiment', marker='o')
        plt.plot(self.dates, self.actual_changes, label='Actual Stock Change', marker='x')
        plt.xlabel('Date')
        plt.ylabel('Sentiment / Stock Change')
        plt.title('Predicted Sentiment vs Actual Stock Change')
        plt.legend()
        plt.grid(True)
        plt.show()

In [None]:
import pandas as pd
def main():
    # Load the Fortune 500 data
    fortune_500_companies = pd.read_csv('fortune_500_list.csv')
    fortune_500_companies["Company Name"] = fortune_500_companies["Security"]
    dict_fortune500 = dict(zip(fortune_500_companies['Company Name'], fortune_500_companies['Symbol']))
    subreddits = [
    'wallstreetbets', 'stocks', 'investing', 'options',
    'StockMarket', 'pennystocks', 'securityanalysis', 'ValueInvesting',
    'dividends', 'RobinHood', 'Daytrading', 'Forex', 'quantfinance',
    'FinancialIndependence', 'SPACs', 'personalfinance', 'ETF',
    'algotrading', 'StockNews'
]
    # Get the last two days
    today = datetime.today().date()
    yesterday = str(today - timedelta(days=1))
    two_days_ago = str(today - timedelta(days=2))
    today= str(today)
    print(f"Today: {today}, Yesterday: {yesterday}, Two days ago: {two_days_ago}")
    # Set up the Evaluator and Plotter
    evaluator = StockEvaluator()
    plotter = Plotter()
    # Get Reddit predictions    
    predictor= Predictor(companies=dict_fortune500)

    
    reddit_predictions1 = predictor.get_reddit_predictions(min_upvotes=5,subreddits=subreddits, start_date=two_days_ago, end_date=yesterday)
    news_predictions1 = predictor.predict_from_news(from_date=two_days_ago, to_date=yesterday)

    # reddit_predictions2 = predictor.get_reddit_predictions(min_upvotes=5,subreddits=subreddits, start_date=yesterday, end_date=today)
    # news_predictions2 = predictor.predict_from_news(from_date=yesterday, to_date=today)
    
    results_reddit = evaluator.evaluate_sentiment_accuracy(reddit_predictions1, today)
    results_news = evaluator.evaluate_sentiment_accuracy(news_predictions1, today)

    # Print the results     
    print("\nReddit Predictions Results:")
    print(results_reddit)
    print("\nNews Predictions Results:")
    print(results_news)
main()



Today: 2025-04-15, Yesterday: 2025-04-14, Two days ago: 2025-04-13


Device set to use mps:0


Processed 0 posts from subreddit: ETF
Processed 2 posts from subreddit: SPACs
Processed 0 posts from subreddit: StockNews


Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors


Processed 2 posts from subreddit: securityanalysis
Processed 6 posts from subreddit: RobinHood
Processed 66 posts from subreddit: Forex
Processed 6 posts from subreddit: FinancialIndependence
Processed 38 posts from subreddit: quantfinance
Processed 15 posts from subreddit: algotrading
Processed 107 posts from subreddit: dividends
Processed 45 posts from subreddit: pennystocks
Processed 51 posts from subreddit: options
Processed 49 posts from subreddit: ValueInvesting
Processed 78 posts from subreddit: wallstreetbets
Processed 235 posts from subreddit: Daytrading
Processed 106 posts from subreddit: investing
Processed 102 posts from subreddit: StockMarket
Processed 489 posts from subreddit: personalfinance
Processed 138 posts from subreddit: stocks


Device set to use mps:0
Device set to use mps:0


Processed 0 posts from subreddit: ETF


Token indices sequence length is longer than the specified maximum sequence length for this model (714 > 512). Running this sequence through the model will result in indexing errors


Processed 0 posts from subreddit: StockNews
Processed 1 posts from subreddit: securityanalysis
Processed 6 posts from subreddit: RobinHood
Processed 2 posts from subreddit: SPACs
Processed 2 posts from subreddit: FinancialIndependence
Processed 47 posts from subreddit: Forex
Processed 10 posts from subreddit: algotrading
Processed 77 posts from subreddit: dividends
Processed 30 posts from subreddit: quantfinance
Processed 37 posts from subreddit: pennystocks
Processed 31 posts from subreddit: ValueInvesting
Processed 69 posts from subreddit: investing
Processed 169 posts from subreddit: Daytrading
Processed 43 posts from subreddit: options
Processed 62 posts from subreddit: StockMarket
Processed 315 posts from subreddit: personalfinance
Processed 59 posts from subreddit: wallstreetbets
Processed 98 posts from subreddit: stocks


Device set to use mps:0


VST: Predicted=0.84, Actual Change=2.49 --> ✅ Correct
FDX: Predicted=-0.88, Actual Change=-2.59 --> ✅ Correct
CCI: Predicted=-0.90, Actual Change=0.53 --> ❌ Wrong
TECH: Predicted=0.88, Actual Change=-1.19 --> ❌ Wrong
NVDA: Predicted=0.89, Actual Change=1.23 --> ✅ Correct
NDAQ: Predicted=0.05, Actual Change=0.22 --> ✅ Correct
WFC: Predicted=0.77, Actual Change=1.10 --> ✅ Correct
ON: Predicted=0.77, Actual Change=0.10 --> ✅ Correct
UAL: Predicted=0.77, Actual Change=1.35 --> ✅ Correct
MTB: Predicted=0.77, Actual Change=0.94 --> ✅ Correct
MS: Predicted=0.77, Actual Change=0.35 --> ✅ Correct
JNJ: Predicted=0.77, Actual Change=-2.08 --> ❌ Wrong
NFLX: Predicted=0.77, Actual Change=26.28 --> ✅ Correct
BAC: Predicted=0.77, Actual Change=0.19 --> ✅ Correct
BX: Predicted=-0.82, Actual Change=3.11 --> ❌ Wrong
HII: Predicted=-0.66, Actual Change=-0.68 --> ✅ Correct
AMT: Predicted=-1.81, Actual Change=-1.27 --> ✅ Correct
AAPL: Predicted=0.90, Actual Change=0.29 --> ✅ Correct
TKO: Predicted=-0.67, A