In [108]:
# run predictions on individual stocks
from openai import OpenAI
import os
import json
from datetime import datetime, timezone, timedelta, time as dt_time
import pandas as pd
from google.cloud import storage
import uuid
import traceback
from google.cloud import storage
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import time
from dotenv import load_dotenv

load_dotenv()
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./svc_acc_key.json"
MONGO_URI=os.environ["MONGO_URI"]
OPENAI_API_KEY=os.environ["OPENAI_API_KEY"]
STORAGE_BUCKET=os.environ["STORAGE_BUCKET"]

model = "gpt-4o-mini"
MAX_LEN_WORDS_PER_REQ = 60000
db = None


In [109]:
storage_client = storage.Client.from_service_account_json(os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
bucket = storage_client.get_bucket(STORAGE_BUCKET)

In [110]:
client = MongoClient(MONGO_URI, server_api=ServerApi('1'))
db = client.get_database()

def get_db():
    return db

In [111]:
def collect_saved_articles_from_storage(scrapes):
    articles = []
    for scrape in scrapes:
        if 'bucket_key' not in scrape:
            # print("Bucket key not found")
            continue

        key = scrape['bucket_key']
        blob = bucket.blob(key)
        content = blob.download_as_string()
        if not content:
            print("Couldn't get article content from bucket")
            continue

        if len(content) < 100:
            print("article is too short, skipping")
            continue 
        
        try:
        # Try converting the content to a JSON object
            json_content = json.loads(content)
            articles.append(json_content)
        except Exception as e:
            # If conversion fails, just skip and continue
            continue
    return articles

In [112]:
def save_openai_resp_as_csv(df, df_filtered, run_id, lookback):   
    cur_time = datetime.now(timezone.utc)     
    key = f"predictions/{run_id}/{lookback}_predictions.csv"
    csv_data = df.to_csv(index=False)

    # Create a blob (the object in GCS)
    blob = bucket.blob(key)

    # Upload the CSV string as a file to the bucket
    blob.upload_from_string(csv_data, content_type='text/csv')
    db = get_db()
    db["predictions"].insert_one({
        "bucket_key": key,
        "lookback": lookback,
        "predicted_at": cur_time,
    })

    key_filtered = f"predictions/{run_id}/{lookback}_predictions_filtered.csv"
    blob_filtered = bucket.blob(key_filtered)
    csv_data_filtered = df_filtered.to_csv(index=False)
    blob_filtered.upload_from_string(csv_data_filtered, content_type='text/csv')
    print(f"saved {lookback}_predictions.csv")

def generate_analysis_for_article(stock_sym, article):
    client = OpenAI()
    system_content = f"""
    For the given article, predict if {stock_sym} will rise in the next trading window.
    If the article does not mention anything about {stock_sym}, return an answer of NA.
    Otherwise, return a YES or NO. The only acceptable responses are YES, NO or NA.
    """
    
    completion = client.chat.completions.create(
    model=model,
    messages=[
            {"role": "system", "content": system_content},
            {
                "role": "user",
                "content": article
            }
        ]
    )

    resp = completion.choices[0].message.content
    return resp

In [113]:
def get_stocks_list(run_id, lookback): 
    cur_time = datetime.now(timezone.utc)

    print(f"[predict] Get stocks list with lookback {lookback}, from time {cur_time}")

    lookback_from = cur_time - timedelta(hours=lookback) # change this to 6 hours lookback
    dup_articles_set = set()

    db = get_db()
    collection = db['scrapes']
    stock_prices_col = db["stock_prices"]

    # get everything from the run id that has been published in the last {lookback} hours
    recent_scrapes = collection.find({"run_id": run_id, "published_at": {"$gte": lookback_from}})

    recent_scrapes_dict = {}
    for scrape in recent_scrapes:
        if 'url' not in scrape:
            print("url is not in scrape")
            continue

        url_key = scrape['url']
        if url_key in dup_articles_set:
            continue 

        dup_articles_set.add(url_key)

        if "stock" not in scrape:
            continue
  
        stock = scrape['stock'].lower()
        if stock not in recent_scrapes_dict:
            recent_scrapes_dict[stock] = {
                "scrapes": []
            }
            del scrape["stock"]

        stock_data = recent_scrapes_dict[stock]
        stock_data["scrapes"].append(scrape)
        # recent_scrapes_dict[stock].append(scrape)
    
    # stock_prices = stock_prices_col.find({"run_id": run_id})
    # for sp in stock_prices:
    #     if not all(key in sp for key in ["stock", "pre_market_price", "prev_close"]):
    #         continue
            
    #     stock = sp["stock"]
    #     if stock not in recent_scrapes_dict:
    #         continue 
            
    #     pre_market_price = sp["pre_market_price"]
    #     prev_close = sp["prev_close"] = sp["prev_close"]
        
    #     recent_scrapes_dict[stock]["pre_market_price"] = pre_market_price
    #     recent_scrapes_dict[stock]["prev_close"] = prev_close
        
    print(f"[predict] found {len(dup_articles_set)} duplicate articles")
    return recent_scrapes_dict


In [114]:
def get_published_at(file_content_formatted, source):
    published_at = None
        
    if "published_at" in file_content_formatted:
        time_as_str = file_content_formatted["published_at"]
        if not time_as_str:
            print(f"time not found for {file_content_formatted["title"]}")
            return 

        try:
            published_at = datetime.strptime(time_as_str, '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=timezone.utc)
        except ValueError:
            # If that fails, try without milliseconds
            published_at = datetime.strptime(time_as_str, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)
    return published_at

In [115]:
def generate_analysis_for_stock(stock_sym, scrapes_for_stock):
    print(f"Generate analysis for stock {stock_sym}")
    # you need to get all the articles into an array based on the scrapes

    saved_articles = collect_saved_articles_from_storage(scrapes_for_stock)
    print(f"Got saved articles of length {len(saved_articles)} for stock {stock_sym}")

    all_responses = []
    for article in saved_articles:
        if "content" not in article:
            print("skipping openai, 'content' not found in article")
            continue 
        article_content = article['content']

        try:
            resp = generate_analysis_for_article(stock_sym, article_content)
            formatted_resp = resp.lower()
            if "yes" in formatted_resp:
                all_responses.append("YES")
            elif "no" in formatted_resp:
                all_responses.append("NO")
            else:
                all_responses.append("NA")
        except Exception as e:
            print(f"failure calling openai: {e}")
            
    return all_responses

In [116]:
def convert_rows_to_csv(rows):
    # Initialize a list to hold the rows for the DataFrame
    data = []
    
    # Process each key-value pair in the dictionary
    for symbol, values in rows.items():
        # Count occurrences of "YES", "NO", and "NA"
        yes_count = values.count('YES')
        no_count = values.count('NO')
        na_count = values.count('NA')
        
        # Append the data as a new row
        data.append([symbol, yes_count, no_count, na_count])
    
    # Create a DataFrame from the data
    df = pd.DataFrame(data, columns=['Symbol', 'YES', 'NO', 'NA'])
    # Sort the DataFrame by the "YES" column in descending order
    df.sort_values(by='YES', ascending=False, inplace=True)
    return df


In [117]:
def filter_and_order_df(df):
    # Filter rows where "YES" count is greater than 1 and greater than "NO" count
    filtered_df = df[(df['YES'] > 1) & (df['YES'] > df['NO'])].copy()
    
    # Calculate the difference and add it as a new column using .loc
    filtered_df.loc[:, 'YES_NO_DIFF'] = filtered_df['YES'] - filtered_df['NO']
    
    # Sort the DataFrame by the "YES_NO_DIFF" in descending order
    filtered_df.sort_values(by='YES_NO_DIFF', ascending=False, inplace=True)
    
    # Drop the "YES_NO_DIFF" column as it's not needed in the final output
    filtered_df.drop(columns=['YES_NO_DIFF'], inplace=True)
    
    return filtered_df

In [118]:
def run_analysis(run_id, lookback):
    stocks = get_stocks_list(run_id, lookback)
    if not stocks:
        print(f"[predict] not stocks found for prediction")
        return 

    print(f"[predict] creating prediction for {len(stocks)}")
    rows = {}
    current_time = datetime.now().astimezone(timezone.utc)

    for stock, stock_data in stocks.items():
        stock_sym = stock.lower()
        scrapes_for_stock = stock_data["scrapes"]
        
        responses = generate_analysis_for_stock(stock_sym, scrapes_for_stock)
        if not responses:
            continue
            
        rows[stock_sym] = responses
    df = convert_rows_to_csv(rows)
    df_filtered = filter_and_order_df(df)
    save_openai_resp_as_csv(df, df_filtered, run_id, lookback)

In [123]:
run_id = "c9eb6450-dafc-45c1-bce5-2cdfb81cc850"
lookback = 24
print("run_id: ", run_id)

run_id:  c9eb6450-dafc-45c1-bce5-2cdfb81cc850


In [124]:
def start(run_id, lookback):
    run_analysis(run_id, lookback)

In [125]:
def _start(run_id, lookback):

    current_time_utc = datetime.now(timezone.utc)
    print("Current time is ", current_time_utc)
    date_string_to_execute = "2024-10-08"

    # Convert to a datetime object
    date_object = datetime.strptime(date_string_to_execute, "%Y-%m-%d").date()
    target_time = datetime.combine(date_object, dt_time(5, 21, 0), timezone.utc)
    
    while current_time_utc < target_time:
        print("Waiting to launch...", current_time_utc)
        time.sleep(5)
        current_time_utc = datetime.now(timezone.utc)

    run_analysis(run_id, lookback)
    print("Now it's after", current_time_utc)
    print(current_time_utc)

In [126]:
start(run_id, lookback)

[predict] Get stocks list with lookback 24, from time 2024-10-08 23:31:51.690498+00:00
[predict] found 87 duplicate articles
[predict] creating prediction for 39
Generate analysis for stock bkr
Got saved articles of length 1 for stock bkr
Generate analysis for stock dltr
Got saved articles of length 1 for stock dltr
Generate analysis for stock mo
Got saved articles of length 1 for stock mo
Generate analysis for stock thc
Got saved articles of length 1 for stock thc
Generate analysis for stock oke
Got saved articles of length 1 for stock oke
Generate analysis for stock dxc
Got saved articles of length 2 for stock dxc
Generate analysis for stock ma
Got saved articles of length 1 for stock ma
Generate analysis for stock khc
Got saved articles of length 1 for stock khc
Generate analysis for stock acm
Got saved articles of length 3 for stock acm
Generate analysis for stock ip
Got saved articles of length 1 for stock ip
Generate analysis for stock v
Got saved articles of length 1 for stock v