In [1]:
import os
from datetime import datetime, timedelta

import pandas as pd
import requests

DT_FORMAT = '%Y-%m-%d'


def add_days(dt: str, days: int, dt_format: str = DT_FORMAT) -> str:
    return datetime.strftime(datetime.strptime(dt, dt_format) + timedelta(days=days), dt_format)


def get_klines(symbol: str, start_date: str, end_date: str, interval: str = "1d") -> pd.DataFrame:
    sdt = int(datetime.strptime(start_date, DT_FORMAT).timestamp() * 1000)
    edt = int(datetime.strptime(end_date, DT_FORMAT).timestamp() * 1000)

    params = {
        "symbol": symbol,
        "interval": interval,
        "startTime": sdt,
        "endTime": edt
    }

    klines = requests.get('https://api.binance.com/api/v3/klines', params=params)
    klines_df = pd.DataFrame(klines.json())

    klines_df.columns = ['open_time', 'open', 'high', 'low', 'close', 'volume', 'close_time', 'quote_asset_volume',
                         'number_of_trades', 'taker_buy_base_asset_volume', 'taker_buy_quote_asset_volume', 'ignore']
    klines_df['open_time'] = pd.to_datetime(klines_df['open_time'], unit='ms')
    klines_df['close_time'] = pd.to_datetime(klines_df['close_time'], unit='ms')
    for col in ['open', 'high', 'low', 'close', 'volume']:
        klines_df[col] = klines_df[col].apply(lambda x: float(x))

    klines_df = klines_df[['open_time', 'close_time', 'volume', 'open', 'close']]
    klines_df['diff'] = klines_df['close'] - klines_df['open']

    return klines_df


def get_crypto_news_from_cryptopanic(
        keyword: str,
        start_date: str,
        end_date: str,
        kind: str = 'news',
        filter: str = 'important',
) -> pd.DataFrame:
    url = 'https://cryptopanic.com/api/v1/posts/'
    params = {
        'auth_token': os.environ["CRYPTOPANIC_API_KEY"],
        'currencies': keyword,
        'kind': kind,
        'filter': filter
    }

    data = requests.get(url, params=params).json()
    result = data['results']

    for i in range(data['count']):
        url = data['next']

        if url is None:
            break

        data = requests.get(url).json()
        result.extend(data['results'])

    news_df = pd.DataFrame(result)
    for col in ['positive', 'important', 'liked']:
        news_df[col] = news_df['votes'].apply(lambda x: x[col])

    news_df = news_df[['published_at', 'title', 'positive', 'important', 'liked']]
    news_df['published_at'] = pd.to_datetime(news_df['published_at']).dt.tz_convert(None)

    news_df['date'] = news_df['published_at'].apply(lambda x: str(x)[:10])
    news_df = news_df.drop(columns=['published_at'])
    news_df = news_df[(start_date <= news_df['date']) & (news_df['date'] <= end_date)]

    return news_df.sort_values(['date', 'important'], ascending=False)


In [4]:
tiket = "BTC"

end_date = '2025-04-17'
start_date = add_days(end_date, -7)
news_df = get_crypto_news_from_cryptopanic(tiket, start_date, end_date)
klines_df = get_klines(tiket + 'USDT', start_date, end_date)

In [5]:
news_df

Unnamed: 0,title,positive,important,liked,date
1,Ethereum Is What Bitcoin Was Meant to Be,8,5,8,2025-04-16
0,Family offices show stronger preference for Et...,5,3,6,2025-04-16
2,"MicroStrategy Acquires 3,459 Bitcoins for $285...",8,5,6,2025-04-14
3,Saylor signals new Bitcoin buy after Strategy ...,6,4,5,2025-04-13
4,Sweden: proposal to include Bitcoin in nationa...,16,5,8,2025-04-11
5,"Crypto Market Rises Sharply, but Experts Warn ...",0,4,0,2025-04-10
6,Charles Hoskinson sees Bitcoin hitting $250K a...,5,3,4,2025-04-10


In [6]:
klines_df

Unnamed: 0,open_time,close_time,volume,open,close,diff
0,2025-04-10,2025-04-10 23:59:59.999,33284.80718,82615.22,79607.3,-3007.92
1,2025-04-11,2025-04-11 23:59:59.999,34435.43797,79607.3,83423.84,3816.54
2,2025-04-12,2025-04-12 23:59:59.999,18470.74437,83423.83,85276.9,1853.07
3,2025-04-13,2025-04-13 23:59:59.999,24680.04181,85276.91,83760.0,-1516.91
4,2025-04-14,2025-04-14 23:59:59.999,28659.09348,83760.0,84591.58,831.58
5,2025-04-15,2025-04-15 23:59:59.999,20910.99528,84591.58,83643.99,-947.59
6,2025-04-16,2025-04-16 23:59:59.999,20867.24519,83643.99,84030.38,386.39


In [9]:
import os
import openai
from openai import OpenAI
import numpy as np

openai.api_key = os.environ["OPENAI_API_KEY"]

client = OpenAI()

response = client.completions.create(
    model="gpt-3.5-turbo-instruct",
    prompt="write strictly 'Yes' or 'No' to the following question: Will bitcoin exceed $80k today?\nAnswer: ",
    max_tokens=5,
    temperature=0,
    stream=False,
    logprobs=20,
)

In [29]:
def get_probability_from_openai(
        prompt,
        model='gpt-3.5-turbo-instruct',
        temperature=0,
        max_tokens=5,
        logprobs=20,
):
    
    client = OpenAI()
    response = client.completions.create(
        model=model,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        stream=False,
        logprobs=logprobs,
    )
    return response


def extract_yes_no_logits_and_softmax(response):
    yes_logits = []
    no_logits = []
    yes_tokens = []
    no_tokens = []

    top_logprobs = response.choices[0].logprobs.top_logprobs[0]
    for token, logit in top_logprobs.items():
        processed_token = token.lower().strip()
        if processed_token == 'yes':
            yes_logits.append(logit)
            yes_tokens.append(token)
        elif processed_token == 'no':
            no_logits.append(logit)
            no_tokens.append(token)

    relevant_logits = np.array(yes_logits + no_logits)

    if relevant_logits.size == 0:
        return Exception("No relevant logits found for 'yes'S or 'no' tokens.")

    exp_logits = np.exp(relevant_logits - np.max(relevant_logits))
    softmax_probs = exp_logits / exp_logits.sum()

    yes_prob_sum = float(np.sum(softmax_probs[:len(yes_logits)])) if yes_logits else 0.0
    no_prob_sum = float(np.sum(softmax_probs[len(yes_logits):])) if no_logits else 0.0

    return {
        "yes_tokens_found": yes_tokens,
        "no_tokens_found": no_tokens,
        "yes_prob_sum": yes_prob_sum,
        "no_prob_sum": no_prob_sum,
        "softmax_probs_yes_no": softmax_probs.tolist()
    }


def create_prompt(question, news_df, klines_df) -> str:
    return """
    Use following data to answer on question:
    {df_str}

    Write strictly 'Yes' or 'No' to the following question: Will be {question} \nAnswer:
    """.format(
        df_str='\n\n'.join([news_df.to_markdown(), klines_df.to_markdown()]),
        question=question
    )
    

In [31]:
response = get_probability_from_openai(
    prompt=create_prompt("Bitcoin Up or Down on April 17", news_df, klines_df),
    model="gpt-3.5-turbo-instruct",
    temperature=0,
    max_tokens=5,
    logprobs=20,
)

response

Completion(id='cmpl-BO2U222F59qBi9bVczqPSzTHNfyxI', choices=[CompletionChoice(finish_reason='stop', index=0, logprobs=Logprobs(text_offset=[2613, 2614], token_logprobs=[-1.1950037, -0.47618958], tokens=['\n', 'Yes'], top_logprobs=[{'\n': -1.1950037, 'Yes': -1.5026782, ' Yes': -1.6873782, 'No': -2.3182929, ' No': -2.3688662, '\n\n': -3.4151251, " '": -4.3528595, ' YES': -5.1174183, 'YES': -5.2452717, "'": -5.4385214, ' NO': -5.6083364, 'NO': -6.0426416, ' yes': -6.16891, '\n\n\n\n': -6.2235107, 'Down': -6.4152784, '\t': -6.435355, '<|endoftext|>': -6.608536, ' Down': -6.6215734, ' ': -6.6562214, ' Up': -6.872181}, {'Yes': -0.47618958, 'No': -1.1274023, 'Down': -4.3149824, '   ': -4.958747, ' Yes': -5.1461153, 'Up': -5.2175207, ' No': -6.0258493, 'It': -6.363839, 'YES': -6.3905897, 'NO': -6.621976, "'": -6.6330805, '\n': -6.888633, 'The': -6.9108496, 'I': -7.024621, '    ': -7.3302107, '       ': -7.4851522, 'Cannot': -7.493106, 'Based': -7.6832037, 'Unknown': -7.906658, 'yes': -7.957679

In [32]:
extract_yes_no_logits_and_softmax(response)

{'yes_tokens_found': ['Yes', ' Yes', ' YES', 'YES', ' yes'],
 'no_tokens_found': ['No', ' No', ' NO', 'NO'],
 'yes_prob_sum': 0.679997119112075,
 'no_prob_sum': 0.320002880887925,
 'softmax_probs_yes_no': [0.35952474315135635,
  0.2988922083976423,
  0.0096798160824502,
  0.008518067546816032,
  0.0033822839338102,
  0.15904199730888186,
  0.15119872049290398,
  0.005924668886716314,
  0.0038374941994227885]}

In [None]:
# Benchmark

In [None]:
tiket = "BTC"

end_date = '2025-04-17'
start_date = add_days(end_date, -7)
news_df = get_crypto_news_from_cryptopanic(tiket, start_date, end_date)
klines_df = get_klines(tiket + 'USDT', start_date, end_date)

response = get_probability_from_openai(
    prompt=create_prompt("Bitcoin Up or Down on April 17", news_df, klines_df),
    model="gpt-3.5-turbo-instruct",
    temperature=0,
    max_tokens=5,
    logprobs=20,
)

extract_yes_no_logits_and_softmax(response)

In [36]:
from tqdm import tqdm

In [40]:
tiket = "BTC"
days = range(1, 22)
end_date_init = "2025-03-31"
result = []


for d in tqdm(days):
    end_date = add_days(end_date_init, d - 1)
    start_date = add_days(end_date, -7)

    news_df = get_crypto_news_from_cryptopanic(tiket, start_date, end_date)
    klines_df = get_klines(tiket + 'USDT', start_date, end_date)

    response = get_probability_from_openai(
        prompt=create_prompt(f"Bitcoin Up or Down on April {d}", news_df, klines_df),
        model="gpt-3.5-turbo-instruct",
        temperature=0,
        max_tokens=5,
        logprobs=20,
    )

    r = extract_yes_no_logits_and_softmax(response)
    r['day'] = f'april-{d}'

    result.append(r)

len(result)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [01:40<00:00,  4.78s/it]


21

In [45]:
y = {
 'april-1': 'Down',
 'april-2': 'Up',
 'april-3': 'Up',
 'april-4': 'Down',
 'april-5': 'Up',
 'april-6': 'Up',
 'april-7': 'Up',
 'april-8': 'Up',
 'april-9': 'Up',
 'april-10': 'Down',
 'april-11': 'Down',
 'april-12': 'Down',
 'april-13': 'Up',
 'april-14': 'Down',
 'april-15': 'Up',
 'april-16': 'Up',
 'april-17': 'Up',
 'april-18': 'Down',
 'april-19': 'Down',
 'april-20': 'Up',
 'april-21': 'Down'
}

result_df = pd.DataFrame(result)
result_df['y'] = result_df['day'].apply(lambda x: y[x])
result_df

Unnamed: 0,yes_tokens_found,no_tokens_found,yes_prob_sum,no_prob_sum,softmax_probs_yes_no,day,y
0,"[ Yes, Yes, YES, yes, YES]","[ No, No, NO]",0.645179,0.354821,"[0.4354424578370576, 0.19340746066137687, 0.00...",april-1,Down
1,"[Yes, Yes, YES, YES, yes]","[No, No, NO, NO]",0.681327,0.318673,"[0.3469003954414283, 0.31357297054712396, 0.00...",april-2,Up
2,"[ Yes, Yes, YES, YES, yes]","[ No, No, NO, NO]",0.712121,0.287879,"[0.3546469891113959, 0.34215532405521615, 0.00...",april-3,Up
3,"[ Yes, Yes, YES, yes]","[ No, No, NO, NO]",0.428563,0.571437,"[0.2727432789007448, 0.14604105523701125, 0.00...",april-4,Down
4,"[ Yes, Yes, YES, YES, yes]","[ No, No, NO, NO]",0.506604,0.493396,"[0.26781782215558586, 0.2241503610355936, 0.00...",april-5,Up
5,"[ Yes, Yes, YES, YES]","[ No, No, NO, NO]",0.643891,0.356109,"[0.3940595667426864, 0.23516866556173618, 0.01...",april-6,Up
6,"[ Yes, Yes, YES, YES, yes]","[ No, No, NO, NO]",0.580168,0.419832,"[0.34728650179691567, 0.21543529147506663, 0.0...",april-7,Up
7,"[Yes, Yes, YES, YES, yes]","[No, No, NO, NO]",0.467425,0.532575,"[0.25429539864051004, 0.19979758479873086, 0.0...",april-8,Up
8,"[Yes, Yes, YES, YES, yes]","[No, No, NO, NO]",0.428116,0.571884,"[0.27608605621346644, 0.14011818078675356, 0.0...",april-9,Up
9,"[Yes, Yes, YES, YES, yes]","[No, No, NO, NO]",0.60263,0.39737,"[0.4032743368491571, 0.18360875586559675, 0.00...",april-10,Down


In [46]:
result_df.to_csv("BTC_22.csv", index=None)

In [50]:
from sklearn.metrics import roc_auc_score

roc_auc_score(result_df['y'], result_df['yes_prob_sum'])

0.5092592592592593