In [17]:
# Importing requirements
import pandas as pd
from tqdm import tqdm

In [None]:
# Scraping Binance for news sentiments
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time


# Scraping function for Binance in n batches of m scrolls

def scrape_binance_news(batch_count=3, scrolls_per_batch=50, pause=2,
                        chromedriver_path="path to chromedriver local"):
    """Scrape headline links from the Binance Square news feed.

    Opens the feed in headless Chrome, scrolls to the bottom of the page
    ``batch_count * scrolls_per_batch`` times (sleeping ``pause`` seconds
    after each scroll), then collects every anchor whose ``href`` starts
    with ``/en/square/post/``.

    Parameters
    ----------
    batch_count : int
        Number of scroll batches (only affects progress messages and the
        total number of scrolls).
    scrolls_per_batch : int
        Number of scroll-to-bottom actions per batch.
    pause : float
        Seconds to sleep after each scroll.
    chromedriver_path : str
        Path to the local chromedriver executable handed to ``Service``.

    Returns
    -------
    pandas.DataFrame
        Columns ``source``, ``headline``, ``url``, de-duplicated on ``url``.
        An empty frame with those same columns is returned when nothing
        was collected.
    """
    # Imported locally so the cell stays self-contained.
    from selenium.common.exceptions import TimeoutException

    options = Options()
    options.add_argument("start-maximized")
    options.add_argument("--headless")

    service = Service(executable_path=chromedriver_path)
    driver = webdriver.Chrome(service=service, options=options)

    link_selector = 'a[href^="/en/square/post/"]'
    data = []
    try:
        driver.get("https://www.binance.com/en/square/news/all")
        time.sleep(4)

        for batch in range(batch_count):
            print(f"Starting batch {batch + 1}/{batch_count}")
            for i in range(scrolls_per_batch):
                print(f"Scroll {i+1}/{scrolls_per_batch} in batch {batch + 1}")
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(pause)

        # Waiting for at least one article to load. Only a timeout is
        # tolerated here; any other failure is a real error and propagates.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, link_selector))
            )
        except TimeoutException:
            print("News items did not appear within 10 seconds.")

        article_links = driver.find_elements(By.CSS_SELECTOR, link_selector)
        print(f"Found {len(article_links)} articles")

        for link_elem in article_links:
            # Best effort per element: one unreadable link must not abort
            # the whole collection.
            try:
                title = link_elem.text.strip()
                url = link_elem.get_attribute("href")
                if title and url:
                    data.append({
                        "source": "Binance",
                        "headline": title,
                        "url": url
                    })
                    print(f"Collected: {title[:60]}...")
            except Exception as e:
                print(f"Error collecting article: {e}")
    finally:
        # Always release the browser, even when scraping fails midway.
        driver.quit()

    # Explicit columns keep the schema stable when `data` is empty, so the
    # de-duplication on 'url' cannot raise KeyError.
    df = pd.DataFrame(data, columns=["source", "headline", "url"])
    return df.drop_duplicates(subset='url')

# Running scraper
import os

print("Running scrape_binance_news in 3 batches of 50 scrolls each...")
df = scrape_binance_news()

# Save and show results
if not df.empty:
    # Create the output folder first so a scrape that took several minutes
    # is not lost to a missing-directory error when writing the CSV.
    os.makedirs("outputs", exist_ok=True)
    df.to_csv("outputs/binance_news_articles_new.csv", index=False)
    print("Saved to binance_news_articles_new.csv")
    display(df.head())
else:
    print("No articles scraped.")


Running scrape_binance_news in 3 batches of 50 scrolls each...
📦 Starting batch 1/3
🔄 Scroll 1/50 in batch 1
🔄 Scroll 2/50 in batch 1
🔄 Scroll 3/50 in batch 1
🔄 Scroll 4/50 in batch 1
🔄 Scroll 5/50 in batch 1
🔄 Scroll 6/50 in batch 1
🔄 Scroll 7/50 in batch 1
🔄 Scroll 8/50 in batch 1
🔄 Scroll 9/50 in batch 1
🔄 Scroll 10/50 in batch 1
🔄 Scroll 11/50 in batch 1
🔄 Scroll 12/50 in batch 1
🔄 Scroll 13/50 in batch 1
🔄 Scroll 14/50 in batch 1
🔄 Scroll 15/50 in batch 1
🔄 Scroll 16/50 in batch 1
🔄 Scroll 17/50 in batch 1
🔄 Scroll 18/50 in batch 1
🔄 Scroll 19/50 in batch 1
🔄 Scroll 20/50 in batch 1
🔄 Scroll 21/50 in batch 1
🔄 Scroll 22/50 in batch 1
🔄 Scroll 23/50 in batch 1
🔄 Scroll 24/50 in batch 1
🔄 Scroll 25/50 in batch 1
🔄 Scroll 26/50 in batch 1
🔄 Scroll 27/50 in batch 1
🔄 Scroll 28/50 in batch 1
🔄 Scroll 29/50 in batch 1
🔄 Scroll 30/50 in batch 1
🔄 Scroll 31/50 in batch 1
🔄 Scroll 32/50 in batch 1
🔄 Scroll 33/50 in batch 1
🔄 Scroll 34/50 in batch 1
🔄 Scroll 35/50 in batch 1
🔄 Scroll 36/50 

Unnamed: 0,source,headline,url
0,Binance,Significant TRUMP Token Transfer to Centralize...,https://www.binance.com/en/square/post/04-29-2...
1,Binance,Trump Administration Engages in Expansive Trad...,https://www.binance.com/en/square/post/04-29-2...
2,Binance,Bitcoin Mining Sees Increased Use of Sustainab...,https://www.binance.com/en/square/post/04-29-2...
3,Binance,Prime Intellect Unveils Inference Stack Previe...,https://www.binance.com/en/square/post/04-29-2...
4,Binance,BlackRock Bitcoin ETF Narrows Gap with Strateg...,https://www.binance.com/en/square/post/04-29-2...


In [4]:
# Extracting the publication date from the url column, then removing the url column.
#
# Post URLs have the form https://www.binance.com/en/square/post/MM-DD-YYYY-...,
# so the date is captured with a pattern rather than fixed character offsets.
# URLs without such a date yield NaN in 'date'.
# The guard makes the cell safe to re-run after 'url' has already been dropped.
if 'url' in df.columns:
    df['date'] = df['url'].str.extract(r'/square/post/(\d{2}-\d{2}-\d{4})', expand=False)
    df = df.drop(columns='url')
df.to_csv("outputs/binance_news_articles_new.csv", index=False)
df.head()

Unnamed: 0,source,headline,date
0,Binance,Significant TRUMP Token Transfer to Centralize...,04-29-2025
1,Binance,Trump Administration Engages in Expansive Trad...,04-29-2025
2,Binance,Bitcoin Mining Sees Increased Use of Sustainab...,04-29-2025
3,Binance,Prime Intellect Unveils Inference Stack Previe...,04-29-2025
4,Binance,BlackRock Bitcoin ETF Narrows Gap with Strateg...,04-29-2025


In [15]:
# Optional step: appending the freshly scraped news to the previously scraped set

# Older data, ordered by its 'date' column as stored in the file
df = (
    pd.read_csv('outputs/combined_news.csv')
    .sort_values(by='date')
    .reset_index(drop=True)
)

# New data: parse MM-DD-YYYY strings, discard rows whose date cannot be
# parsed, truncate to midnight, then order by date
df_new = pd.read_csv('outputs/binance_news_articles_new.csv')
df_new = (
    df_new
    .assign(date=pd.to_datetime(df_new['date'], format='%m-%d-%Y', errors='coerce'))
    .dropna(subset=['date'])
    .assign(date=lambda frame: frame['date'].dt.normalize())
    .sort_values(by='date')
    .reset_index(drop=True)
)

# Stacking old rows first, new rows after, with a fresh 0..n-1 index
df_comb = pd.concat([df, df_new], ignore_index=True)
df_comb.to_csv('outputs/combined_news_final.csv', index=False)

In [18]:
# Scoring headline sentiment with FinBERT
# Headlines are sent in fixed-size batches to avoid a memory crash
from utils.finbert_utils import analyze_finbert_batch

df = pd.read_csv("outputs/combined_news_final.csv")

batch_size = 16
results = []
headlines = df['headline']
for start in tqdm(range(0, len(df), batch_size)):
    stop = start + batch_size
    results.extend(analyze_finbert_batch(headlines.iloc[start:stop].tolist()))

# Assumes analyze_finbert_batch returns one dict with 'label' and 'score'
# per headline, in input order -- TODO confirm against utils.finbert_utils
df['finbert_sentiment'] = [item['label'] for item in results]
df['finbert_score'] = [item['score'] for item in results]

# One boolean indicator column per sentiment label
for sentiment in ('positive', 'neutral', 'negative'):
    df[sentiment] = df['finbert_sentiment'] == sentiment

df.to_csv("outputs/Binance_news_with_finbert.csv", index=False)

  2%|▏         | 10/549 [00:03<02:28,  3.63it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 549/549 [02:21<00:00,  3.89it/s]
