# Scraping the Stock Data for LLM Fine-Tuning

In [4]:
!pip install yahoo_fin pyarrow requests_html lxml_html_clean beautifulsoup4 transformers torch

[0m

### Importing Models and Libraries

In [5]:
import concurrent.futures
import json
import re
import unicodedata
import warnings

import pandas as pd
import requests
import torch
from bs4 import BeautifulSoup
from transformers import pipeline
from yahoo_fin import news

# Initialize transformer pipelines.
# Run on the first GPU when CUDA is available; -1 tells transformers to use the CPU,
# so the notebook still runs on machines without a GPU.
DEVICE = 0 if torch.cuda.is_available() else -1
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert/distilbert-base-uncased-finetuned-sst-2-english", device=DEVICE)
# Name the summarization checkpoint explicitly (the one transformers was defaulting to)
# so results do not silently change if the library's default model changes.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=DEVICE)
# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


## Defining Helper Functions

In [6]:
def basic_cleanup(text):
    '''Reduce text to plain ASCII with tidy spacing.

    The text is NFKD-normalized and every non-ASCII character is dropped,
    then anything that is not a letter, digit, basic punctuation mark
    (. , ; : ! ? ' " ( )) or whitespace is removed, and finally each run of
    whitespace is collapsed to one space with the ends trimmed.
    '''
    ascii_only = (
        unicodedata.normalize('NFKD', text)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
    )
    allowed_only = re.sub(r'[^a-zA-Z0-9.,;:!?\'\"()\s]', '', ascii_only)
    return re.sub(r'\s+', ' ', allowed_only).strip()

In [7]:
def summarize_article(article_text, max_chars=2056, max_length=100, min_length=30):
    '''Summarize an article with the module-level `summarizer` pipeline.

    Args:
        article_text: Text to summarize. Only the first `max_chars` characters
            are passed to the model.
        max_chars: Character budget for the model input.
        max_length: Upper bound passed to the pipeline for the summary length.
        min_length: Lower bound passed to the pipeline for the summary length.

    Returns:
        The generated summary string, or "" when `article_text` is not a
        string or contains only whitespace (the model is not called then).
    '''
    # Nothing to summarize: avoid sending empty input to the model.
    if not isinstance(article_text, str) or not article_text.strip():
        return ""
    summary = summarizer(
        article_text[:max_chars],
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        # The character cut above does not bound the token count; let the
        # tokenizer truncate to the model's input limit as well.
        truncation=True,
    )
    return summary[0]['summary_text']

In [8]:
def add_sentiment(df: pd.DataFrame) -> pd.DataFrame:
    '''Return a copy of `df` with a 'sentiment' column derived from 'article_text'.

    Rows whose text is missing, blank or the "N/A" placeholder are labelled
    "UNKNOWN". All other rows are classified with the module-level
    `sentiment_pipeline` in one batched call; only the first 2056 characters
    of each text are sent to the model.

    Args:
        df: Frame containing an 'article_text' column.

    Returns:
        A new DataFrame; the input frame is left unmodified.
    '''
    result = df.copy()
    texts = result['article_text'].tolist()
    labels = ["UNKNOWN"] * len(texts)

    # Positions of rows that actually carry text worth classifying.
    usable = [
        pos for pos, text in enumerate(texts)
        if isinstance(text, str) and text.strip() not in ("", "N/A")
    ]
    if usable:
        # One batched call instead of one call per row; truncation guards
        # against inputs that exceed the model's token limit.
        predictions = sentiment_pipeline(
            [texts[pos][:2056] for pos in usable],
            truncation=True,
        )
        for pos, prediction in zip(usable, predictions):
            labels[pos] = prediction['label']

    result['sentiment'] = labels
    return result

In [9]:
def get_article_text(url):
    '''Download a page and return the cleaned text of its <p> elements.

    Best-effort: any failure is printed and reported through the "N/A"
    placeholder instead of raising, so one bad link does not abort a batch.

    Args:
        url: Address of the article page.

    Returns:
        The paragraph text joined with spaces and passed through
        `basic_cleanup`, or "N/A" when the request fails, the status code is
        not 200, or the page yields no paragraph text.
    '''
    # NOTE(review): the recorded run shows finance.yahoo.com links answering
    # 400/404. Sending a browser-like User-Agent is a common mitigation for
    # sites that reject the default python-requests one; confirm it helps.
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; stock-news-scraper/1.0)'}
    try:
        response = requests.get(url, timeout=10, headers=headers)
        if response.status_code != 200:
            print(f"Failed to get article text from {url} got a response code of {response.status_code}")
            return "N/A"

        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.find_all('p')
        article_text = basic_cleanup(' '.join(p.get_text() for p in paragraphs))
        # An empty result must use the same placeholder as every other
        # failure, otherwise downstream code treats "" as a real article.
        if not article_text:
            print(f"Failed to get article text from {url}: no paragraph text found")
            return "N/A"
        return article_text
    except Exception as e:
        # Deliberately broad: scraping arbitrary sites can fail in many ways.
        print(f"Failed to get article text from {url}: {e}")
        return "N/A"

## Fetching the Data from Yahoo Finance

In [10]:
# Fetch the Yahoo Finance RSS news feed for the MSFT ticker and load the
# returned entries into a DataFrame.
stock_news = news.get_yf_rss("MSFT")
df = pd.DataFrame(stock_news)
# Preview the first rows to inspect the fields the feed provides.
df.head()

Unnamed: 0,summary,summary_detail,id,guidislink,links,link,published,published_parsed,title,title_detail
0,"Boosted by big demand and an AI twist, GoDaddy...","{'type': 'text/html', 'language': None, 'base'...",8a0f7629-dd32-3689-ac8d-fb86f169ef59,False,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://finance.yahoo.com/m/8a0f7629-dd32-3689...,"Thu, 17 Oct 2024 15:29:56 +0000","(2024, 10, 17, 15, 29, 56, 3, 291, 0)",This 'Big Daddy' Smashes All Mag 7 Stocks Exce...,"{'type': 'text/plain', 'language': None, 'base..."
1,The company has committed $1.6bn up to 2028 to...,"{'type': 'text/html', 'language': None, 'base'...",f8d948f2-e881-3b60-a580-c929dcf18451,False,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.power-technology.com/news/constell...,"Thu, 17 Oct 2024 14:59:53 +0000","(2024, 10, 17, 14, 59, 53, 3, 291, 0)",Constellation orders transformer to revive Thr...,"{'type': 'text/plain', 'language': None, 'base..."
2,"Philippe Laffont's hedge fund, Coatue Manageme...","{'type': 'text/html', 'language': None, 'base'...",a53fc391-27dd-3270-acb4-ffc1e0c2f87f,False,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.fool.com/investing/2024/10/17/bill...,"Thu, 17 Oct 2024 14:07:00 +0000","(2024, 10, 17, 14, 7, 0, 3, 291, 0)",Billionaire Philippe Laffont Continues to Buy ...,"{'type': 'text/plain', 'language': None, 'base..."
3,The American Forest Foundation will hold the U...,"{'type': 'text/html', 'language': None, 'base'...",2b9ac2ab-afb3-45ef-a183-1c08bdee8b62,False,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://finance.yahoo.com/news/first-us-auctio...,"Thu, 17 Oct 2024 13:54:03 +0000","(2024, 10, 17, 13, 54, 3, 3, 291, 0)",First US auction of nature-based carbon credit...,"{'type': 'text/plain', 'language': None, 'base..."
4,This stock can grant investors two benefits (g...,"{'type': 'text/html', 'language': None, 'base'...",0d91402b-c7be-3f9d-b9fc-61bda6ed770e,False,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.fool.com/investing/2024/10/17/here...,"Thu, 17 Oct 2024 10:45:00 +0000","(2024, 10, 17, 10, 45, 0, 3, 291, 0)",Here's My Top Artificial Intelligence (AI) Sto...,"{'type': 'text/plain', 'language': None, 'base..."


In [11]:
# List the columns available in the raw feed DataFrame.
df.columns

Index(['summary', 'summary_detail', 'id', 'guidislink', 'links', 'link',
       'published', 'published_parsed', 'title', 'title_detail'],
      dtype='object')

## Preprocess the Data

In [12]:
def preprocess_data(df:pd.DataFrame) -> pd.DataFrame:
    '''Build the cleaned article frame from the raw feed.

    Keeps the 'summary', 'link', 'published' and 'title' columns, downloads
    the text behind every link concurrently via `get_article_text`, stores it
    in 'article_text', and passes the frame through `add_sentiment`.

    Args:
        df: Raw feed frame containing at least the four columns listed above.

    Returns:
        The frame returned by `add_sentiment`; the input `df` is not modified.
    '''
    # Explicit copy: adding a column to a bare column selection is chained
    # assignment and triggers pandas' SettingWithCopyWarning.
    articles = df[['summary', 'link', 'published', 'title']].copy()
    # Downloads are I/O-bound, so fetch them on a thread pool; map() keeps
    # the results in the same order as the links.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        articles['article_text'] = list(executor.map(get_article_text, articles['link']))
    return add_sentiment(articles)

# Run the preprocessing step on the raw feed (fetches each linked article and
# labels its sentiment), then preview the result.
df_clean = preprocess_data(df)
df_clean.head()

Failed to get article text from https://finance.yahoo.com/video/election-cybersecurity-ai-healthcare-asking-225355609.html?.tsrc=rss got a response code of 400
Failed to get article text from https://finance.yahoo.com/video/amazon-invests-500m-nuclear-microsoft-210914709.html?.tsrc=rss got a response code of 400
Failed to get article text from https://finance.yahoo.com/video/mag-7-giants-continue-invest-221115292.html?.tsrc=rss got a response code of 400
Failed to get article text from https://finance.yahoo.com/m/8a0f7629-dd32-3689-ac8d-fb86f169ef59/this-%27big-daddy%27-smashes-all.html?.tsrc=rss got a response code of 404
Failed to get article text from https://finance.yahoo.com/m/a8d7ae12-1829-3dfd-aca0-243b0ffeb648/amazon-goes-nuclear%2C-sending.html?.tsrc=rss got a response code of 404
Failed to get article text from https://finance.yahoo.com/news/mistral-ai-introduces-edge-ai-192143615.html?.tsrc=rss got a response code of 404


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Unnamed: 0,summary,link,published,title,article_text,sentiment
0,"Boosted by big demand and an AI twist, GoDaddy...",https://finance.yahoo.com/m/8a0f7629-dd32-3689...,"Thu, 17 Oct 2024 15:29:56 +0000",This 'Big Daddy' Smashes All Mag 7 Stocks Exce...,,UNKNOWN
1,The company has committed $1.6bn up to 2028 to...,https://www.power-technology.com/news/constell...,"Thu, 17 Oct 2024 14:59:53 +0000",Constellation orders transformer to revive Thr...,The company has committed 1.6bn up to 2028 to ...,NEGATIVE
2,"Philippe Laffont's hedge fund, Coatue Manageme...",https://www.fool.com/investing/2024/10/17/bill...,"Thu, 17 Oct 2024 14:07:00 +0000",Billionaire Philippe Laffont Continues to Buy ...,"Founded in 1993, The Motley Fool is a financia...",POSITIVE
3,The American Forest Foundation will hold the U...,https://finance.yahoo.com/news/first-us-auctio...,"Thu, 17 Oct 2024 13:54:03 +0000",First US auction of nature-based carbon credit...,The American Forest Foundation (AFF) recently ...,NEGATIVE
4,This stock can grant investors two benefits (g...,https://www.fool.com/investing/2024/10/17/here...,"Thu, 17 Oct 2024 10:45:00 +0000",Here's My Top Artificial Intelligence (AI) Sto...,"Founded in 1993, The Motley Fool is a financia...",NEGATIVE


## Prepare the Data for LLM Fine-Tuning

In [13]:
def prepare_finetuning_data(df: pd.DataFrame, output_path: str = 'ollama_training_data.jsonl') -> None:
    '''Write prompt/completion training records as JSON Lines.

    For every row with a known sentiment and non-empty article text, builds a
    prompt (title, publication date, article text, sentiment and a fixed
    question) and a completion (the sentiment plus a generated summary), then
    writes one JSON object per line to `output_path`.

    Args:
        df: Frame with 'title', 'published', 'article_text' and 'sentiment'
            columns.
        output_path: Destination file; it is overwritten if it exists.
    '''
    question = "What is the sentiment of this article and why?"
    training_data = []
    for _, row in df.iterrows():
        if row.get('sentiment') == "UNKNOWN":
            continue
        article_text = row['article_text']
        # Skip rows without usable text so the summarizer is never fed
        # empty or placeholder input.
        if not isinstance(article_text, str) or article_text.strip() in ("", "N/A"):
            continue

        # NOTE(review): the prompt already contains the sentiment label that
        # the completion is asked to produce; confirm this is intended.
        context = f"Title: {basic_cleanup(row['title'])}\nPublished: {row['published']}\nArticle: {article_text}\nSentiment: {row['sentiment']}"
        summary = basic_cleanup(summarize_article(article_text))
        answer = f"The sentiment is {row['sentiment']} because: {summary}"
        training_data.append({"prompt" : f"{context}\nQuestion: {question}", "completion": answer})

    # JSON Lines: one compact JSON object per line, not a single indented
    # JSON array, so the content matches the .jsonl extension.
    with open(output_path, 'w', encoding='utf-8') as f:
        for record in training_data:
            f.write(json.dumps(record) + "\n")

# Build the prompt/completion records from the cleaned data and write them to disk.
prepare_finetuning_data(df_clean)