In [3]:
!pip install newspaper3k fake-useragent

import requests
from bs4 import BeautifulSoup
import re
from newspaper import Article
from fake_useragent import UserAgent
from concurrent.futures import ThreadPoolExecutor

def clean_text(text):
    """Clean up the text by removing non-alphanumeric characters and excessive whitespace."""
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'\[.*?\]', '', text)  # Remove text in brackets (often citations or links)
    text = re.sub('<.*?>+', '', text)  # Remove HTML tags
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'\W+', ' ', text)  # Remove non-alphanumeric characters
    text = text.lower()  # Convert to lowercase
    return text.strip()  # Remove leading and trailing whitespace

def scrape_and_clean_news(urls, max_workers=10):
    """Scrape and clean text from a list of news article URLs using concurrent requests."""
    def fetch_and_process(url):
        try:
            user_agent = UserAgent().random
            headers = {'User-Agent': user_agent}
            article = Article(url)
            article.download(input_html=requests.get(url, headers=headers, timeout=10).text)
            article.parse()
            return clean_text(article.text)
        except Exception as e:
            print(f"Failed to process {url}: {e}")
            return None

    cleaned_texts = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = executor.map(fetch_and_process, urls)
        cleaned_texts = [result for result in results if result is not None]

    return cleaned_texts

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl (211 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.1/211.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fake-useragent
  Downloading fake_useragent-1.4.0-py3-none-any.whl (15 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.1.1-py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.7/97.7 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?2

In [5]:
import pandas as pd

file_path = '/content/filtered_data.xlsx'
df = pd.read_excel(file_path)

import re

#find URLs
url_pattern = r'https?://[^\s]+'

extracted_urls = []

for block in df.iloc[:, 0]:
    for cell in block.split(','):
      found_urls = re.findall(url_pattern, cell)
      extracted_urls.extend(found_urls)  # Add found URLs

extracted_urls[:5]

['https://www.wbaltv.com/article/wall-street-ends-dismal-volatile-year-on-a-bright-note/25716320',
 'https://www.newyorktelegraph.com/news/258740402/rice-shortages-hamper-recovery-efforts-in-laos',
 'https://magicvalley.com/community/mini-cassia/news/burley-woman-accused-of-intimidating-a-witness/article_767c9a0d-909d-5603-8509-d26882e04e48.html',
 'https://www.investors.com/research/ibd-industry-themes/growth-stocks-2018-cannabis-stock-market/',
 'https://www.digitaltrends.com/home/escape-tiny-home-airbnb-rentals/']

In [6]:
cleaned_articles = scrape_and_clean_news(extracted_urls)

Failed to process https://www.laosnews.net/news/258740402/rice-shortages-hamper-recovery-efforts-in-laos: You must `download()` an article first!
Failed to process https://www.newyorktelegraph.com/news/258740402/rice-shortages-hamper-recovery-efforts-in-laos: You must `download()` an article first!




Failed to process https://english.vietnamnet.vn/fms/business/215051/improving-business-environment-is-government-s-priority-in-2019.html: HTTPSConnectionPool(host='english.vietnamnet.vn', port=443): Max retries exceeded with url: /fms/business/215051/improving-business-environment-is-government-s-priority-in-2019.html (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7cccb73dde40>: Failed to resolve 'english.vietnamnet.vn' ([Errno -2] Name or service not known)"))
Failed to process https://www.orlandoecho.com/news/258741486/us-stocks-farewell-2018-with-solid-rise: You must `download()` an article first!
Failed to process https://www.australiannews.net/news/258740402/rice-shortages-hamper-recovery-efforts-in-laos: You must `download()` an article first!
Failed to process https://www.sturgisjournal.com/zz/news/20181231/animal-advocates-fatal-zoo-mauling-shows-need-for-crackdown: You must `download()` an article first!
Failed to process https://www.newyorksta

In [7]:
#%run 'https://drive.google.com/file/d/15rat-8XV-uYsdHkLl-H8EozGfagMVT9x/view?usp=sharing'
import yfinance as yf
import pandas as pd

# Define the VIX ticker symbol
vix_ticker = "^VIX"

# Create a Ticker object for the VIX
vix = yf.Ticker(vix_ticker)

# Fetch historical data for the VIX from its earliest available date to the present
vix_data = vix.history(start="2019-01-01")

In [8]:
vix_data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-02 00:00:00-06:00,27.540001,28.530001,23.049999,23.219999,0,0.0,0.0
2019-01-03 00:00:00-06:00,25.680000,26.600000,24.049999,25.450001,0,0.0,0.0
2019-01-04 00:00:00-06:00,24.360001,24.480000,21.190001,21.379999,0,0.0,0.0
2019-01-07 00:00:00-06:00,22.059999,22.709999,20.910000,21.400000,0,0.0,0.0
2019-01-08 00:00:00-06:00,20.959999,22.030001,20.090000,20.469999,0,0.0,0.0
...,...,...,...,...,...,...,...
2024-02-28 00:00:00-06:00,13.520000,13.900000,13.440000,13.840000,0,0.0,0.0
2024-02-29 00:00:00-06:00,14.140000,14.150000,13.300000,13.400000,0,0.0,0.0
2024-03-01 00:00:00-06:00,13.340000,13.660000,13.080000,13.110000,0,0.0,0.0
2024-03-04 00:00:00-06:00,13.490000,13.580000,13.320000,13.490000,0,0.0,0.0


In [16]:
cleaned_articles

['advertisement wall street ends dismal volatile year on a bright note share copy link copy wall street closed out a dismal turbulent year for stocks on a bright note monday but still finished 2018 with the worst showing in a decade after setting a series of records through the late summer and early fall major u s indexes fell sharply after early october leaving them all in the red for the year the s p 500 index the market s main benchmark finished the year with a loss of 6 2 percent the last time the index fell for the year was in 2008 during the financial crisis the s p 500 also posted tiny losses in 2011 and 2015 but eked out small gains in both years once dividends were included the dow jones industrial average declined 5 6 percent the nasdaq composite sank 12 2 percent major indexes in europe also ended 2018 in the red the cac 40 of france finished the year down 11 percent britain s ftse 100 lost 12 5 percent germany s dax ended the year in a bear market down 22 percent from a hig

In [15]:
vix_data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1990-01-02 00:00:00-06:00,17.240000,17.240000,17.240000,17.240000,0,0.0,0.0
1990-01-03 00:00:00-06:00,18.190001,18.190001,18.190001,18.190001,0,0.0,0.0
1990-01-04 00:00:00-06:00,19.219999,19.219999,19.219999,19.219999,0,0.0,0.0
1990-01-05 00:00:00-06:00,20.110001,20.110001,20.110001,20.110001,0,0.0,0.0
1990-01-08 00:00:00-06:00,20.260000,20.260000,20.260000,20.260000,0,0.0,0.0
...,...,...,...,...,...,...,...
2024-02-28 00:00:00-06:00,13.520000,13.900000,13.440000,13.840000,0,0.0,0.0
2024-02-29 00:00:00-06:00,14.140000,14.150000,13.300000,13.400000,0,0.0,0.0
2024-03-01 00:00:00-06:00,13.340000,13.660000,13.080000,13.110000,0,0.0,0.0
2024-03-04 00:00:00-06:00,13.490000,13.580000,13.320000,13.490000,0,0.0,0.0


In [20]:
vix_data['Close'].head(132) #treat the closing value for the day as the vix for the day

Date
1990-01-02 00:00:00-06:00    17.240000
1990-01-03 00:00:00-06:00    18.190001
1990-01-04 00:00:00-06:00    19.219999
1990-01-05 00:00:00-06:00    20.110001
1990-01-08 00:00:00-06:00    20.260000
                               ...    
1990-07-03 00:00:00-05:00    16.110001
1990-07-05 00:00:00-05:00    17.250000
1990-07-06 00:00:00-05:00    16.410000
1990-07-09 00:00:00-05:00    16.660000
1990-07-10 00:00:00-05:00    16.920000
Name: Close, Length: 132, dtype: float64

In [None]:
cleaned

In [64]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer

In [65]:
class MyBERTRegressionModel(nn.Module):
    def __init__(self, input_size, hidden_size=768):
        super(MyBERTRegressionModel, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        embedded = self.embedding(input_ids)
        output = self.linear(embedded.sum(dim=1))
        return output

In [82]:
vix_data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1990-01-02 00:00:00-06:00,17.240000,17.240000,17.240000,17.240000,0,0.0,0.0
1990-01-03 00:00:00-06:00,18.190001,18.190001,18.190001,18.190001,0,0.0,0.0
1990-01-04 00:00:00-06:00,19.219999,19.219999,19.219999,19.219999,0,0.0,0.0
1990-01-05 00:00:00-06:00,20.110001,20.110001,20.110001,20.110001,0,0.0,0.0
1990-01-08 00:00:00-06:00,20.260000,20.260000,20.260000,20.260000,0,0.0,0.0
...,...,...,...,...,...,...,...
2024-02-28 00:00:00-06:00,13.520000,13.900000,13.440000,13.840000,0,0.0,0.0
2024-02-29 00:00:00-06:00,14.140000,14.150000,13.300000,13.400000,0,0.0,0.0
2024-03-01 00:00:00-06:00,13.340000,13.660000,13.080000,13.110000,0,0.0,0.0
2024-03-04 00:00:00-06:00,13.490000,13.580000,13.320000,13.490000,0,0.0,0.0


In [85]:
vix_data['Date'] = pd.to_datetime(vix_data['Date'])
# vix_data = pd.to_datetime(vix_data['Date'])
# vix_data[vix_data['Date'] == '2019-01-01']

KeyError: 'Date'

In [66]:
texts = cleaned_articles

target_values = (vix_data['Close'].head(132)).astype(float)

In [14]:
len(target_values)

5

In [67]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_texts = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')


dataset = TensorDataset(tokenized_texts['input_ids'], tokenized_texts['attention_mask'], torch.tensor(target_values).to(torch.float32))


dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [60]:
dataset

<torch.utils.data.dataset.TensorDataset at 0x7adcb00df8e0>

In [68]:

model = MyBERTRegressionModel(input_size=len(tokenizer), hidden_size=768)
model = model.float()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [69]:
import logging

logging.basicConfig(level=logging.INFO)

In [71]:
num_epochs = 3
for epoch in range(num_epochs):
    for batch in dataloader:
        input_ids, attention_mask, target = batch

        optimizer.zero_grad()
        model = model.float()

        outputs = model(input_ids, attention_mask).to(torch.float32)
        loss = criterion(outputs.to(torch.float32), target.unsqueeze(1)).to(torch.float32)
        #print(type(loss))
        loss.backward()
        optimizer.step()

        print(f'Epoch [Loss: {loss.item():.4f}')
# Save the trained model
torch.save(model.state_dict(), '.bert_regression_model.pth')

Epoch [Loss: 4560.7183
Epoch [Loss: 1035.6970
Epoch [Loss: 2775.7727
Epoch [Loss: 103.8867
Epoch [Loss: 4814.8374
Epoch [Loss: 1032.6765
Epoch [Loss: 36.5296
Epoch [Loss: 281.7906
Epoch [Loss: 325.9805
Epoch [Loss: 1712.2533
Epoch [Loss: 719.1298
Epoch [Loss: 1132.3887
Epoch [Loss: 21.7952
Epoch [Loss: 1423.1555
Epoch [Loss: 2522.5264
Epoch [Loss: 1781.7544
Epoch [Loss: 153.2039
Epoch [Loss: 4027.5720
Epoch [Loss: 652.9476
Epoch [Loss: 1042.8431
Epoch [Loss: 70.4704
Epoch [Loss: 1309.8129
Epoch [Loss: 4678.8984
Epoch [Loss: 81.2897
Epoch [Loss: 20.5417
Epoch [Loss: 47.1202
Epoch [Loss: 2994.8352
Epoch [Loss: 16.3250
Epoch [Loss: 1850.5646
Epoch [Loss: 3130.4358
Epoch [Loss: 1285.5687
Epoch [Loss: 1274.1641
Epoch [Loss: 18.3205
Epoch [Loss: 3244.5273
Epoch [Loss: 5986.1846
Epoch [Loss: 50.3540
Epoch [Loss: 2114.8518
Epoch [Loss: 27.0171
Epoch [Loss: 2539.0481
Epoch [Loss: 410.2063
Epoch [Loss: 106.0640
Epoch [Loss: 1269.2936
Epoch [Loss: 459.0123
Epoch [Loss: 206.7951
Epoch [Loss: 24.84