In [1]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# IMPORTS
import pandas as pd
import os
import yfinance as yf
from transformers import RobertaTokenizer, RobertaForSequenceClassification, PegasusForConditionalGeneration, PegasusTokenizer
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, classification_report

In [30]:
import numpy as np

In [6]:
# 1. aggregating all the cleaned data
text_directory_path = '/content/drive/My Drive/CSCI-499 Project/summarized_text/'

In [5]:
count = 0

In [24]:
def summarized_text(file_path):
  file_name = file_path.split('/')[-1]
  date_str = file_name.split('.txt')[0].split('fdata')[-1]
  date = pd.to_datetime(date_str, format='%Y-%m-%d')
  # print(file_path)
  file_txt = open(file_path, "r")
  content = file_txt.read()
  return {'Date': date, 'SummarizedText': content}

In [25]:
summarized_dict = []

for file_name in os.listdir(text_directory_path):
  summarized_dict.append(summarized_text(os.path.join(text_directory_path, file_name)))


summarized_df = pd.DataFrame(summarized_dict, columns=['Date', 'SummarizedText'])

In [26]:
summarized_df

Unnamed: 0,Date,SummarizedText
0,2021-05-19,and the perfect tfsa stock for 2021 tfsa stock...
1,2021-05-20,the number of airports and the tenth largest b...
2,2021-05-21,from the production of this event with 1 00 fr...
3,2021-05-22,in 2011 and emi music in 2011 universal music ...
4,2021-05-23,n m virgin galactic on saturday made its first...
...,...,...
597,2019-01-09,amy schumer and anne hathaway are out and marg...
598,2019-01-10,advertisement 4 story continues below this adv...
599,2019-01-11,huntsman had envisioned for the organization h...
600,2019-01-12,u s tariffs and flagging domestic demand even ...


In [27]:
summarized_df.sort_values(by='Date', inplace=True)
summarized_df.reset_index(drop=True, inplace=True)

In [7]:
def aggregate_text(file_path):
  file_name = file_path.split('/')[-1]
  date_str = file_name.split('fdata')[1].split('_')[0]
  date = pd.to_datetime(date_str, format='%Y-%m-%d')

  try:
    df = pd.read_csv(file_path)
    text = ' '.join(df['text'].astype(str))
    return {'Date': date, 'ConcatenatedText': text}

  except:
    return {'Date': date, 'ConcatenatedText': None}

In [None]:
count = 0
aggregated_data = []
print('Total files in new_text directory - ', len(os.listdir(text_directory_path)))
for file_name in os.listdir(text_directory_path):
  # if count > 20:
  #   break
  aggregated_data.append(aggregate_text(os.path.join(text_directory_path, file_name)))
  # count += 1

combined_df = pd.DataFrame(aggregated_data, columns=['Date', 'ConcatenatedText'])
combined_df.sort_values(by='Date', inplace=True)
combined_df.reset_index(drop=True, inplace=True)
print(count)
# combined_df

In [6]:
combined_df

Unnamed: 0,Date,ConcatenatedText
0,2019-01-01,advertisement wall street ends dismal volatile...
1,2019-01-02,the rockies have signed outfielder michael sau...
2,2019-01-03,veracyte to present at the 37th annual j p mor...
3,2019-01-04,the rare warning of disappointing results from...
4,2019-01-05,us president donald trump speaks to reporters ...
...,...,...
637,2022-12-26,here are the highlights of events related to t...
638,2022-12-27,both dr mayank amin and his wife are pharmacis...
639,2022-12-28,billionaires have had a bad year globally the ...
640,2022-12-29,ein news lafox ill dec 28 2022 globe newswire ...


In [None]:
# Merging VIX data with summarized text
vix_ticker = "^VIX"
vix = yf.Ticker(vix_ticker)
vix_data = vix.history(start="2019-01-01", end="2022-12-31")
vix_data.reset_index(inplace=True)
vix_data['Date'] = pd.to_datetime(vix_data['Date'], format='%Y-%m-%d').dt.tz_localize(None)
summarized_df = pd.merge(summarized_df, vix_data[['Date', 'Close']], on='Date', how='inner')
summarized_df.rename(columns={'Close': 'VIX_Close'}, inplace=True)
summarized_df

In [31]:
summarized_df['VIX_Close'] = np.log(summarized_df['VIX_Close'])

In [None]:
summarized_df

In [None]:
summarized_df

In [36]:
summarized_df.to_csv('/content/drive/My Drive/CSCI-499 Project/summarized_df.csv', index=False)

In [37]:
df = pd.read_csv('/content/drive/My Drive/CSCI-499 Project/summarized_df.csv')

In [38]:
df

Unnamed: 0,Date,SummarizedText,VIX_Close
0,2019-01-02,to return to the blue jays on a minors deal ho...,3.145014
1,2019-01-03,m est links to the live webcasts of the presen...,3.236716
2,2019-01-04,a weak report on us manufacturing also weighed...,3.062456
3,2019-01-07,not to rely on these forward looking statement...,3.063391
4,2019-01-08,the u s and china spearheaded by a 7 6 percent...,3.018960
...,...,...,...
410,2022-12-23,actually call the continental air defence comm...,3.038313
411,2022-12-27,you happen to go to a store and you see ibupro...,3.075005
412,2022-12-28,net worth of nearly 139 billion as of dec 27 a...,3.097386
413,2022-12-29,leading global manufacturer of engineered solu...,3.065258


In [7]:
# Merging VIX data
vix_ticker = "^VIX"
vix = yf.Ticker(vix_ticker)
vix_data = vix.history(start="2019-01-01", end="2022-12-31")
vix_data.reset_index(inplace=True)
vix_data['Date'] = pd.to_datetime(vix_data['Date'], format='%Y-%m-%d').dt.tz_localize(None)
combined_df = pd.merge(combined_df, vix_data[['Date', 'Close']], on='Date', how='inner')
combined_df.rename(columns={'Close': 'VIX_Close'}, inplace=True)
combined_df

Unnamed: 0,Date,ConcatenatedText,VIX_Close
0,2019-01-02,the rockies have signed outfielder michael sau...,23.219999
1,2019-01-03,veracyte to present at the 37th annual j p mor...,25.450001
2,2019-01-04,the rare warning of disappointing results from...,21.379999
3,2019-01-07,loading loading gilead sciences inc gild and y...,21.400000
4,2019-01-08,a rise in the industrials and health care sect...,20.469999
...,...,...,...
439,2022-12-23,as children around the globe gear up for chris...,20.870001
440,2022-12-27,both dr mayank amin and his wife are pharmacis...,21.650000
441,2022-12-28,billionaires have had a bad year globally the ...,22.139999
442,2022-12-29,ein news lafox ill dec 28 2022 globe newswire ...,21.440001


In [None]:
combined_df.to_csv('/content/drive/My Drive/CSCI-499 Project/combined_df.csv', index=False)

In [None]:
null_values = combined_df['ConcatenatedText'].isnull().sum()
print(null_values)
combined_df['ConcatenatedText'].fillna('', inplace=True)
null_values = combined_df['ConcatenatedText'].isnull().sum()
print(null_values)

1
0


In [None]:
articles = combined_df['ConcatenatedText'].tolist()
vix_values = combined_df['VIX_Close'].tolist()

In [None]:
train_articles, val_articles, train_vix, val_vix = train_test_split(articles, vix_values, test_size=0.2, random_state=42)

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_encodings = tokenizer(train_articles, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_articles, truncation=True, padding=True, max_length=512)

In [None]:
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_vix)
)

In [None]:
val_dataset = torch.utils.data.TensorDataset(
    torch.tensor(val_encodings['input_ids']),
    torch.tensor(val_encodings['attention_mask']),
    torch.tensor(val_vix)
)

In [None]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=1)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
train_batch_size = 4
val_batch_size = 8
epochs = 3
learning_rate = 1e-5

In [None]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=val_batch_size, shuffle=False)

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.MSELoss()

In [None]:
for epoch in range(epochs):
    # Training
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    model.eval()
    val_losses = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss = outputs.loss
            val_losses.append(val_loss.item())
    val_loss = sum(val_losses) / len(val_losses)
    print(f"Epoch {epoch+1}: Validation Loss: {val_loss}")

Epoch 1: Validation Loss: 77.31075564297763


KeyboardInterrupt: 

In [None]:
# Evaluation
predictions = []
targets = []
model.eval()
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions.extend(outputs.logits.flatten().tolist())
        targets.extend(labels.tolist())

# Calculate evaluation metrics
mse = mean_squared_error(targets, predictions)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 103.44546732231848


1.
train_batch_size = 8 <br/>
val_batch_size = 8 <br/>
epochs = 3 <br/>
learning_rate = 1e-5 <br/>

  ***Mean Squared Error: 339.8413336695064***

2. train_batch_size = 4 <br/>
val_batch_size = 8 <br/>
epochs = 3 <br/>
learning_rate = 1e-5 <br/>
    ***Mean Squared Error: 55.648982233405526***

3. train_batch_size = 4 <br/>
val_batch_size = 10 <br/>
epochs = 3 <br/>
learning_rate = 1e-5 <br/>

  ***Mean Squared Error: 103.44546732231848***

4. train_batch_size = 4 <br/>
val_batch_size = 12 <br/>
epochs = 3 <br/>
learning_rate = 1e-5 <br/>

  ***Mean Squared Error: 109.51466980402587***

In [None]:
## ROBERTA CLASSIFICATION

In [None]:
articles = combined_df['ConcatenatedText'].tolist()
vix_values = combined_df['VIX_Close'].tolist()

In [None]:
# Define volatility categories
# For example, low volatility: 6.47 < VIX_CLOSE < 12.95, medium volatility: 12.95 <= VIX < 18.32, high volatility: VIX >= 18.32
categories = pd.cut(vix_values, bins=[6.47, 12.95, 18.32, 36.68], labels=['Low', 'Medium', 'High']).tolist()

# Split data into train and validation sets
train_articles_classify, val_articles_classify, train_categories_classify, val_categories_classify = train_test_split(articles, categories, test_size=0.2, random_state=42)

In [None]:
val_articles_classify = [article if article is not None else '' for article in val_articles_classify]

In [None]:
# Tokenize articles
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_encodings_classify = tokenizer(train_articles_classify, truncation=True, padding=True, max_length=512)

In [None]:
train_input_ids = torch.tensor(train_encodings_classify['input_ids'])
train_attention_mask = torch.tensor(train_encodings_classify['attention_mask'])
train_categories_tensor = torch.tensor(train_categories_classify)
train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_categories_tensor)

ValueError: too many dimensions 'str'

In [None]:
for value in train_categories_classify:
    if not isinstance(value, (str, int)):
        print(f"Unexpected data type found: {type(value)}")

In [None]:
val_encodings_classify = tokenizer(val_articles_classify, truncation=True, padding=True, max_length=512)

In [None]:
print(train_categories_classify[:5])

['High', 'Medium', 'High', 'High', 'Medium']


In [None]:
# Convert encodings to PyTorch tensors
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings_classify['input_ids']),
    torch.tensor(train_encodings_classify['attention_mask']),
    torch.tensor(train_categories_classify.cat.codes.tolist())
)
val_dataset = torch.utils.data.TensorDataset(
    torch.tensor(val_encodings_classify['input_ids']),
    torch.tensor(val_encodings_classify['attention_mask']),
    torch.tensor(val_categories_classify.cat.codes.tolist())
)

AttributeError: 'list' object has no attribute 'cat'

In [None]:
# Define model architecture
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(categories.cat.categories))

In [None]:
# Define training parameters
train_batch_size = 8
val_batch_size = 8
epochs = 3
learning_rate = 1e-5

In [None]:
# Define data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=val_batch_size, shuffle=False)

In [None]:
# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
# Training loop
for epoch in range(epochs):
    # Training
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_losses = []
    val_preds = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask)
            val_loss = loss_fn(outputs.logits, labels)
            val_losses.append(val_loss.item())
            val_preds.extend(torch.argmax(outputs.logits, axis=1).tolist())
    val_loss = sum(val_losses) / len(val_losses)
    print(f"Epoch {epoch+1}: Validation Loss: {val_loss}")

In [None]:
# Evaluation
print("Classification Report:")
print(classification_report(val_categories.cat.codes.tolist(), val_preds, target_names=categories.cat.categories))

In [None]:
# SUMMARIZING DATA
model_name = "google/pegasus-large"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

In [None]:
articles = combined_df['ConcatenatedText'].tolist()
vix_values = combined_df['VIX_Close'].tolist()

In [None]:
def generate_summary(article_text):
    # Tokenize input text
    input_ids = tokenizer(article_text, return_tensors="pt", max_length=1024, truncation=True).input_ids

    # Generate summary
    summary_ids = model.generate(input_ids, num_beams=4, min_length=30, max_length=100, early_stopping=True)

    # Decode summary
    summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary_text

In [None]:
summaries = []
for index, row in combined_df.iterrows():
    article_text = row['ConcatenatedText']  # Assuming the column name containing article text is 'article_text'
    summary = generate_summary(article_text)
    print(summary)
    summaries.append(summary)

KeyboardInterrupt: 

In [None]:
print(len(summaries))

11


In [None]:
combined_df_subset = combined_df[:11]

In [None]:
combined_df_subset['summary'] = summaries

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df_subset['summary'] = summaries


In [None]:
combined_df_subset

Unnamed: 0,Date,ConcatenatedText,VIX_Close,summary
0,2021-05-19,tfsa stocks like alimentation couche tard tsx ...,22.18,tfsa stocks like alimentation couche tard tsx ...
1,2021-05-20,1q21 passenger traffic improves sequentially b...,20.67,1q21 passenger traffic improves sequentially b...
2,2021-05-21,prca event features top ranking male and femal...,20.15,prca event features top ranking male and femal...
3,2021-05-24,wall street closed mixed at the end of a volat...,18.4,Wall street closed mixed at the end of a volat...
4,2021-05-25,getty for decades grocery stores have followed...,18.84,getty for decades grocery stores have followed...
5,2021-05-26,new york new york weak economic data led to sl...,17.360001,new york new york weak economic data led to sl...
6,2021-05-27,pbs a dutch court on wednesday ordered royal d...,16.74,pbs a dutch court on wednesday ordered royal d...
7,2021-05-28,the united states movie theatre company s stoc...,16.76,the united states movie theatre company s stoc...
8,2021-06-01,mississauga on accesswire may 31 2021 redishre...,17.9,mississauga on accesswire may 31 2021 redishre...
9,2021-06-02,over her career dr kristen hege has come to se...,17.48,over her career dr kristen hege has come to se...


In [None]:
from collections import Counter

In [None]:
def extract_keywords(summary):
    tokenized_summary = tokenizer(summary, return_tensors="pt", max_length=512, truncation=True)
    keywords = tokenizer.decode(tokenized_summary['input_ids'][0], skip_special_tokens=True).split()
    return keywords

In [None]:
def analyze_keyword_frequency(keywords):
    keyword_counts = Counter(keywords)
    return keyword_counts

In [None]:
for index, row in combined_df_subset.iterrows():
  keywords = extract_keywords(row['summary'])
  keyword_frequency = analyze_keyword_frequency(keywords)
  print(keyword_frequency)

Counter({'the': 4, 'stocks': 3, 'tfsa': 2, 'tsx': 2, 'atd': 2, 'should': 2, 'be': 2, 'your': 2, 'an': 2, 'and': 2, 'economy': 2, 'of': 2, 'are': 2, 'that': 2, 'like': 1, 'alimentation': 1, 'couche': 1, 'tard': 1, 'a': 1, 'b': 1, 'on': 1, 'watch': 1, 'list': 1, 'for': 1, '2021': 1, 'stock': 1, 'market': 1, 'hit': 1, 'all': 1, 'time': 1, 'high': 1, 'earlier': 1, 'this': 1, 'year': 1, 'since': 1, 'then': 1, 'excitement': 1, 'has': 1, 'waned': 1, 'several': 1, 'growth': 1, 'have': 1, 'declined': 1, 'precipitously': 1, 'meanwhile': 1, 'is': 1, 'reopening': 1, 'certain': 1, 'sectors': 1, 'expecting': 1, 'explosion': 1, 'pent': 1, 'up': 1, 'demand': 1, 'with': 1, 'in': 1, 'mind': 1, 'tax': 1, 'free': 1, 'savings': 1, 'account': 1, 'probably': 1, 'diverted': 1, 'to': 1})
Counter({'the': 4, 'by': 2, 'airports': 2, 's': 2, 'caap': 2, 'of': 2, 'results': 2, 'financial': 2, 'are': 2, 'in': 2, '1q21': 1, 'passenger': 1, 'traffic': 1, 'improves': 1, 'sequentially': 1, 'but': 1, 'remains': 1, 'signif