In [None]:
import pandas as pd
import numpy as np
import os
import yfinance as yf
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

text_directory_path = 'CSCI499/new_text'

def aggregate_text(file_path):
  """Read one daily CSV and join its ``text`` column into a single string.

  The date is taken from the file name: the part after ``fdata`` and before
  the first underscore is parsed as ``YYYY-MM-DD``.

  Args:
    file_path: Path to the CSV file.

  Returns:
    A dict with keys ``'Date'`` (the parsed timestamp) and
    ``'ConcatenatedText'`` (the ``text`` rows joined by single spaces, or
    ``None`` when the file could not be read or has no ``text`` column).

  Raises:
    IndexError: if the file name does not contain ``fdata``.
    ValueError: if the extracted date string is not ``YYYY-MM-DD``.
  """
  # basename honours the platform's path separator, unlike split('/').
  file_name = os.path.basename(file_path)
  date_str = file_name.split('fdata')[1].split('_')[0]
  date = pd.to_datetime(date_str, format='%Y-%m-%d')

  try:
    df = pd.read_csv(file_path)
    text = ' '.join(df['text'].astype(str))
  except Exception as exc:
    # Stay best-effort (one bad file must not abort the whole run), but do not
    # swallow KeyboardInterrupt/SystemExit and do report what went wrong.
    print(f'\nOOPS: could not aggregate {file_path}: {exc!r}')
    return {'Date': date, 'ConcatenatedText': None}

  return {'Date': date, 'ConcatenatedText': text}

# Aggregate every file in the text directory into one record per day.
file_names = os.listdir(text_directory_path)
print('Total files in new_text directory - ', len(file_names))
aggregated_data = [
    aggregate_text(os.path.join(text_directory_path, name))
    for name in file_names
]
count = len(aggregated_data)

# One row per file, ordered chronologically with a fresh 0..n-1 index.
combined_df = (
    pd.DataFrame(aggregated_data, columns=['Date', 'ConcatenatedText'])
    .sort_values(by='Date')
    .reset_index(drop=True)
)

# Merging VIX data
vix_ticker = "^VIX"
vix = yf.Ticker(vix_ticker)
vix_data = vix.history(start="2019-01-01", end="2022-12-31").reset_index()
# Drop the timezone so the dates compare equal to the naive dates parsed from file names.
vix_data['Date'] = pd.to_datetime(vix_data['Date'], format='%Y-%m-%d').dt.tz_localize(None)
# The inner join keeps only days that have both text and a VIX close.
combined_df = (
    combined_df
    .merge(vix_data[['Date', 'Close']], on='Date', how='inner')
    .rename(columns={'Close': 'VIX_Close'})
)
print(combined_df)

from transformers import AutoModelForSequenceClassification
# num_labels=1 gives the head a single output, which this cell trains against
# the (log) VIX value passed as `labels`.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           num_labels=1)

# Files that failed to load carry ConcatenatedText=None; report how many.
null_values = combined_df['ConcatenatedText'].isnull().sum()
print(null_values)
# Assign the filled column back. Calling fillna(inplace=True) on a column
# selection is chained assignment: deprecated in pandas 2.x and a silent no-op
# under Copy-on-Write.
combined_df['ConcatenatedText'] = combined_df['ConcatenatedText'].fillna('')
null_values = combined_df['ConcatenatedText'].isnull().sum()
print(null_values)

# Model inputs (one text per day) and regression targets (natural log of the VIX close).
texts = combined_df['ConcatenatedText'].tolist()
vix_values = np.log(combined_df['VIX_Close']).tolist()

# Bucket each day's VIX level into a volatility regime.
#
# The cut-offs (6.47, 12.95, 18.32, 36.68) are on the raw VIX scale, while
# `vix_values` holds log(VIX). Comparing the two put every day in 'Extreme',
# so classify on the raw close instead. Intervals are half-open so a value
# exactly on a boundary no longer falls through to 'Extreme'.
def _vix_class(level):
  """Return the regime label for a raw (non-log) VIX level."""
  if 6.47 <= level < 12.95:
    return 'Low'
  if 12.95 <= level < 18.32:
    return 'Moderate'
  if 18.32 <= level < 36.68:
    return 'High'
  # Everything else (including levels below 6.47) keeps the original fallback.
  return 'Extreme'

# Same row order as `texts` / `vix_values`, which are built from combined_df too.
classes = [_vix_class(level) for level in combined_df['VIX_Close'].tolist()]

# Hold out 20% of the days for validation; the seed is fixed so the split is reproducible.
train_articles, val_articles, train_vix, val_vix = train_test_split(texts, vix_values, test_size=0.2, random_state=42)

# Tokenize each split, truncating to at most 512 tokens and padding to the
# longest sequence in the split.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_articles, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_articles, truncation=True, padding=True, max_length=512)

# Each dataset item is (input_ids, attention_mask, target). Targets are made
# float32 explicitly so they match the model's float parameters.
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
                              torch.tensor(train_encodings['attention_mask']),
                              torch.tensor(train_vix, dtype=torch.float32))
val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']),
                            torch.tensor(val_encodings['attention_mask']),
                            torch.tensor(val_vix, dtype=torch.float32))

# The loaders used to be built twice; the first pair (batch_size=32, shuffled
# validation) was overwritten immediately, so only the effective pair is kept.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

model = model.float()
# The training loop below uses the loss returned by the model itself;
# `criterion` is kept only so the name stays defined.
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

import logging

logging.basicConfig(level=logging.INFO)

num_epochs = 3
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    for batch in train_dataloader:
        input_ids, attention_mask, target = batch
        optimizer.zero_grad()
        # Passing `labels` makes the model compute and return its own loss.
        outputs = model(input_ids, attention_mask, labels=target)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # ---- validation pass (no gradients) ----
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, target = batch
            outputs = model(input_ids, attention_mask, labels=target)
            loss = outputs.loss
            batch_losses.append(loss.item())
    # Mean of the per-batch losses for this epoch.
    val_loss = sum(batch_losses)
    avg_val_loss = val_loss / len(val_dataloader)
    print(f'Epoch{epoch}[Validation Loss:{avg_val_loss}]')

# Save the trained model
torch.save(model.state_dict(), '.bert_regression_model.pth')

model.eval()  # Set the model to evaluation mode
total_mse = 0.0   # sum of per-sample losses over the validation set
num_samples = 0   # number of validation samples seen

with torch.no_grad():
    for batch in val_dataloader:
        input_ids, attention_mask, target = batch

        outputs = model(input_ids, attention_mask, labels=target)
        loss = outputs.loss  # loss averaged over this batch
        batch_size = len(input_ids)
        # Weight each batch mean by its size so a smaller final batch does not
        # count as much as a full one.
        total_mse += loss.item() * batch_size
        num_samples += batch_size

# Per-sample average; NaN instead of a ZeroDivisionError when the loader is empty.
avg_mse = total_mse / num_samples if num_samples else float('nan')

print(f"Validation MSE: {avg_mse}")


In [None]:
# Colab only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import os
import yfinance as yf
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Directory holding the per-day text CSV files (on the mounted Google Drive).
text_directory_path = '/content/drive/MyDrive/CSCI-499 Project/new_text'

In [None]:
def aggregate_text(file_path):
  """Read one daily CSV and join its ``text`` column into a single string.

  The date is taken from the file name: the part after ``fdata`` and before
  the first underscore is parsed as ``YYYY-MM-DD``.

  Args:
    file_path: Path to the CSV file.

  Returns:
    A dict with keys ``'Date'`` (the parsed timestamp) and
    ``'ConcatenatedText'`` (the ``text`` rows joined by single spaces, or
    ``None`` when the file could not be read or has no ``text`` column).

  Raises:
    IndexError: if the file name does not contain ``fdata``.
    ValueError: if the extracted date string is not ``YYYY-MM-DD``.
  """
  # basename honours the platform's path separator, unlike split('/').
  file_name = os.path.basename(file_path)
  date_str = file_name.split('fdata')[1].split('_')[0]
  date = pd.to_datetime(date_str, format='%Y-%m-%d')

  try:
    df = pd.read_csv(file_path)
    text = ' '.join(df['text'].astype(str))
  except Exception as exc:
    # Stay best-effort (one bad file must not abort the whole run), but do not
    # swallow KeyboardInterrupt/SystemExit and do report what went wrong.
    print(f'\nOOPS: could not aggregate {file_path}: {exc!r}')
    return {'Date': date, 'ConcatenatedText': None}

  return {'Date': date, 'ConcatenatedText': text}

In [None]:
# Aggregate every file in the text directory into one record per day.
file_names = os.listdir(text_directory_path)
print('Total files in new_text directory - ', len(file_names))
aggregated_data = [
    aggregate_text(os.path.join(text_directory_path, name))
    for name in file_names
]
count = len(aggregated_data)

# One row per file, ordered chronologically with a fresh 0..n-1 index.
combined_df = (
    pd.DataFrame(aggregated_data, columns=['Date', 'ConcatenatedText'])
    .sort_values(by='Date')
    .reset_index(drop=True)
)
# combined_df

In [None]:
# Merging VIX data
vix_ticker = "^VIX"
vix = yf.Ticker(vix_ticker)
vix_data = vix.history(start="2019-01-01", end="2022-12-31").reset_index()
# Drop the timezone so the dates compare equal to the naive dates parsed from file names.
vix_data['Date'] = pd.to_datetime(vix_data['Date'], format='%Y-%m-%d').dt.tz_localize(None)
# The inner join keeps only days that have both text and a VIX close.
combined_df = (
    combined_df
    .merge(vix_data[['Date', 'Close']], on='Date', how='inner')
    .rename(columns={'Close': 'VIX_Close'})
)
print(combined_df)

In [None]:
# class MyBERTRegressionModel(nn.Module):
#     def __init__(self, input_size, hidden_size=768):
#         super(MyBERTRegressionModel, self).__init__()
#         self.embedding = nn.Embedding(input_size, hidden_size)
#         self.linear = nn.Linear(hidden_size, 1)

#     def forward(self, input_ids, attention_mask):
#         embedded = self.embedding(input_ids)
#         output = self.linear(embedded.sum(dim=1))
#         return output

In [None]:
# from transformers import BertTokenizer, BertModel
# model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
from transformers import AutoModelForSequenceClassification
# Four output logits, one per volatility class (Low / Moderate / High / Extreme).
# The cells below train with nn.CrossEntropyLoss on class indices; a
# single-logit head (num_labels=1) cannot represent those targets.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           num_labels=4)

In [None]:
# Files that failed to load carry ConcatenatedText=None; report how many.
null_values = combined_df['ConcatenatedText'].isnull().sum()
print(null_values)
# Assign the filled column back. Calling fillna(inplace=True) on a column
# selection is chained assignment: deprecated in pandas 2.x and a silent no-op
# under Copy-on-Write.
combined_df['ConcatenatedText'] = combined_df['ConcatenatedText'].fillna('')
null_values = combined_df['ConcatenatedText'].isnull().sum()
print(null_values)

In [None]:
# Model inputs: one concatenated text per day.
texts = combined_df['ConcatenatedText'].tolist()
# Natural log of the VIX close. The original line had an unmatched opening
# parenthesis, which is a SyntaxError.
vix_values = np.log(combined_df['VIX_Close']).tolist()

In [None]:
# Bucket each day's VIX level into a volatility regime.
#
# The cut-offs (6.47, 12.95, 18.32, 36.68) are on the raw VIX scale, while
# `vix_values` holds log(VIX). Comparing the two put every day in 'Extreme',
# so classify on the raw close instead. Intervals are half-open so a value
# exactly on a boundary no longer falls through to 'Extreme'.
def _vix_class(level):
  """Return the regime label for a raw (non-log) VIX level."""
  if 6.47 <= level < 12.95:
    return 'Low'
  if 12.95 <= level < 18.32:
    return 'Moderate'
  if 18.32 <= level < 36.68:
    return 'High'
  # Everything else (including levels below 6.47) keeps the original fallback.
  return 'Extreme'

# Same row order as `texts` / `vix_values`, which are built from combined_df too.
classes = [_vix_class(level) for level in combined_df['VIX_Close'].tolist()]

In [None]:
# Display the full list of class labels for a visual check.
classes

In [None]:
# Split texts, log-VIX targets and class labels together (80/20, fixed seed) so
# the three train/validation lists stay aligned row for row.
train_articles, val_articles, train_vix, val_vix, train_labels, val_labels = train_test_split(texts, vix_values, classes, test_size=0.2, random_state=42)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def _encode(articles):
  """Tokenize a list of texts, truncating to at most 512 tokens and padding within the list."""
  return tokenizer(articles, truncation=True, padding=True, max_length=512)


train_encodings = _encode(train_articles)
val_encodings = _encode(val_articles)


In [None]:
# train_labels = [classes[vix_values.index(value)] for value in train_vix]
# val_labels = [classes[vix_values.index(value)] for value in val_vix]

In [None]:
# Sanity check: number of training labels produced by the split.
len(train_labels)
#torch.tensor(train_encodings['attention_mask']).shape

307

In [None]:
# from sklearn.preprocessing import LabelEncoder
# label_encoder = LabelEncoder()
# train_labels_encoded = label_encoder.fit_transform(train_labels)
# val_labels_encoded = label_encoder.fit_transform(val_labels)

In [None]:
# Map each class name to a fixed integer id. `train_labels_encoded` and
# `val_labels_encoded` were previously defined only in commented-out code, so
# this cell raised NameError on a fresh kernel. A fixed mapping also guarantees
# that train and validation use the same id for the same class.
CLASS_TO_INDEX = {'Low': 0, 'Moderate': 1, 'High': 2, 'Extreme': 3}
train_labels_encoded = [CLASS_TO_INDEX[label] for label in train_labels]
val_labels_encoded = [CLASS_TO_INDEX[label] for label in val_labels]

# Each dataset item is (input_ids, attention_mask, class_index). Class indices
# are int64, the dtype nn.CrossEntropyLoss expects for index targets.
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
                              torch.tensor(train_encodings['attention_mask']),
                              torch.tensor(train_labels_encoded, dtype=torch.long))

val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']),
                            torch.tensor(val_encodings['attention_mask']),
                            torch.tensor(val_labels_encoded, dtype=torch.long))


In [None]:
# Batches of 8; only the training data is shuffled, so validation order is stable.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)


In [None]:
# train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_encodings['input_ids']), torch.tensor(train_encodings['attention_mask']), torch.tensor(train_vix))
# val_dataset = torch.utils.data.TensorDataset(torch.tensor(val_encodings['input_ids']), torch.tensor(val_encodings['attention_mask']), torch.tensor(val_vix))

# train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
# val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=True)

In [None]:
#model = MyBERTRegressionModel(input_size=len(tokenizer), hidden_size=768)
#model = model.float()
#criterion = nn.MSELoss()
# Classification setup: cross-entropy between the model's logits and the class indices.
criterion = nn.CrossEntropyLoss()
# AdamW over all model parameters (the whole network is fine-tuned).
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

In [None]:
import logging

# Configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)

In [None]:
num_epochs = 3
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    for batch in train_dataloader:
        input_ids, attention_mask, target = batch

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        # The loss is computed here from the logits, not by the model.
        loss = criterion(outputs.logits, target)
        loss.backward()
        optimizer.step()

    # ---- validation pass (no gradients) ----
    model.eval()
    with torch.no_grad():
        val_loss = 0
        for batch in val_dataloader:
            input_ids, attention_mask, target = batch
            outputs = model(input_ids, attention_mask)
            # The criterion needs the logits tensor. Passing the model's output
            # object (as this line did before) raises inside the loss.
            loss = criterion(outputs.logits, target)
            val_loss += loss.item()
    # Mean of the per-batch validation losses for this epoch.
    avg_val_loss = val_loss / len(val_dataloader)
    print(f'Epoch{epoch}[Validation Loss:{avg_val_loss}]')
# Save the trained model
torch.save(model.state_dict(), '.bert_regression_model.pth')

In [None]:
model.eval()  # Set the model to evaluation mode
total_mse = 0.0   # sum of per-sample losses over the validation set
num_samples = 0   # number of validation samples seen

with torch.no_grad():
    for batch in val_dataloader:
        input_ids, attention_mask, target = batch

        # Passing `labels` makes the model compute and return its own loss.
        outputs = model(input_ids, attention_mask, labels=target)
        loss = outputs.loss  # loss averaged over this batch
        batch_size = len(input_ids)
        # Weight each batch mean by its size so a smaller final batch does not
        # count as much as a full one.
        total_mse += loss.item() * batch_size
        num_samples += batch_size

# Per-sample average; NaN instead of a ZeroDivisionError when the loader is empty.
avg_mse = total_mse / num_samples if num_samples else float('nan')

print(f"Validation MSE: {avg_mse}")


1. Validation MSE: 423.5490132780636 </br>
Learning Rate = 1e-5 </br>
train batch_size = 8 </br>
val batch size = 2 </br>


2. Validation MSE: 311.59172973632815 </br>
Learning Rate = 1e-4 </br>
train batch_size = 8 </br>
val batch size = 8 </br>


3. Validation MSE: 275.0124104817708 </br>
Learning Rate = 1e-3 </br>
train batch_size = 16 </br>
val batch size = 16 </br>

4. Validation MSE: 4196.583740234375 </br>
Learning Rate = 1e-6 </br>
train batch_size = 32 </br>
val batch size = 32 </br>

5. Validation MSE: 394.6639862060547 </br>
Learning Rate = 1e-5 </br>
train batch_size = 16 </br>
val batch size = 16 </br>



**New Readings** </br>

1. Validation MSE: 34.17598762512207 </br>
Learning Rate = 1e-4 </br>
train batch_size = 8 </br>
val batch size = 8 </br>

2.

In [None]:
# vix_data.reset_index(inplace=True)

In [None]:
# vix_data

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# vix_data.index

In [None]:
# print(vix_data.set_index(['Date'],inplace=True))

In [None]:
# (vix_data.index)

In [None]:
# import os
# import pandas as pd

# #NEW CODE ADDED BY VAIBHAV

# # Define directory containing CSV files
# #directory = '/content/drive/My Drive/CSCI-499 Project/text'
# directory = '/content/text'

# # Initialize an empty DataFrame to store the final data
# #final_df = pd.DataFrame(columns=['Date', 'ConcatenatedText'])
# res = []
# # Iterate over CSV files in the directory
# for filename in os.listdir(directory):
#     if filename.endswith('.csv'):
#         # Extract the date from the filename
#         #date_str = filename.split('_')[1].split('.')[0]
#         date_str = filename.split('fdata')[1].split('.')[0]
#         date = pd.to_datetime(date_str, format='%Y-%m-%d')


#         # Construct full file path
#         filepath = os.path.join(directory, filename)

#         # Load CSV file into a DataFrame
#         df = pd.read_csv(filepath)

#         # Concatenate all rows into a single string separated by the pound symbol
#         concatenated_text = '#'.join(df.iloc[:, 0].astype(str).tolist())
#         new_row = {'Date': [date], 'ConcatenatedText': [concatenated_text]}
#         # Append the date and concatenated text to the final DataFrame
#         #final_df = final_df.append(new_row, ignore_index=True)
#         res.append(new_row)
# final_df = pd.DataFrame(res, columns=['Date', 'ConcatenatedText'])
# # Sort the final DataFrame by date
# #final_df['Date'] = pd.to_datetime(final_df['Date'])
# final_df.sort_values(by='Date', inplace=True)

# # Reset the index of the final DataFrame
# final_df.reset_index(drop=True, inplace=True)

# import yfinance as yf
# import pandas as pd
# import os

# # Define the VIX ticker symbol
# vix_ticker = "^VIX"

# # Create a Ticker object for the VIX
# vix = yf.Ticker(vix_ticker)

# # Fetch historical data for the VIX
# vix_data = vix.history(start="2019-01-01", end="2020-03-02")

# # Reset the index of vix_data to convert the Date index into a column
# vix_data.reset_index(inplace=True)

# # Ensure the date format matches that of final_df and set it as the 'Date' column
# vix_data['Date'] = pd.to_datetime(vix_data['Date'], format='%Y-%m-%d')

# # Now merge the two DataFrames on the 'Date' column using an inner join
# # This will only keep rows that have matching dates in both DataFrames
# #final_df['Date'] = pd.to_datetime(final_df['Date'])
# final_df = pd.merge(final_df, vix_data[['Date', 'Close']], on='Date', how='inner')
# #final_df = pd.concat(final_df, vix_data[['Date', 'Close']], on='Date', how='inner')
# # Rename the 'Close' column to something more descriptive
# final_df.rename(columns={'Close': 'VIX_Close'}, inplace=True)

# # Your final_df now has an additional column 'VIX_Close' and only contains rows
# # for which both text data and VIX data are available.



In [None]:
# import os
# import pandas as pd

# # Define directory containing CSV files
# directory = '/content/text'

# # Initialize an empty list to store dictionaries
# data_list = []

# # Iterate over CSV files in the directory
# for filename in os.listdir(directory):
#     if filename.endswith('.csv'):
#         # Extract the date from the filename
#         date_str = filename.split('fdata')[1].split('.')[0]
#         date = pd.to_datetime(date_str, format='%Y-%m-%d')

#         # Construct full file path
#         filepath = os.path.join(directory, filename)

#         # Load CSV file into a DataFrame
#         df = pd.read_csv(filepath)

#         # Concatenate all rows into a single string separated by the pound symbol
#         concatenated_text = '#'.join(df.iloc[:, 0].astype(str).tolist())

#         # Append the date and concatenated text to the list
#         data_list.append({'Date': date, 'ConcatenatedText': concatenated_text})

# # Convert the list of dictionaries to a DataFrame
# final_df = pd.DataFrame(data_list)

# # Convert the 'Date' column to datetime
# final_df['Date'] = pd.to_datetime(final_df['Date'])

# # Sort the final DataFrame by date
# final_df.sort_values(by='Date', inplace=True)

# # Reset the index of the final DataFrame
# final_df.reset_index(drop=True, inplace=True)

# import yfinance as yf

# # Define the VIX ticker symbol
# vix_ticker = "^VIX"

# # Create a Ticker object for the VIX
# vix = yf.Ticker(vix_ticker)

# # Fetch historical data for the VIX
# vix_data = vix.history(start="2019-01-01", end="2019-05-30")

# # Reset the index of vix_data to convert the Date index into a column
# vix_data.reset_index(inplace=True)

# # Ensure the date format matches that of final_df and set it as the 'Date' column
# vix_data['Date'] = pd.to_datetime(vix_data['Date'], format='%Y-%m-%d')

# # Now merge the two DataFrames on the 'Date' column using an inner join
# # This will only keep rows that have matching dates in both DataFrames
# # final_df = pd.merge(final_df, vix_data[['Date', 'Close']], on='Date', how='inner')

# # # Rename the 'Close' column to something more descriptive
# # final_df.rename(columns={'Close': 'VIX_Close'}, inplace=True)

# # Concatenate final_df and vix_data vertically using pd.concat()
# final_df = pd.concat([final_df, vix_data[['Date', 'Close']]], axis=0)

# # Reset the index of the final DataFrame
# final_df.reset_index(drop=True, inplace=True)

# # Rename the 'Close' column to something more descriptive
# final_df.rename(columns={'Close': 'VIX_Close'}, inplace=True)


# # Your final_df now has an additional column 'VIX_Close' and only contains rows
# # for which both text data and VIX data are available.


In [None]:
# vix_data['Close'].head() #treat the closing value for the day as the vix for the day

In [None]:
# import yfinance as yf
# import pandas as pd

# # Define the VIX ticker symbol
# vix_ticker = "^VIX"

# # Create a Ticker object for the VIX
# vix = yf.Ticker(vix_ticker)

# # Fetch historical data for the VIX from its earliest available date to the present
# vix_data = vix.history(start="2019-01-01", end="2019-05-29")

In [None]:
# final_df