In [None]:
# Install gdown package
!pip install gdown

# Download the file from Google Drive
!gdown --id 1K-_LJP2Ux3KtTtG98xSkKHZUi2q_xNcq -O models.zip

# Unzip the downloaded file
!unzip models.zip -d /content/models


Downloading...
From (original): https://drive.google.com/uc?id=1K-_LJP2Ux3KtTtG98xSkKHZUi2q_xNcq
From (redirected): https://drive.google.com/uc?id=1K-_LJP2Ux3KtTtG98xSkKHZUi2q_xNcq&confirm=t&uuid=5c0b6e67-9c18-4732-aade-de9e98d418f6
To: /content/models.zip
100% 408M/408M [00:03<00:00, 119MB/s]
Archive:  models.zip
replace /content/models/config.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: /content/models/config.json  
  inflating: /content/models/special_tokens_map.json  
  inflating: /content/models/tokenizer.json  
  inflating: /content/models/model.safetensors  
  inflating: /content/models/tokenizer_config.json  
  inflating: /content/models/vocab.txt  
  inflating: /content/models/nlp_lstm_finetuned.pth  


In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
import random
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
import os

# Local directory holding the fine-tuned model files (the directory that
# models.zip, downloaded from Google Drive, was extracted into)
model_path = "/content/models"

def list_model_path_elements(path):
    """Print the name of every entry found directly inside ``path``.

    Any exception raised while listing or printing is caught and reported on
    stdout as ``An error occurred: ...`` instead of being propagated.
    """
    try:
        for entry_name in os.listdir(path):
            print(entry_name)
    except Exception as err:
        print(f"An error occurred: {err}")

# Show which files are present in the model directory
list_model_path_elements(model_path)

# Tokenizer and masked-LM weights are both read from the same directory
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForMaskedLM.from_pretrained(model_path)

# Fetch the NLTK resources used by this notebook, in the same order as before
import nltk
for nltk_resource in ('stopwords', 'averaged_perceptron_tagger', 'punkt'):
    nltk.download(nltk_resource)

# English stopwords, kept as a set for fast membership tests
stop_words = set(stopwords.words('english'))

def mask_word_tokens(text, tokenizer, mask_probability=0.40, rng=None):
    """
    Randomly replace a share of the eligible tokens in ``text`` with the mask token.

    A token is eligible when it is not in the module-level ``stop_words`` set
    (compared in lower case), is longer than one character, and is purely
    alphanumeric (``str.isalnum``). This leaves out punctuation and any token
    that contains a non-alphanumeric character.

    Args:
        text: Input string to tokenize and mask.
        tokenizer: Object providing ``tokenize``, ``mask_token`` and
            ``convert_tokens_to_string``.
        mask_probability: Fraction of the eligible tokens to mask. At least one
            token is masked whenever any token is eligible, and never more than
            the number of eligible tokens.
        rng: Optional ``random.Random``-like object used for sampling, so the
            masking can be made reproducible. Defaults to the global ``random``
            module.

    Returns:
        The string rebuilt from the tokens with the sampled positions replaced
        by ``tokenizer.mask_token``. When no token is eligible, the string is
        rebuilt from the unmodified tokens (nothing is masked).
    """
    tokens = tokenizer.tokenize(text)

    # Positions of tokens that may be masked
    eligible_tokens = [
        i for i, token in enumerate(tokens)
        if token.lower() not in stop_words and len(token) > 1 and token.isalnum()
    ]

    masked_tokens = list(tokens)
    if eligible_tokens:
        # Clamp to [1, len(eligible_tokens)]: random.sample raises ValueError
        # when asked for more items than the population holds.
        num_tokens_to_mask = min(
            len(eligible_tokens),
            max(1, int(len(eligible_tokens) * mask_probability)),
        )
        sampler = random if rng is None else rng
        for idx in sampler.sample(eligible_tokens, num_tokens_to_mask):
            masked_tokens[idx] = tokenizer.mask_token

    return tokenizer.convert_tokens_to_string(masked_tokens)

def predict_masked_tokens(text_list, model, tokenizer):
    """
    Mask a few tokens in each text, let the masked-LM fill them in, and print the result.

    For every text, three lines are printed: the original text, the masked
    text, and the masked text with the model's highest-scoring token
    substituted for each mask, in order of appearance.

    Args:
        text_list: Iterable of input strings.
        model: Masked language model; called as ``model(**inputs)`` and
            expected to return an object with a ``logits`` attribute.
        tokenizer: Tokenizer belonging to ``model``; must define ``mask_token``
            and ``mask_token_id``.

    Returns:
        None. Results are only printed.
    """
    # Use the tokenizer's own mask string rather than assuming "[MASK]"
    mask_token = tokenizer.mask_token

    for test_text in text_list:
        masked_text = mask_word_tokens(test_text, tokenizer, mask_probability=0.10)
        print(f"Original text ---- {test_text}")
        print(f"Masked text ---- {masked_text}")

        # Truncate so that long texts do not exceed the model's maximum input length
        inputs = tokenizer(masked_text, return_tensors="pt", truncation=True)

        with torch.no_grad():
            logits = model(**inputs).logits

        # Find the indices of the masked tokens
        mask_token_indices = torch.where(inputs.input_ids == tokenizer.mask_token_id)[1]

        # Decode the highest-scoring token at every masked position
        predicted_tokens = [
            tokenizer.decode(logits[0, index].argmax(axis=-1))
            for index in mask_token_indices
        ]

        # Rebuild the text by interleaving the pieces between masks with the
        # predictions. Splitting once (instead of repeated str.replace) means a
        # prediction that itself contains the mask string is never substituted
        # again. Masks lost to truncation have no prediction and stay in place.
        pieces = masked_text.split(mask_token)
        rebuilt = [pieces[0]]
        for position, piece in enumerate(pieces[1:]):
            if position < len(predicted_tokens):
                rebuilt.append(predicted_tokens[position])
            else:
                rebuilt.append(mask_token)
            rebuilt.append(piece)
        output_text = "".join(rebuilt)

        print(f"Predicted text: {output_text}")


# Finance-related passages used to exercise the masked-token prediction
test_texts = [
    "Derivatives are financial contracts, set between two or more parties, that derive their value from an underlying asset, group of assets, or benchmark.",
    "To hedge, in finance, is to take an offsetting position in an asset or investment that reduces the price risk of an existing position. A hedge is therefore a trade that is made with the purpose of reducing the risk of adverse price movements in another asset. Normally, a hedge consists of taking the opposite position in a related security or in a derivative security based on the asset to be hedged.",
    "Financial exposure is the amount an investor stands to lose in an investment should the investment fail. For example, the financial exposure involved in purchasing a car would be the initial investment amount minus the insured portion. Knowing and understanding financial exposure, which is an alternative name for risk, is a crucial part of the investment process.",
    "Unsecured Debt Definition: Unsecured debts are loans that are not collateralized. They generally require higher interest rates because they offer the lender limited protection against default. Lenders can mitigate this risk by reporting defaults to credit rating agencies.",
    "Market capitalization, or market cap, represents the total dollar market value of a company's outstanding shares of stock. Investors use this figure to determine a company's size instead of sales or total asset value. In an acquisition, the market cap helps determine whether a takeover candidate represents a good value for the acquirer.",
]

predict_masked_tokens(test_texts, model=model, tokenizer=tokenizer)


model.safetensors
vocab.txt
special_tokens_map.json
tokenizer.json
tokenizer_config.json
nlp_lstm_finetuned.pth
config.json


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Original text ---- Derivatives are financial contracts, set between two or more parties, that derive their value from an underlying asset, group of assets, or benchmark.
Masked text ---- derivatives are financial [MASK], set between two or more parties, that derive their value from an underlying asset, group of assets, or benchmark.
Predicted text: derivatives are financial instruments, set between two or more parties, that derive their value from an underlying asset, group of assets, or benchmark.
Original text ---- To hedge, in finance, is to take an offsetting position in an asset or investment that reduces the price risk of an existing position. A hedge is therefore a trade that is made with the purpose of reducing the risk of adverse price movements in another asset. Normally, a hedge consists of taking the opposite position in a related security or in a derivative security based on the asset to be hedged.
Masked text ---- to hedge, in finance, is to take an offsetting position in

In [None]:

import gdown

url = 'https://drive.google.com/uc?id=1Ia_4o4JJDoyIkPcbMC_eZbWE_1DpuerL'
output = 'data.zip'

# gdown.download is documented to return the output path on success and None
# when the download fails. Stop here in that case rather than silently
# extracting a stale data.zip left over from an earlier run.
downloaded_path = gdown.download(url, output, quiet=False)
if downloaded_path is None:
    raise RuntimeError(f"Download failed: {url}")

import zipfile
import os

# Extract the archive into /content
with zipfile.ZipFile(downloaded_path, 'r') as zip_ref:
    zip_ref.extractall('/content')


Downloading...
From: https://drive.google.com/uc?id=1Ia_4o4JJDoyIkPcbMC_eZbWE_1DpuerL
To: /content/data.zip
100%|██████████| 12.2M/12.2M [00:00<00:00, 25.7MB/s]


In [None]:
!pip install langchain

Collecting langchain
  Downloading langchain-0.2.2-py3-none-any.whl (973 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/973.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m256.0/973.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.6/973.6 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_core-0.2.4-py3-none-any.whl (310 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.4/310.4 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.1-py3-none-any.whl (23 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.74-py3-none-any.whl (124 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
import torch.nn as nn
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the evaluation CSV into a DataFrame
csv_file = '/content/data/updated_file_pre_2006.csv'
data = pd.read_csv(csv_file)

# Use the GPU when torch can see one, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer and encoder are both loaded from the model directory;
# the encoder is then moved onto the selected device
model_path = "/content/models"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)
model.to(device)

# Splitter used to cut each document into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", "!", "?", ",", " "],
    chunk_size=512,
    chunk_overlap=50,
)

def split_text(text):
    """Return the chunks that the module-level ``text_splitter`` produces for ``text``."""
    return text_splitter.split_text(text)

# LSTM classification head applied to embedding vectors
class LSTMClassifier(nn.Module):
    """LSTM followed by a linear layer that maps embedding vectors to class scores.

    ``forward`` returns the raw output of the linear layer. The ``softmax``
    attribute is created in ``__init__`` but is not applied in ``forward``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # x is expected as (N, input_dim). Because the LSTM is batch_first,
        # inserting axis 1 gives (N, 1, input_dim): every row is treated as
        # its own sequence of length 1.
        sequences = x.unsqueeze(1)
        hidden_states = self.lstm(sequences)[0]
        # Output at the final time step of each sequence
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)

input_dim = 768  # The dimension of the embeddings
hidden_dim = 128
output_dim = 2  # Positive or Negative
num_layers = 2

# Build the classifier and load its trained weights
lstm_model = LSTMClassifier(input_dim, hidden_dim, output_dim, num_layers)
# map_location remaps the checkpoint's tensors onto the selected device, so a
# checkpoint saved from a GPU session also loads on a CPU-only runtime.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
lstm_state = torch.load('/content/models/nlp_lstm_finetuned.pth', map_location=device)
lstm_model.load_state_dict(lstm_state)
lstm_model.to(device)
lstm_model.eval()

# Function to get CLS embeddings for the sample chunks
def get_cls_embeddings(texts, tokenizer, model, device, batch_size=None):
    """
    Encode ``texts`` with ``model`` and return the first-position hidden state of each text.

    Args:
        texts: A single string or a sequence of strings.
        tokenizer: Callable tokenizer; invoked with padding enabled and
            truncation to at most 512 tokens.
        model: Encoder whose output exposes ``last_hidden_state``.
        device: Device the tokenized inputs are moved to before the forward pass.
        batch_size: Optional maximum number of texts per forward pass, to bound
            memory use on long documents. ``None`` (the default) encodes all
            texts in one batch, which is the previous behaviour.

    Returns:
        ``numpy.ndarray`` of shape ``(len(texts), hidden_size)`` holding the
        hidden state at position 0 (the [CLS] position) for every text.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is not positive.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        raise ValueError("texts must contain at least one string")
    if batch_size is None:
        batch_size = len(texts)
    elif batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    batch_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            # Position 0 of every sequence -> [batch_size, hidden_size]; moved to
            # the CPU right away so device memory is released batch by batch
            batch_embeddings.append(outputs.last_hidden_state[:, 0, :].cpu())

    return torch.cat(batch_embeddings, dim=0).numpy()

# Encoder fitted on the two class ids (assuming 0 for Negative and 1 for Positive)
label_encoder = LabelEncoder().fit([0, 1])

# Function to test the model accuracy
def test_model_accuracy(data, num_samples, tokenizer, model, lstm_model, device, aggregate="first"):
    """
    Classify the first ``num_samples`` rows of ``data`` and check the accuracy.

    Each row's ``content`` is split into chunks, every chunk is embedded with
    ``model``, and ``lstm_model`` scores each chunk embedding. The per-row
    prediction is compared with the ``Classification`` column (``1.0`` counts
    as Positive, anything else as Negative). Per-row results and the final
    accuracy are printed.

    Args:
        data: DataFrame with ``content`` and ``Classification`` columns.
        num_samples: Number of leading rows to evaluate; capped at ``len(data)``.
        tokenizer: Tokenizer passed to ``get_cls_embeddings``.
        model: Encoder passed to ``get_cls_embeddings``.
        lstm_model: Classifier returning one row of class scores per chunk.
        device: Device the chunk embeddings are moved to.
        aggregate: How chunk scores become one document prediction.
            ``"first"`` (default, the previous behaviour) uses only the first
            chunk's scores; ``"mean"`` averages the scores over all chunks.
            NOTE(review): with ``"first"`` every chunk after the first is
            embedded but does not influence the prediction - confirm this is
            intended.

    Returns:
        None.

    Raises:
        ValueError: If ``aggregate`` is unknown or there are no rows to evaluate.
        RuntimeError: If none of the selected rows produced any text chunk.
        AssertionError: If the accuracy is below 80%.
    """
    if aggregate not in ("first", "mean"):
        raise ValueError(f"Unknown aggregate mode: {aggregate!r}")

    # Never index past the end of the frame, and avoid dividing by zero below
    num_samples = min(num_samples, len(data))
    if num_samples <= 0:
        raise ValueError("num_samples must be positive and data must not be empty")

    correct_predictions = 0
    evaluated_samples = 0

    for i in range(num_samples):
        sample_content = data['content'].iloc[i]
        actual_classification = data['Classification'].iloc[i]

        sample_chunks = split_text(sample_content)
        if not sample_chunks:
            # Nothing to embed for this row; leave it out of the accuracy
            print(f"Skipping row {i}: no text chunks produced\n")
            continue

        # Get embeddings for the sample chunks
        sample_embeddings = get_cls_embeddings(sample_chunks, tokenizer, model, device)

        # Prepare the embeddings for the LSTM model
        sample_embeddings_tensor = torch.tensor(sample_embeddings, dtype=torch.float32).to(device)

        # Predict the classification
        with torch.no_grad():
            outputs = lstm_model(sample_embeddings_tensor)  # one row of scores per chunk
            if aggregate == "mean":
                scores = outputs.mean(dim=0, keepdim=True)
            else:
                scores = outputs
            _, predicted = torch.max(scores, 1)
            # Row 0 is the first chunk ("first") or the averaged scores ("mean")
            predicted_label = int(predicted[0].item())

        # Transform actual classification label
        actual_label_transformed = 1 if actual_classification == 1.0 else 0

        # Check if the prediction is correct
        evaluated_samples += 1
        if predicted_label == actual_label_transformed:
            correct_predictions += 1

        # Print the classification
        classification = label_encoder.inverse_transform([predicted_label])[0]
        print(f"Content: {sample_content[:100]}...")  # Print first 100 characters of content for brevity
        print(f"Actual Classification: {'Positive' if actual_label_transformed == 1 else 'Negative'}")
        print(f"Predicted Classification: {'Positive' if classification == 1 else 'Negative'}\n")

    if evaluated_samples == 0:
        raise RuntimeError("None of the selected rows produced any text chunk")

    accuracy = correct_predictions / evaluated_samples
    print(f"Accuracy: {accuracy * 100:.2f}%")

    # Assert if accuracy is at least 80%
    assert accuracy >= 0.80, f"Test failed with accuracy {accuracy * 100:.2f}%"

# Run the test on the first 10 rows of the dataset; test_model_accuracy raises
# AssertionError when the resulting accuracy is below 80%
test_model_accuracy(data, num_samples=10, tokenizer=tokenizer, model=model, lstm_model=lstm_model, device=device)


Some weights of BertModel were not initialized from the model checkpoint at /content/models and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Content: I discovered when I joined the Board of Governors of the Federal Reserve System about six months ago...
Actual Classification: Negative
Predicted Classification: Negative

Content: I am privileged to accept the Union League of Philadelphia's Abraham Lincoln award.  This is the fir...
Actual Classification: Positive
Predicted Classification: Positive

Content: The Challenge of Central Banking in a Democratic SocietyGood evening ladies and gentlemen.  I am esp...
Actual Classification: Positive
Predicted Classification: Negative

Content: It is a pleasure to be with you this morning to discuss private-sector payments risk management in o...
Actual Classification: Positive
Predicted Classification: Positive

Content: It is a pleasure to be here and participate in your discussions of current changes in bank regulator...
Actual Classification: Positive
Predicted Classification: Negative

Content: The Transformation of the U.S. Banking Industry and Resulting Challenges to Regulators

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# Read the dataset from disk
csv_file = '/content/data/updated_file_pre_2006.csv'
data = pd.read_csv(csv_file)

# word_tokenize relies on NLTK's punkt models
nltk.download('punkt')

# Coerce every 'content' value to str, tokenize it, and store the per-row token count
data['token_count'] = data['content'].map(str).map(word_tokenize).map(len)
average_tokens = data['token_count'].mean()

print(f"The average number of tokens in the 'content' column is {average_tokens}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


The average number of tokens in the 'content' column is 43496.335913312694
