In [None]:
!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1gDuwzgSk8rxUnQKR0Hyn70A5SMt1l4_9' -O data.zip

!unzip data.zip

import os
import pandas as pd

directory = './data/investopedia'

# os.listdir() returns entries in arbitrary order. Sorting makes the row order
# of the concatenated frame stable, which in turn keeps the seeded train/test
# split performed later in the notebook reproducible.
csv_paths = sorted(
    os.path.join(directory, filename)
    for filename in os.listdir(directory)
    if filename.endswith('.csv')
)
if not csv_paths:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No .csv files found in {directory!r}")

dataframes = [pd.read_csv(filepath) for filepath in csv_paths]
df = pd.concat(dataframes, ignore_index=True)

# Discard rows whose Title equals the literal 'No Title Found'.
df = df.loc[df['Title'] != 'No Title Found']

# Build one string per row. A separator is inserted between the two columns so
# the last word of the title does not fuse with the first word of the summary,
# and missing cells are treated as empty text rather than propagating as NaN.
title = df['Title'].fillna('').astype(str).str.strip()
summary = df['Summary'].fillna('').astype(str).str.strip()
text = (title + ' ' + summary).str.strip()

# Rows that end up with no text at all carry nothing to train on.
text = text[text != ''].reset_index(drop=True)



--2024-05-15 13:19:09--  https://drive.google.com/uc?export=download&id=1gDuwzgSk8rxUnQKR0Hyn70A5SMt1l4_9
Resolving drive.google.com (drive.google.com)... 172.253.118.113, 172.253.118.139, 172.253.118.101, ...
Connecting to drive.google.com (drive.google.com)|172.253.118.113|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1gDuwzgSk8rxUnQKR0Hyn70A5SMt1l4_9&export=download [following]
--2024-05-15 13:19:10--  https://drive.usercontent.google.com/download?id=1gDuwzgSk8rxUnQKR0Hyn70A5SMt1l4_9&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 142.251.175.132, 2404:6800:4003:c1c::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|142.251.175.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 988723 (966K) [application/octet-stream]
Saving to: ‘data.zip’


2024-05-15 13:19:13 (92.3 MB/s) - ‘data.zip’ saved [988723/98872

In [None]:
!pip install transformers[torch]
# !pip install torch
!pip install datasets

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [None]:
import json

import torch
from datasets import Dataset
from google.colab import drive
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Mount Google Drive at /content/drive; the output folder below lives on it.
drive.mount('/content/drive')

# Checkpoints, logs and the final model are all written under this folder.
# It is expected to sit in the root of the mounted Drive.
path_to_save = "/content/drive/My Drive/finbert_finetuned"

# Load the FinBERT tokenizer and weights by model id, with a masked-LM head.
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

# From here on `df` is the Series of combined title/summary strings (`text`).
df = text
# Dataset Preprocessing
def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    batch_texts = examples["text"]
    return tokenizer(batch_texts, **encode_options)

# Hold out 10% of the texts for evaluation; the fixed seed keeps the split stable.
train_texts, test_texts = train_test_split(
    df.tolist(),
    test_size=0.1,
    random_state=42,
)

# Wrap each split in a Dataset and tokenize it in batches. The raw "text"
# column is dropped so only the tokenizer's output columns remain.
train_dataset = Dataset.from_dict({"text": train_texts}).map(
    tokenize_function,
    batched=True,
    num_proc=1,
    remove_columns=["text"],
)
test_dataset = Dataset.from_dict({"text": test_texts}).map(
    tokenize_function,
    batched=True,
    num_proc=1,
    remove_columns=["text"],
)

# Collator for the masked-language-modelling objective, masking probability 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)


# Hyper-parameters and output locations for the fine-tuning run.
# NOTE(review): save_strategy="epoch" without a save_total_limit keeps one
# checkpoint per epoch under output_dir -- confirm the Drive has room for that.
training_args = TrainingArguments(
    # Output locations
    output_dir=path_to_save,
    logging_dir=f"{path_to_save}/logs",
    overwrite_output_dir=True,
    # Optimisation
    num_train_epochs=20,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
)


# Define the compute_metrics function to calculate perplexity
def compute_metrics(eval_pred):
    """Compute masked-language-model perplexity from evaluation predictions.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is array-like with
            the vocabulary on the last axis; ``labels`` is array-like with the
            same leading shape and holds target token ids. Positions labelled
            ``-100`` are excluded from the loss (this is expected to be how the
            MLM data collator marks tokens that were not masked).

    Returns:
        ``{"perplexity": float}`` where perplexity is ``exp`` of the mean
        cross-entropy over the scored positions. The value is NaN when no
        position is scored.
    """
    logits, labels = eval_pred
    # Accept NumPy arrays, tensors or nested lists; compute the loss in float32.
    logits = torch.as_tensor(logits).float()
    labels = torch.as_tensor(labels).long()

    # Masked LM: the logits at position i score the token at position i, so the
    # labels must NOT be shifted. The next-token shift belongs to causal LMs;
    # applying it here compares every prediction with the wrong target.
    vocab_size = logits.size(-1)
    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
    loss = loss_fct(logits.reshape(-1, vocab_size), labels.reshape(-1))

    perplexity = torch.exp(loss)
    return {"perplexity": perplexity.item()}


# Assemble the Trainer from the model, datasets, collator and metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning.
trainer.train()

# Write the final model and tokenizer to the output folder.
trainer.save_model(path_to_save)
tokenizer.save_pretrained(path_to_save)

# Also keep the trainer state and the training arguments as JSON next to the model.
trainer.state.save_to_json(f"{path_to_save}/trainer_state.json")

with open(f"{path_to_save}/training_args.json", "w") as args_file:
    json.dump(training_args.to_dict(), args_file)

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForMaskedLM were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4878 [00:00<?, ? examples/s]

Map:   0%|          | 0/542 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Perplexity
1,No log,3.517659,10717.564453
2,4.280100,2.759457,20862.453125
3,4.280100,2.47253,27508.4375
4,2.763900,2.276683,40724.296875
5,2.353000,2.159971,44856.320312
6,2.353000,2.107136,60343.742188
7,2.126600,2.055084,67783.492188
8,2.126600,1.950807,75854.929688
9,1.974700,1.872055,83309.3125
10,1.889400,1.866143,88378.523438


In [None]:
import os
import random

import torch
from google.colab import drive
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from transformers import AutoModelForMaskedLM, AutoTokenizer

# (Re-)mount Google Drive at /content/drive.
drive.mount("/content/drive", force_remount=True)

# Drive folder that holds the fine-tuned model files.
model_path = "/content/drive/My Drive/finbert_finetuned"
def list_model_path_elements(path):
    """Print the name of every entry in ``path``, one per line.

    Any exception raised while listing or printing is caught and reported on
    stdout as "An error occurred: ..." instead of propagating to the caller.
    """
    try:
        for entry_name in os.listdir(path):
            print(entry_name)
    except Exception as err:
        print(f"An error occurred: {err}")

# Print what is stored in the model folder on Drive.
list_model_path_elements(model_path)

# Restore the tokenizer and the masked-LM model from that folder.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForMaskedLM.from_pretrained(model_path)

# Download the NLTK resources requested by this notebook.
import nltk

for resource in ('stopwords', 'averaged_perceptron_tagger', 'punkt'):
    nltk.download(resource)

# English stop words, kept as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def mask_word_tokens(text, tokenizer, mask_probability=0.40):
    """
    Replace a random subset of the tokenizer's tokens with the mask token.

    A token is eligible only if it is alphanumeric, longer than one character
    and not in the module-level ``stop_words`` set. Punctuation, stop words,
    single characters and any token containing a non-alphanumeric character
    (for example a "##"-prefixed word piece) are therefore never masked.

    Args:
        text: Raw input string.
        tokenizer: Object providing ``tokenize``, ``mask_token`` and
            ``convert_tokens_to_string``.
        mask_probability: Fraction of the eligible tokens to mask. At least one
            token is masked whenever any token is eligible.

    Returns:
        The detokenized string with the chosen tokens replaced by
        ``tokenizer.mask_token``. If no token is eligible the detokenized text
        is returned without any mask.
    """
    tokens = tokenizer.tokenize(text)

    # Positions that may be masked. No POS tags are computed: the eligibility
    # test never used them.
    eligible_tokens = [
        i for i, token in enumerate(tokens)
        if token.lower() not in stop_words and len(token) > 1 and token.isalnum()
    ]

    masked_tokens = list(tokens)
    if eligible_tokens:
        # Mask at least one token, but never more than are eligible, because
        # random.sample raises ValueError when asked for more than it is given.
        num_tokens_to_mask = max(1, int(len(eligible_tokens) * mask_probability))
        num_tokens_to_mask = min(num_tokens_to_mask, len(eligible_tokens))
        for idx in random.sample(eligible_tokens, num_tokens_to_mask):
            masked_tokens[idx] = tokenizer.mask_token

    return tokenizer.convert_tokens_to_string(masked_tokens)

def predict_masked_tokens(test_text, model, tokenizer):
    """Mask part of ``test_text``, let ``model`` fill the masks, and print the result.

    Prints three lines: the original text, the masked text and the text with
    each mask replaced by the model's highest-scoring token.

    Args:
        test_text: Raw input string.
        model: Masked-LM model whose output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        None; the function only prints.
    """
    masked_text = mask_word_tokens(test_text, tokenizer, mask_probability=0.10)
    print(f"Original text ---- {test_text}")
    print(f"Masked text ---- {masked_text}")

    # Truncate so an over-long text cannot exceed the model's maximum input
    # length, and run on whichever device the model lives on.
    inputs = tokenizer(masked_text, return_tensors="pt", truncation=True)
    inputs = inputs.to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Sequence positions of the mask tokens (the batch holds a single sequence).
    mask_token_indices = torch.where(inputs.input_ids == tokenizer.mask_token_id)[1]

    # Highest-scoring vocabulary entry at each masked position, as text.
    predicted_tokens = []
    for index in mask_token_indices:
        predicted_token_id = logits[0, index].argmax(dim=-1)
        predicted_tokens.append(tokenizer.decode(predicted_token_id))

    # Fill the masks left to right. The tokenizer's own mask string is used so
    # this stays in sync with whatever mask_word_tokens inserted.
    output_text = masked_text
    for predicted_token in predicted_tokens:
        output_text = output_text.replace(tokenizer.mask_token, predicted_token, 1)

    print(f"Predicted text: {output_text}")

# Sample passage used to try out mask prediction with the loaded model.
test_text = (
    "Unsecured Debt Definition: Unsecured debts are loans that are not collateralized. "
    "They generally require higher interest rates because they offer the lender limited protection against default. "
    "Lenders can mitigate this risk by reporting defaults to credit rating agencies."
)
predict_masked_tokens(test_text, model, tokenizer)


Mounted at /content/drive
logs
checkpoint-305
checkpoint-610
checkpoint-915
checkpoint-1220
checkpoint-1525
checkpoint-1830
checkpoint-2135
checkpoint-2440
checkpoint-2745
checkpoint-3050
checkpoint-3355
checkpoint-3660
checkpoint-3965
checkpoint-4270
checkpoint-4575
checkpoint-4880
checkpoint-5185
checkpoint-5490
checkpoint-5795
checkpoint-6100
model.safetensors
config.json
trainer_state.json
special_tokens_map.json
training_args.bin
training_args.json
generation_config.json
vocab.txt
tokenizer.json
tokenizer_config.json
Original text ---- Unsecured Debt Definition: Unsecured debts are loans that are not collateralized. They generally require higher interest rates because they offer the lender limited protection against default. Lenders can mitigate this risk by reporting defaults to credit rating agencies.
Masked text ---- unsecured debt definition : unsecured debts are loans that are not collateralized. they generally require higher interest rates because they offer the lender limit

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import os
import random
import shutil

import torch
from google.colab import drive
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from transformers import AutoModelForMaskedLM, AutoTokenizer

# (Re-)mount Google Drive at /content/drive.
drive.mount("/content/drive", force_remount=True)

# Source folder on Drive and the local folder the model files are copied into.
model_path = "/content/drive/My Drive/finbert_finetuned"
local_model_path = "./finbert_finetuned"

# Files copied from Drive before loading: weights, config and tokenizer assets.
necessary_files = [
    "model.safetensors",
    "config.json",
    "vocab.txt",
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json",
]

# Mirror those files into the local folder, creating it if needed.
os.makedirs(local_model_path, exist_ok=True)
for file_name in necessary_files:
    shutil.copy(
        os.path.join(model_path, file_name),
        os.path.join(local_model_path, file_name),
    )

# Load the tokenizer and the masked-LM model from the local copy.
tokenizer = AutoTokenizer.from_pretrained(local_model_path)
model = AutoModelForMaskedLM.from_pretrained(local_model_path)

# Download the NLTK resources requested by this notebook.
import nltk

for resource in ('stopwords', 'averaged_perceptron_tagger', 'punkt'):
    nltk.download(resource)

# English stop words, kept as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def mask_word_tokens(text, tokenizer, mask_probability=0.40):
    """
    Replace a random subset of the tokenizer's tokens with the mask token.

    A token is eligible only if it is alphanumeric, longer than one character
    and not in the module-level ``stop_words`` set. Punctuation, stop words,
    single characters and any token containing a non-alphanumeric character
    (for example a "##"-prefixed word piece) are therefore never masked.

    Args:
        text: Raw input string.
        tokenizer: Object providing ``tokenize``, ``mask_token`` and
            ``convert_tokens_to_string``.
        mask_probability: Fraction of the eligible tokens to mask. At least one
            token is masked whenever any token is eligible.

    Returns:
        The detokenized string with the chosen tokens replaced by
        ``tokenizer.mask_token``. If no token is eligible the detokenized text
        is returned without any mask.
    """
    tokens = tokenizer.tokenize(text)

    # Positions that may be masked. No POS tags are computed: the eligibility
    # test never used them.
    eligible_tokens = [
        i for i, token in enumerate(tokens)
        if token.lower() not in stop_words and len(token) > 1 and token.isalnum()
    ]

    masked_tokens = list(tokens)
    if eligible_tokens:
        # Mask at least one token, but never more than are eligible, because
        # random.sample raises ValueError when asked for more than it is given.
        num_tokens_to_mask = max(1, int(len(eligible_tokens) * mask_probability))
        num_tokens_to_mask = min(num_tokens_to_mask, len(eligible_tokens))
        for idx in random.sample(eligible_tokens, num_tokens_to_mask):
            masked_tokens[idx] = tokenizer.mask_token

    return tokenizer.convert_tokens_to_string(masked_tokens)

def predict_masked_tokens(test_text, model, tokenizer):
    """Mask part of ``test_text``, let ``model`` fill the masks, and print the result.

    Prints three lines: the original text, the masked text and the text with
    each mask replaced by the model's highest-scoring token.

    Args:
        test_text: Raw input string.
        model: Masked-LM model whose output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        None; the function only prints.
    """
    masked_text = mask_word_tokens(test_text, tokenizer, mask_probability=0.10)
    print(f"Original text ---- {test_text}")
    print(f"Masked text ---- {masked_text}")

    # Truncate so an over-long text cannot exceed the model's maximum input
    # length, and run on whichever device the model lives on.
    inputs = tokenizer(masked_text, return_tensors="pt", truncation=True)
    inputs = inputs.to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Sequence positions of the mask tokens (the batch holds a single sequence).
    mask_token_indices = torch.where(inputs.input_ids == tokenizer.mask_token_id)[1]

    # Highest-scoring vocabulary entry at each masked position, as text.
    predicted_tokens = []
    for index in mask_token_indices:
        predicted_token_id = logits[0, index].argmax(dim=-1)
        predicted_tokens.append(tokenizer.decode(predicted_token_id))

    # Fill the masks left to right. The tokenizer's own mask string is used so
    # this stays in sync with whatever mask_word_tokens inserted.
    output_text = masked_text
    for predicted_token in predicted_tokens:
        output_text = output_text.replace(tokenizer.mask_token, predicted_token, 1)

    print(f"Predicted text: {output_text}")

# Sample passage used to try out mask prediction with the loaded model.
test_text = (
    "Unsecured Debt Definition: Unsecured debts are loans that are not collateralized. "
    "They generally require higher interest rates because they offer the lender limited protection against default. "
    "Lenders can mitigate this risk by reporting defaults to credit rating agencies."
)
predict_masked_tokens(test_text, model, tokenizer)


Mounted at /content/drive
Original text ---- Unsecured Debt Definition: Unsecured debts are loans that are not collateralized. They generally require higher interest rates because they offer the lender limited protection against default. Lenders can mitigate this risk by reporting defaults to credit rating agencies.
Masked text ---- unsecured debt definition : unsecured debts are loans that are not collateralized. they generally require higher interest rates because they offer the lender limited protection against [MASK]. lenders can mitigate this risk by [MASK] defaults to credit rating agencies.
Predicted text: unsecured debt definition : unsecured debts are loans that are not collateralized. they generally require higher interest rates because they offer the lender limited protection against default. lenders can mitigate this risk by reporting defaults to credit rating agencies.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
