In [None]:
import os
import sys
os.chdir("../../../../../../../../")
os.chdir("home/user/mnt/degelin/thesis")
sys.path.append("tune_sets/")
!pip install matplotlib==3.7.0
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging
from datetime import datetime
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, InputExample, evaluation
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import torch.nn.functional as F
#import transformers
#import nltk
#import gzip
#import csv
#### Just some code to print debug information to stdout
logging.basicConfig(
    format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout

In [12]:
# give file_name without '.csv'!
def create_save_path(model_name):
    if '/' in model_name:
        model_name_parts = model_name.split('/')
        model_name_save_path = '-'.join(model_name_parts)
    else:
        model_name_save_path = model_name
    return model_name_save_path
def truncate_text(text, max_length, tokenizer):
    # Tokenize and encode the text, truncating to max_length
    encoded = tokenizer.encode(text, max_length=max_length, truncation=True)
    # Decode back to text
    truncated_text = tokenizer.decode(encoded, skip_special_tokens=True)
    # Double-check the encoded length
    final_encoded = tokenizer.encode(truncated_text, max_length=max_length, truncation=True)
    if len(final_encoded) > max_length:
        print(f"Warning: Truncated text still exceeds max length. Original length: {len(tokenizer.encode(text))}, Truncated length: {len(final_encoded)}")
        return None
    return truncated_text
def load_as_samples(file_name, max_length, tokenizer):
    error_tokenizations = 0
    base_path = 'tune_sets/llm/'
    test_part = pd.read_csv(base_path + file_name + '_test_part.csv')
    train_part = pd.read_csv(base_path + file_name + '_train_part.csv')
    val_part = pd.read_csv(base_path + file_name + '_val_part.csv')
    train_samples = []
    test_samples = []
    val_samples = []
    for i in range(len(train_part)):
        if i % 1000 == 0 :
            print("processing i", i)
        # Check if 'PROJECT_TITLE' and 'grant_text' are not missing
        text1 = train_part.iloc[i]['PROJECT_DESCRIPTION']
        text2 = train_part.iloc[i]['GRANT_DESCRIPTION']
        text1 = truncate_text(text1, max_length, tokenizer)
        text2 = truncate_text(text2, max_length, tokenizer)
        if text1 is None or text2 is None:
            error_tokenizations +=1
            continue
        inp_example = InputExample(texts=[text1, text2], label=float(train_part.iloc[i]['label']))
        train_samples.append(inp_example)
    del train_part
    for i in range(len(val_part)):
        if i % 1000 == 0 :
            print("processing i", i)
        # Check if 'PROJECT_TITLE' and 'grant_text' are not missing
        text1 = val_part.iloc[i]['PROJECT_DESCRIPTION']
        text2 = val_part.iloc[i]['GRANT_DESCRIPTION']
        text1 = truncate_text(text1, max_length, tokenizer)
        text2 = truncate_text(text2, max_length, tokenizer)
        if text1 is None or text2 is None:
            error_tokenizations +=1
            continue
        inp_example = InputExample(texts=[text1, text2], label=float(val_part.iloc[i]['label']))
        val_samples.append(inp_example)
    del val_part
    for i in range(len(test_part)):
        if i % 1000 == 0 :
            print("processing i", i)
        # Check if 'PROJECT_TITLE' and 'grant_text' are not missing
        text1 = test_part.iloc[i]['PROJECT_DESCRIPTION']
        text2 = test_part.iloc[i]['GRANT_DESCRIPTION']
        text1 = truncate_text(text1, max_length, tokenizer)
        text2 = truncate_text(text2, max_length, tokenizer)
        if text1 is None or text2 is None:
            error_tokenizations +=1
            continue
        inp_example = InputExample(texts=[text1, text2], label=float(test_part.iloc[i]['label']))
        test_samples.append(inp_example)
    del test_part
    return train_samples,val_samples,test_samples, error_tokenizations
def fine_tune(model_name, file_name):
    #!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu122
    import torch
    print("PyTorch version:", torch.__version__)
    print("CUDA available:", torch.cuda.is_available())
    print("CUDA version:", torch.version.cuda)
    print("Available devices:")
    print(torch.cuda.device_count())
    for i in range(torch.cuda.device_count()):
        print(f"  Device {i}: {torch.cuda.get_device_name(i)}")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    model_name_save_path = create_save_path(model_name)
    model_save_path = (
    "good_code/fine_tuned_models/" + file_name + "/" + "fine_tuning-" + model_name_save_path + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    )
    max_token_length = 255
    original_model = SentenceTransformer(model_name)
    fine_tuned_model = SentenceTransformer(model_name)
    original_model.max_seq_length = max_token_length
    fine_tuned_model.max_seq_length = max_token_length
    tokenizer = original_model.tokenizer
    train_batch_size = 32
    num_epochs = 4
    
    eval_frequency = 0.1
    train_samples,val_samples,test_samples, error_tokenizations = load_as_samples(file_name, max_token_length, tokenizer)
    train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=fine_tuned_model)
    val_dataloader = DataLoader(val_samples, shuffle=False, batch_size=train_batch_size)
    val_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples, name="validation")
    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))
    
    steps_per_epoch = len(train_dataloader)
    evaluation_steps = max(1, int(steps_per_epoch * eval_frequency))
    logging.info(f"Steps per epoch: {steps_per_epoch}")
    logging.info(f"Evaluation steps: {evaluation_steps}")

    print("Checking DataLoader")
    #for batch in train_dataloader:
    #    print("Batch loaded:", batch)
    #    break
    for param in fine_tuned_model.parameters():
        param.data = param.data.contiguous()
    fine_tuned_model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=val_evaluator,
        epochs=num_epochs,
        evaluation_steps=evaluation_steps,
        warmup_steps=warmup_steps,
        output_path=model_save_path,
    )
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="model-evaluation")
    original_results = evaluator(original_model)
    fine_tuned_results = evaluator(fine_tuned_model)
    print("Original model results:")
    print(original_results)
    print("Fine-tuned model results:")
    print(fine_tuned_results)
    eval_path = os.path.join(model_save_path, 'eval')
    save2_path = os.path.join(eval_path, 'original_vs_tuned.txt')
    with open(save2_path, 'w') as file:
        file.write("Original Model Results:\n")
        file.write(f"{original_results}\n\n")
        file.write("Fine-Tuned Model Results:\n")
        file.write(f"{fine_tuned_results}\n")
        file.write("Error_tokenizations:\n")
        file.write(f"{error_tokenizations}\n")
    
    print(f"Results saved to {save2_path}")
    # Extract labels from InputExample objects
    true_labels = [example.label for example in test_samples]
    
    # Encode test sentences to obtain embeddings
    test_embeddings_1 = fine_tuned_model.encode([example.texts[0] for example in test_samples], 
                                            convert_to_tensor=True)
    test_embeddings_2 = fine_tuned_model.encode([example.texts[1] for example in test_samples], 
                                            convert_to_tensor=True)
    
    # Compute cosine similarity between pairs of embeddings
    cosine_similarities = [F.cosine_similarity(embedding1.unsqueeze(0), embedding2.unsqueeze(0)).item() 
                           for embedding1, embedding2 in zip(test_embeddings_1, test_embeddings_2)]
    thresholds = np.linspace(0, 1, 100)
    
    # Initialize lists to store accuracy for each threshold
    accuracies = []
    
    # Loop through each threshold
    for threshold in thresholds:
        # Classify based on cosine similarity and the current threshold
        predictions = [1 if cosine_sim > threshold else 0 for cosine_sim in cosine_similarities]
        
        # Compute accuracy
        accuracy = accuracy_score(true_labels, predictions)
        
        # Append accuracy to the list
        accuracies.append(accuracy)
    
    # Plot the accuracy for different thresholds
    plt.plot(thresholds, accuracies)
    plt.xlabel('Threshold')
    plt.ylabel('Accuracy')
    plt.grid(True)
    
    # Save the accuracy plot
    plot_path = os.path.join(eval_path, 'accuracy_plot.png')
    plt.savefig(plot_path)
    plt.close()
    
    # Find the best threshold
    best_threshold = thresholds[np.argmax(accuracies)]
    print("Threshold with highest accuracy:", best_threshold)
    
    # Classify based on cosine similarity (for example, using the best threshold)
    predictions = [1 if cosine_sim > best_threshold else 0 for cosine_sim in cosine_similarities]
    
    # Compute accuracy
    accuracy = accuracy_score(true_labels, predictions)
    
    # Compute other classification metrics
    classification_metrics = classification_report(true_labels, predictions)
    
    # Print the results
    print("Accuracy:", accuracy)
    print("Classification Metrics:\n", classification_metrics)
    
    # Save the classification metrics
    metrics_path = os.path.join(eval_path, 'classification_metrics.txt')
    with open(metrics_path, 'w') as file:
        file.write(f"Threshold with highest accuracy: {best_threshold}\n")
        file.write(f"Accuracy: {accuracy}\n\n")
        file.write("Classification Metrics:\n")
        file.write(classification_metrics)
    print(f"Classification metrics saved to {metrics_path}")
    print(f"Accuracy plot saved to {plot_path}")

In [13]:
set_data_names = []
model_names = []

In [14]:
for dataset_idx, dataset_name in enumerate(set_data_names):
    for model_idx, model_name in enumerate(model_names):
        # Show progress
        print(f"Now doing dataset {dataset_idx + 1} of {len(set_data_names)} on model {model_idx + 1} of {len(model_names)}: {model_name}")
        
        # Call the fine_tune function
        fine_tune(model_name, dataset_name)

Now doing dataset 1 of 1 on model 1 of 1: google/electra-base-discriminator
PyTorch version: 2.4.0+cu121
CUDA available: True
CUDA version: 12.1
Available devices:
1
  Device 0: NVIDIA RTX 6000 Ada Generation
Using device: cuda
2024-08-01 21:01:06 - Load pretrained SentenceTransformer: google/electra-base-discriminator




2024-08-01 21:01:06 - No sentence-transformers model found with name /home/user/.cache/torch/sentence_transformers/google_electra-base-discriminator. Creating a new one with MEAN pooling.
2024-08-01 21:01:06 - Use pytorch device: cuda
2024-08-01 21:01:06 - Load pretrained SentenceTransformer: google/electra-base-discriminator
2024-08-01 21:01:07 - No sentence-transformers model found with name /home/user/.cache/torch/sentence_transformers/google_electra-base-discriminator. Creating a new one with MEAN pooling.
2024-08-01 21:01:07 - Use pytorch device: cuda
processing i 0
processing i 1000
processing i 2000
processing i 3000
processing i 4000
processing i 5000
processing i 6000
processing i 7000
processing i 8000
processing i 9000
processing i 10000
processing i 11000
processing i 12000
processing i 13000
processing i 14000
processing i 15000
processing i 16000
processing i 17000
processing i 18000
processing i 19000
processing i 20000
processing i 21000
processing i 22000
processing i 

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2830 [00:00<?, ?it/s]

2024-08-01 21:32:17 - EmbeddingSimilarityEvaluator: Evaluating the model on validation dataset in epoch 0 after 283 steps:
2024-08-01 21:33:35 - Cosine-Similarity :	Pearson: 0.0083	Spearman: 0.0106
2024-08-01 21:33:35 - Manhattan-Distance:	Pearson: 0.0167	Spearman: 0.0175
2024-08-01 21:33:35 - Euclidean-Distance:	Pearson: 0.0096	Spearman: 0.0116
2024-08-01 21:33:35 - Dot-Product-Similarity:	Pearson: 0.0044	Spearman: 0.0003
2024-08-01 21:33:35 - Save model to good_code/fine_tuned_models/original_withoutNames/fine_tuning-google-electra-base-discriminator-2024-08-01_21-01-06
2024-08-01 21:35:51 - EmbeddingSimilarityEvaluator: Evaluating the model on validation dataset in epoch 0 after 566 steps:
2024-08-01 21:37:10 - Cosine-Similarity :	Pearson: 0.0439	Spearman: 0.0465
2024-08-01 21:37:10 - Manhattan-Distance:	Pearson: 0.0370	Spearman: 0.0433
2024-08-01 21:37:10 - Euclidean-Distance:	Pearson: 0.0358	Spearman: 0.0365
2024-08-01 21:37:10 - Dot-Product-Similarity:	Pearson: 0.0413	Spearman: 0

Iteration:   0%|          | 0/2830 [00:00<?, ?it/s]

2024-08-01 22:09:29 - EmbeddingSimilarityEvaluator: Evaluating the model on validation dataset in epoch 1 after 283 steps:
2024-08-01 22:10:47 - Cosine-Similarity :	Pearson: 0.8579	Spearman: 0.8115
2024-08-01 22:10:47 - Manhattan-Distance:	Pearson: 0.8491	Spearman: 0.8122
2024-08-01 22:10:47 - Euclidean-Distance:	Pearson: 0.8490	Spearman: 0.8121
2024-08-01 22:10:47 - Dot-Product-Similarity:	Pearson: 0.8346	Spearman: 0.7988
2024-08-01 22:10:47 - Save model to good_code/fine_tuned_models/original_withoutNames/fine_tuning-google-electra-base-discriminator-2024-08-01_21-01-06
2024-08-01 22:13:06 - EmbeddingSimilarityEvaluator: Evaluating the model on validation dataset in epoch 1 after 566 steps:
2024-08-01 22:14:25 - Cosine-Similarity :	Pearson: 0.8664	Spearman: 0.8170
2024-08-01 22:14:25 - Manhattan-Distance:	Pearson: 0.8577	Spearman: 0.8175
2024-08-01 22:14:25 - Euclidean-Distance:	Pearson: 0.8579	Spearman: 0.8176
2024-08-01 22:14:25 - Dot-Product-Similarity:	Pearson: 0.8431	Spearman: 0

Iteration:   0%|          | 0/2830 [00:00<?, ?it/s]

2024-08-01 22:46:36 - EmbeddingSimilarityEvaluator: Evaluating the model on validation dataset in epoch 2 after 283 steps:
2024-08-01 22:47:54 - Cosine-Similarity :	Pearson: 0.9145	Spearman: 0.8404
2024-08-01 22:47:54 - Manhattan-Distance:	Pearson: 0.9020	Spearman: 0.8405
2024-08-01 22:47:54 - Euclidean-Distance:	Pearson: 0.9025	Spearman: 0.8407
2024-08-01 22:47:54 - Dot-Product-Similarity:	Pearson: 0.8866	Spearman: 0.8312
2024-08-01 22:47:54 - Save model to good_code/fine_tuned_models/original_withoutNames/fine_tuning-google-electra-base-discriminator-2024-08-01_21-01-06
2024-08-01 22:50:10 - EmbeddingSimilarityEvaluator: Evaluating the model on validation dataset in epoch 2 after 566 steps:
2024-08-01 22:51:29 - Cosine-Similarity :	Pearson: 0.9152	Spearman: 0.8407
2024-08-01 22:51:29 - Manhattan-Distance:	Pearson: 0.9015	Spearman: 0.8407
2024-08-01 22:51:29 - Euclidean-Distance:	Pearson: 0.9020	Spearman: 0.8409
2024-08-01 22:51:29 - Dot-Product-Similarity:	Pearson: 0.8840	Spearman: 0

Iteration:   0%|          | 0/2830 [00:00<?, ?it/s]

2024-08-01 23:23:36 - EmbeddingSimilarityEvaluator: Evaluating the model on validation dataset in epoch 3 after 283 steps:
2024-08-01 23:24:54 - Cosine-Similarity :	Pearson: 0.9315	Spearman: 0.8471
2024-08-01 23:24:54 - Manhattan-Distance:	Pearson: 0.9185	Spearman: 0.8469
2024-08-01 23:24:54 - Euclidean-Distance:	Pearson: 0.9191	Spearman: 0.8470
2024-08-01 23:24:54 - Dot-Product-Similarity:	Pearson: 0.9003	Spearman: 0.8376
2024-08-01 23:24:54 - Save model to good_code/fine_tuned_models/original_withoutNames/fine_tuning-google-electra-base-discriminator-2024-08-01_21-01-06
2024-08-01 23:27:10 - EmbeddingSimilarityEvaluator: Evaluating the model on validation dataset in epoch 3 after 566 steps:
2024-08-01 23:28:29 - Cosine-Similarity :	Pearson: 0.9309	Spearman: 0.8469
2024-08-01 23:28:29 - Manhattan-Distance:	Pearson: 0.9185	Spearman: 0.8468
2024-08-01 23:28:29 - Euclidean-Distance:	Pearson: 0.9191	Spearman: 0.8470
2024-08-01 23:28:29 - Dot-Product-Similarity:	Pearson: 0.9001	Spearman: 0

Batches:   0%|          | 0/809 [00:00<?, ?it/s]

Batches:   0%|          | 0/809 [00:00<?, ?it/s]

Threshold with highest accuracy: 0.5555555555555556
Accuracy: 0.9652120134513548
Classification Metrics:
               precision    recall  f1-score   support

         0.0       0.98      0.95      0.96     12932
         1.0       0.95      0.98      0.97     12939

    accuracy                           0.97     25871
   macro avg       0.97      0.97      0.97     25871
weighted avg       0.97      0.97      0.97     25871

Classification metrics saved to good_code/fine_tuned_models/original_withoutNames/fine_tuning-google-electra-base-discriminator-2024-08-01_21-01-06/eval/classification_metrics.txt
Accuracy plot saved to good_code/fine_tuned_models/original_withoutNames/fine_tuning-google-electra-base-discriminator-2024-08-01_21-01-06/eval/accuracy_plot.png
