<a href="https://colab.research.google.com/github/lrahbek/bachelor_scz/blob/main/analysis_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Instrument overlap analysis using sentence transformer models

### Set up: package import

In [None]:
pip install -U sentence-transformers

In [None]:
# Import packages
import os
import numpy as np
import pandas as pd
import torch
import sentence_transformers
import pickle
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses, util, evaluation
from sentence_transformers.evaluation import TripletEvaluator, BinaryClassificationEvaluator

In [None]:
# Set working directory.
# An absolute path keeps this cell idempotent: a relative chdir only succeeds
# the first time it runs, because the second run starts inside the target
# directory. This is the same Drive folder the evaluation cell passes as
# output_path.
# NOTE(review): assumes Google Drive is already mounted at /content/drive.
os.chdir("/content/drive/MyDrive/Bachelor")

### Load in item files

In [None]:
# Read the items, codes, scale and name files from the working directory
def read_lines(path):
    """Return the text file at `path` as a list of strings split on newlines.

    The file is opened in a `with` block so the handle is closed even if
    reading fails. The split is a plain newline split, so a file ending in a
    trailing newline yields a final empty string.
    """
    with open(path, "r") as handle:
        return handle.read().split("\n")


items = read_lines("items.txt")
print(items)

codes = read_lines("code.txt")
print(codes)

scales = read_lines("scale.txt")
print(scales)

names = read_lines("name.txt")
print(names)

['SS11', 'SS12', 'SS13', 'SS14', 'SS15', 'SS16', 'SS17', 'SS18', 'SS19', 'SS20', 'SS21', 'SS22', 'SS23', 'SS24', 'SS25', 'SS26', 'SS27', 'SS28', 'SS29', 'SS30', 'SS31', 'SS32', 'SS33', 'SS34', 'SS35', 'SS36', 'SS37', 'SS38', 'SS39', 'SS40', 'SS41', 'SS42', 'SS43', 'SS44', 'SS45', 'SS46', 'SS47', 'SS48', 'SS49', 'SS50', 'SS51', 'SS52', 'SS53', 'SS54', 'SS55', 'SS56', 'SS57', 'SS58', 'SS59', 'SS60', 'SS61', 'SS62', 'SS63', 'SS64', 'SS65', 'SS66', 'SS67', 'SS68', 'SS69', 'P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'N1', 'N2', 'N3', 'N4', 'N5', 'N6', 'N7', 'G1', 'G2', 'G3', 'G4', 'G5', 'G6', 'G7', 'G8', 'G9', 'G10', 'G11', 'G12', 'G13', 'G14', 'G15', 'G16', 'A1', 'A2', 'A3', 'A4', 'A5', 'A', 'B', 'C', 'D', 'E', 'F', 'G']
['SANS_SAPS', 'SANS_SAPS', 'SANS_SAPS', 'SANS_SAPS', 'SANS_SAPS', 'SANS_SAPS', 'SANS_SAPS', 'SANS_SAPS', 'SANS_SAPS', 'SANS_SAPS', 'SANS_SAPS', 'SANS_SAPS', 'SANS_SAPS', 'SANS_SAPS', 'SANS_SAPS', 'SANS_SAPS', 'SANS_SAPS', 'SANS_SAPS', 'SANS_SAPS', 'SANS_SAPS', 'SANS_SAPS', '

### Load in data frame with manually coded overlap (0/1)


In [None]:
# Read the data frame with all symptom pairs and the similarity column
bc_eval_raw = pd.read_csv('bc_eval_df.csv')

# Map every code to the item text found at the same position in `items`
code_item = {code: items[position] for position, code in enumerate(codes)}

# Swap codes for their item text wherever a code appears in the frame
bc_eval_df = bc_eval_raw.replace(to_replace=code_item)

### Load in sentence transformer models

In [None]:
# Load the three sentence transformer models
def _load_and_report(model_name):
    """Load the named SentenceTransformer, print its max sequence length, and return it."""
    loaded = SentenceTransformer(model_name)
    print("Max Sequence Length:", loaded.max_seq_length)
    return loaded


model1 = _load_and_report('all-mpnet-base-v1')

model2 = _load_and_report('all-distilroberta-v1')

model3 = _load_and_report("neuml/pubmedbert-base-embeddings")


### Evaluate encodings on manually coded overlap

In [None]:
# Build one binary-classification evaluator from the manually coded pairs:
# the two item columns are the sentence pairs, bc_overlap holds the labels.
first_items = list(bc_eval_df["item1"])
second_items = list(bc_eval_df["item2"])
overlap_labels = list(bc_eval_df["bc_overlap"])

evalbc = BinaryClassificationEvaluator(
    sentences1=first_items,
    sentences2=second_items,
    labels=overlap_labels,
    name="BC_eval",
    batch_size=16,
    show_progress_bar=True,
    write_csv=True,
)

# Run the same evaluator on each model in turn and print the returned result.
results_dir = "/content/drive/MyDrive/Bachelor/"
for st_model in (model1, model2, model3):
    print(st_model.evaluate(evalbc, output_path=results_dir))

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

0.5805403707467288


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

0.5457112959128922


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

0.5447213012138723


### Encode embeddings and calculate cosine scores

In [None]:
# Calculate sentence embeddings
def _encode_and_show(model):
    """Encode `items` with `model` as a tensor, print each item with its embedding, and return the tensor."""
    vectors = model.encode(items, convert_to_tensor=True)
    for text, vector in zip(items, vectors):
        print("Item:", text)
        print("Embedding:", vector)
        print("")
    return vectors


embeddings1 = _encode_and_show(model1)

embeddings2 = _encode_and_show(model2)

embeddings3 = _encode_and_show(model3)

In [None]:
# Compute the cosine similarity of every embedding with every other
# embedding, separately for each model.
cosine_scores1, cosine_scores2, cosine_scores3 = (
    util.cos_sim(vectors, vectors)
    for vectors in (embeddings1, embeddings2, embeddings3)
)

### Export cosine scores for all pairs for each model

In [None]:
# Export cosine scores to R
def export_cosine_scores(scores, labels, path):
    """Write a similarity matrix to CSV, using `labels` as row and column names.

    Parameters:
        scores: torch tensor of pairwise scores (here the result of util.cos_sim).
        labels: names applied to both the index and the columns.
        path:   destination CSV file.

    Returns:
        The labelled DataFrame that was written.
    """
    # Convert explicitly: detach from autograd, move to host memory, then to
    # NumPy. Passing the tensor straight to pd.DataFrame relies on implicit
    # conversion, which is not reliable when the tensor lives on a GPU.
    matrix = scores.detach().cpu().numpy()
    frame = pd.DataFrame(matrix, index=labels, columns=labels)
    frame.to_csv(path, index=True)
    return frame


cos1_pd = export_cosine_scores(cosine_scores1, codes, "cossco1.csv")

cos2_pd = export_cosine_scores(cosine_scores2, codes, "cossco2.csv")

cos3_pd = export_cosine_scores(cosine_scores3, codes, "cossco3.csv")