In [49]:
import pandas as pd
import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lorenzoleuzzi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [50]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi

In [51]:
# glucose_in_blood, white_blood_cells, billirubin_in_plasma
query = 'billirubin in plasma'
tokenized_query = word_tokenize(query.lower())

if query == 'glucose in blood': sheet_name = 0
if query == 'white blood cells': sheet_name = 1
if query == 'billirubin in plasma': sheet_name = 2

df_query = pd.read_excel('loinc.xlsx', sheet_name = 0)
tokenized_docs = [word_tokenize(row['LONG_COMMON_NAME'].lower()) for i, row in df_query.iterrows()]

In [52]:
# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the data
tfidf_matrix = tfidf_vectorizer.fit_transform(df_query['LONG_COMMON_NAME'])
query_vector = tfidf_vectorizer.transform([query])

cosine_similarities = cosine_similarity(query_vector, tfidf_matrix)

In [53]:
def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1) + len(set2) - intersection
    return intersection / union

jaccard_scores = []

for doc in df_query['LONG_COMMON_NAME']:
    doc_tokens = set(word_tokenize(doc.lower()))
    jaccard_score = jaccard_similarity(set(tokenized_query), doc_tokens)
    jaccard_scores.append(jaccard_score)

In [54]:
bm25 = BM25Okapi(tokenized_docs)
bm_25_scores = bm25.get_scores(tokenized_query)

In [55]:
# Get the IDF values for the terms in the query
idf_values_query = [tfidf_vectorizer.idf_[tfidf_vectorizer.vocabulary_.get(term, 0)] for term in tokenized_query]

# Get the IDF values for each document
idf_values_documents = tfidf_matrix.sum(axis=1).A1  # Sum of IDF values for each document

In [56]:
print(idf_values_documents.shape)
print(bm_25_scores.shape)
print(cosine_similarities[0].shape)
print(len(jaccard_scores))

(67,)
(67,)
(67,)
67


In [57]:
features = {
    "IDF": idf_values_documents,
    "BM25": bm_25_scores,
    "COISINE": cosine_similarities[0],
    "JACCARD": jaccard_scores
}
features_df = pd.DataFrame(features)
features_df

Unnamed: 0,IDF,BM25,COISINE,JACCARD
0,2.584521,0.863510,0.306439,0.181818
1,2.015035,0.920180,0.124813,0.125000
2,1.862647,0.920180,0.126561,0.125000
3,1.697418,0.000000,0.000000,0.000000
4,2.675799,0.961662,0.343756,0.222222
...,...,...,...,...
62,2.110544,1.055547,0.092146,0.166667
63,1.377624,0.000000,0.000000,0.000000
64,1.377624,0.000000,0.000000,0.000000
65,2.882682,0.821582,0.232481,0.166667


In [58]:
df_features = pd.concat([df_query, features_df], axis=1)
df_features

Unnamed: 0,LOINC_NUM,LONG_COMMON_NAME,COMPONENT,SYSTEM,PROPERTY,RANK,IDF,BM25,COISINE,JACCARD
0,1988-5,C reactive protein [Mass/volume] in Serum or P...,C reactive protein,Ser/Plas,MCnc,0,2.584521,0.863510,0.306439,0.181818
1,1959-6,Bicarbonate [Moles/volume] in Blood,Bicarbonate,Bld,SCnc,0,2.015035,0.920180,0.124813,0.125000
2,10331-7,Rh [Type] in Blood,Rh,Bld,Type,0,1.862647,0.920180,0.126561,0.125000
3,18998-5,Trimethoprim+Sulfamethoxazole [Susceptibility],Trimethoprim+Sulfamethoxazole,Isolate,Susc,0,1.697418,0.000000,0.000000,0.000000
4,1975-2,Bilirubin.total [Mass/volume] in Serum or Plasma,Bilirubin,Ser/Plas,MCnc,0,2.675799,0.961662,0.343756,0.222222
...,...,...,...,...,...,...,...,...,...,...
62,54439-5,Calcium bilirubinate/Total in Stone,Calcium bilirubinate/Total,Calculus,MFr,1,2.110544,1.055547,0.092146,0.166667
63,18878-9,Cefazolin [Susceptibility],Cefazolin,Isolate,Susc,1,1.377624,0.000000,0.000000,0.000000
64,18928-2,Gentamicin [Susceptibility],Gentamicin,Isolate,Susc,1,1.377624,0.000000,0.000000,0.000000
65,29265-6,Calcium [Moles/volume] corrected for albumin i...,Calcium^^corrected for albumin,Ser/Plas,SCnc,1,2.882682,0.821582,0.232481,0.166667


In [59]:
df_features.to_excel(f'{query}_features.xlsx', index=False)