In [7]:
import pandas as pd
import nltk
# Download the 'punkt' tokenizer package into the local nltk_data directory
# (network side effect on first run). It is fetched here because the notebook
# tokenizes text with word_tokenize below.
# NOTE(review): some newer NLTK releases reportedly look up 'punkt_tab'
# instead of 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lorenzoleuzzi/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [22]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi

In [27]:
# Supported queries (glucose_in_blood, white_blood_cells, billirubin_in_plasma).
# Each query has its own sheet in 'loinc_extended.xlsx'; the mapping below gives
# the sheet position for every supported query string.
query = 'glucose in blood'
tokenized_query = word_tokenize(query.lower())

query_to_sheet = {
    'glucose in blood': 0,
    'white blood cells': 1,
    'billirubin in plasma': 2,
}

# Fail loudly on an unsupported query instead of silently loading the sheet of
# another query (or reusing a stale `sheet_name` left over in the kernel).
if query not in query_to_sheet:
    raise ValueError(
        f"Unsupported query {query!r}; expected one of {sorted(query_to_sheet)}"
    )
sheet_name = query_to_sheet[query]

# Load the sheet that belongs to the selected query. The previous version
# always passed sheet 0 here, ignoring the `sheet_name` computed above.
df_query = pd.read_excel('loinc_extended.xlsx', sheet_name=sheet_name)

# One lower-cased token list per document (row), in row order.
tokenized_docs = [word_tokenize(name.lower()) for name in df_query['LONG_COMMON_NAME']]

In [17]:
# Text column that serves as the document corpus for TF-IDF.
corpus = df_query['LONG_COMMON_NAME']

# Learn the vocabulary / IDF weights from the corpus and vectorize it.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Project the raw query string into the same TF-IDF space.
query_vector = tfidf_vectorizer.transform([query])

# Cosine similarity of the single query row against every document row.
cosine_similarities = cosine_similarity(query_vector, tfidf_matrix)

In [19]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two sets.

    Args:
        set1: First set of items (e.g. tokens).
        set2: Second set of items.

    Returns:
        A float in [0, 1]. When both sets are empty the union is empty and the
        ratio is undefined; 0.0 is returned in that case instead of raising
        ZeroDivisionError.
    """
    intersection = len(set1.intersection(set2))
    # |A ∪ B| via inclusion-exclusion, avoiding building the union set.
    union = len(set1) + len(set2) - intersection
    if union == 0:
        return 0.0
    return intersection / union

# Token set of the query, built once and compared against every document.
query_token_set = set(tokenized_query)

# Jaccard similarity between the query tokens and the (lower-cased, tokenized)
# name of each document, in row order.
jaccard_scores = [
    jaccard_similarity(query_token_set, set(word_tokenize(name.lower())))
    for name in df_query['LONG_COMMON_NAME']
]

In [25]:
# Build a BM25 (Okapi) model over the tokenized documents, then score the
# documents against the tokenized query. The shape check further down shows
# the result holds one score per document.
bm25 = BM25Okapi(tokenized_docs)
bm_25_scores = bm25.get_scores(tokenized_query)

In [28]:
# IDF weight of each query token according to the fitted vectorizer.
# Tokens that are not in the vectorizer's vocabulary get 0.0; the previous
# `.get(term, 0)` fallback used column index 0 and therefore returned the IDF
# of an unrelated vocabulary term.
vocabulary = tfidf_vectorizer.vocabulary_
idf_values_query = [
    tfidf_vectorizer.idf_[vocabulary[term]] if term in vocabulary else 0.0
    for term in tokenized_query
]

# Row-wise sum of the TF-IDF matrix, flattened to a 1-D array with one value
# per document. Note: this is the sum of each document's TF-IDF weights, not a
# sum of raw IDF values, despite the variable name.
idf_values_documents = tfidf_matrix.sum(axis=1).A1

In [33]:
# Sanity check: every feature vector should have one entry per document.
for scores in (idf_values_documents, bm_25_scores, cosine_similarities[0]):
    print(scores.shape)
# jaccard_scores is a plain list, so report its length instead of a shape.
print(len(jaccard_scores))

(215,)
(215,)
(215,)
215


In [36]:
# Column names of the feature table, paired positionally with their values.
# NOTE(review): "COISINE" looks like a typo for "COSINE"; it is kept as-is
# because it becomes a column name in the exported file.
feature_names = ("IDF", "BM25", "COISINE", "JACCARD")
feature_values = (
    idf_values_documents,
    bm_25_scores,
    cosine_similarities[0],  # single query row -> 1-D vector over documents
    jaccard_scores,
)
features = dict(zip(feature_names, feature_values))

# One row per document, one column per similarity feature.
features_df = pd.DataFrame(features)
features_df

In [38]:
# Attach the feature columns to the original rows; both frames are aligned on
# their index and placed side by side.
df_features = pd.concat(objs=[df_query, features_df], axis="columns")
df_features

Unnamed: 0,LOINC_NUM,LONG_COMMON_NAME,COMPONENT,SYSTEM,PROPERTY,RANK,IDF,BM25,COISINE,JACCARD
0,1988-5,C reactive protein [Mass/volume] in Serum or P...,C reactive protein,Ser/Plas,MCnc,0,2.457763,1.066813,0.055557,0.083333
1,1959-6,Bicarbonate [Moles/volume] in Blood,Bicarbonate,Bld,SCnc,0,1.889801,2.323347,0.305501,0.285714
2,10331-7,Rh [Type] in Blood,Rh,Bld,Type,0,1.785689,2.323347,0.282663,0.285714
3,18998-5,Trimethoprim+Sulfamethoxazole [Susceptibility],Trimethoprim+Sulfamethoxazole,Isolate,Susc,0,1.711053,0.000000,0.000000,0.000000
4,1975-2,Bilirubin.total [Mass/volume] in Serum or Plasma,Bilirubin,Ser/Plas,MCnc,0,2.549971,1.169493,0.062231,0.100000
...,...,...,...,...,...,...,...,...,...,...
210,33490-4,Hydrogen/Expired gas [Volume Fraction] in Exha...,Hydrogen/Expired gas^pre dose lactose PO,Exhl gas,VFr,0,2.928888,0.907464,0.034804,0.066667
211,33504-2,Hydrogen+Methane/Expired gas [Volume Fraction]...,Hydrogen+Methane/Expired gas^pre dose lactose PO,Exhl gas,VFr,0,3.094776,0.907464,0.032837,0.066667
212,33498-7,Methane/Expired gas [Volume Fraction] in Exhal...,Methane/Expired gas^30M post dose lactose PO,Exhl gas,VFr,0,3.197011,0.844400,0.033196,0.058824
213,33482-1,Methane/Expired gas [Volume Fraction] in Exhal...,Methane/Expired gas^3H post dose lactose PO,Exhl gas,VFr,0,3.018794,0.844400,0.034955,0.058824


In [None]:
# Persist the combined table next to the notebook, named after the query.
output_path = f'{query}_features.xlsx'
df_features.to_excel(output_path, index=False)