In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [None]:
# Download required NLTK data: the 'punkt' and 'stopwords' packages.
# The second call is the cell's final expression, so its return value is what
# the notebook shows as the cell result.
# NOTE(review): in the cells shown here, tfidf() passes stop_words='english'
# to scikit-learn's TfidfVectorizer and does not read these NLTK resources --
# confirm the downloads are still needed.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gleblegotkin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gleblegotkin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def tfidf(file_in, file_out, n_components=768, max_features=3000,
          text_column="vectorization_text", code_column="code"):
    """Embed the texts of a CSV file with TF-IDF + truncated SVD and save them.

    A fresh ``TfidfVectorizer`` and ``TruncatedSVD`` are fitted on every call,
    so vectors written by different calls do not share a vocabulary or a
    coordinate system.

    Parameters
    ----------
    file_in : str or path-like
        CSV file to read. Must contain ``text_column`` and ``code_column``.
    file_out : str or path-like
        Destination CSV. It gets two columns: ``ICD11_code`` (copied from
        ``code_column``) and ``Vector`` (the embedding rendered as a
        ``[v1,v2,...]`` string by ``np.array2string``).
    n_components : int, default 768
        Requested embedding width. The decomposition cannot yield more
        components than there are rows or TF-IDF features, so the width is
        reduced to ``min(n_components, n_rows, n_features)`` and a warning is
        printed when that happens.
    max_features : int, default 3000
        Vocabulary size cap passed to ``TfidfVectorizer``.
    text_column : str, default "vectorization_text"
        Column holding the text to vectorize.
    code_column : str, default "code"
        Column holding the identifier stored next to each vector.

    Returns
    -------
    None
        The result is written to ``file_out``; a short summary is printed.

    Raises
    ------
    KeyError
        If ``text_column`` or ``code_column`` is absent from the input file.
    """
    df = pd.read_csv(file_in)

    missing_columns = [name for name in (text_column, code_column) if name not in df.columns]
    if missing_columns:
        raise KeyError(f"{file_in} is missing required column(s): {missing_columns}")

    # Empty CSV cells are read as NaN, which TfidfVectorizer cannot tokenize.
    # Treat them as empty documents, but say so instead of hiding it.
    texts = df[text_column]
    n_missing = int(texts.isna().sum())
    if n_missing:
        print(f"Warning: {n_missing} row(s) have no text in '{text_column}'; treating them as empty documents.")
    texts = texts.fillna("").astype(str)

    # TF-IDF vectorization
    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(texts)

    # The achievable width is bounded by BOTH matrix dimensions: a file with
    # fewer rows than n_components would otherwise come back narrower than
    # requested without any warning.
    n_samples, n_features = tfidf_matrix.shape
    effective_components = min(n_components, n_samples, n_features)
    if effective_components < n_components:
        print(
            f"Warning: Only {n_samples} rows and {n_features} features available. "
            f"Reducing dimensions to {effective_components} instead of {n_components}."
        )

    # Dimensionality reduction (fixed seed keeps the randomized solver repeatable)
    svd = TruncatedSVD(n_components=effective_components, random_state=42)
    reduced_matrix = svd.fit_transform(tfidf_matrix)

    # Format output. An explicit threshold keeps array2string from abbreviating
    # long vectors with "..." (NumPy's default cut-off is 1000 elements); the
    # text produced for shorter vectors is unchanged.
    vector_width = reduced_matrix.shape[1]
    df_embeddings = pd.DataFrame({
        "ICD11_code": df[code_column],
        "Vector": [
            np.array2string(vec, separator=',', precision=6, suppress_small=True,
                            threshold=vector_width + 1)
            for vec in reduced_matrix
        ]
    })

    # Print information
    print(f"Number of rows in resulting DataFrame: {len(df_embeddings)}")
    print(f"Shape of the first vector: {reduced_matrix[0].shape}")

    # Save to CSV
    df_embeddings.to_csv(file_out, index=False)


In [None]:
# Build and save the TF-IDF embeddings for the ICD11 codes
tfidf(file_in="ICD11_preprocessed.csv", file_out="tfidf_ICD11_embeddings.csv")

Number of rows in resulting DataFrame: 13062
Shape of the first vector: (768,)


In [None]:
# Build and save the TF-IDF embeddings for the Merriam-Webster definitions
tfidf(
    file_in="encyclopedia_sd_preprocessed.csv",
    file_out="tfidf_encyclopedia_embeddings.csv",
)

Number of rows in resulting DataFrame: 378
Shape of the first vector: (378,)


In [None]:
# Build and save the TF-IDF embeddings for the non-medical prompts
tfidf(
    file_in="Non_medical_prompts_preprocessed.csv",
    file_out="tfidf_non_medical_prompts_embeddings.csv",
)

Number of rows in resulting DataFrame: 10
Shape of the first vector: (10,)
