In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from gensim.models import FastText
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from tqdm import tqdm

In [None]:
# Setup: fetch the NLTK resources (tokenizer models first, then the stop-word
# lists) and register tqdm's `progress_apply` on pandas objects.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)
tqdm.pandas()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gleblegotkin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gleblegotkin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import pandas as pd
import numpy as np
from gensim.models import FastText
from tqdm import tqdm

def _mean_embedding(tokens, keyed_vectors, dim):
    """Return the element-wise mean of the vectors of ``tokens``.

    Tokens that ``keyed_vectors`` reports as absent are skipped; when nothing
    is left (e.g. an empty token list) a zero vector of length ``dim`` is
    returned instead.
    """
    # NOTE(review): per the gensim docs, FastText keyed vectors built with
    # character n-grams report every token as present, so in practice the
    # zero fallback is only reached for empty token lists - confirm.
    found = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not found:
        return np.zeros(dim)
    return np.mean(found, axis=0)


def _vector_to_string(vec):
    """Serialise one embedding as a bracketed, comma-separated string."""
    # An explicit threshold keeps NumPy from abbreviating the vector with
    # "..." when the global print options are lowered or the vector is longer
    # than the default summarisation threshold.
    return np.array2string(
        vec,
        separator=',',
        precision=6,
        suppress_small=True,
        threshold=vec.size,
    )


def fasttext(file_in, file_out, *, model=None, vector_size=768, window=5,
             min_count=1, workers=4, seed=42):
    """Embed every row of a preprocessed CSV with averaged FastText vectors.

    The text in the ``vectorization_text`` column is split on whitespace, each
    row is represented by the mean of its token vectors, and the result is
    written to ``file_out`` with two columns: ``ICD11_code`` (copied from the
    input ``code`` column) and ``Vector`` (the embedding as a string).

    Parameters
    ----------
    file_in : str or path-like
        CSV to read; must contain the columns ``vectorization_text`` and
        ``code``.
    file_out : str or path-like
        Destination CSV (written without the index).
    model : optional
        An already trained model exposing ``.wv`` and ``.vector_size``. When
        given, no training happens and the hyper-parameters below are ignored.
        Passing the same model for several files puts their vectors in one
        shared space; by default every call fits a separate model on
        ``file_in`` only, so vectors from different calls are not comparable.
    vector_size, window, min_count, workers, seed
        Forwarded to ``FastText`` (skip-gram, ``sg=1``) when a model has to be
        trained. The defaults are the values this notebook has always used.
        NOTE(review): gensim documents that ``seed`` alone only gives
        repeatable vectors with ``workers=1`` (and a fixed PYTHONHASHSEED).

    Raises
    ------
    KeyError
        If a required column is missing (checked before any training).
    ValueError
        If a model must be trained but the file yields no tokens at all.
    """
    # Load cleaned data
    df = pd.read_csv(file_in)

    # Fail fast: a missing output column should not surface only after training.
    missing = [col for col in ('vectorization_text', 'code') if col not in df.columns]
    if missing:
        raise KeyError(f"{file_in!r} is missing required column(s): {missing}")

    # Tokenize the cleaned text (assumed already preprocessed); astype(str)
    # keeps a stray non-string cell from crashing the split.
    tokens = df['vectorization_text'].fillna('').astype(str).apply(str.split)

    if model is None:
        corpus = tokens.tolist()
        if not any(corpus):
            raise ValueError(
                f"{file_in!r} contains no tokens in 'vectorization_text'; "
                "cannot train a FastText model"
            )
        model = FastText(
            sentences=corpus,
            vector_size=vector_size,
            window=window,
            min_count=min_count,
            workers=workers,
            sg=1,  # use skip-gram
            seed=seed,
        )

    keyed_vectors = model.wv
    dim = model.vector_size

    # Compute average embeddings
    tqdm.pandas(desc="Computing embeddings")
    vectors = tokens.progress_apply(
        lambda row_tokens: _mean_embedding(row_tokens, keyed_vectors, dim)
    )

    # Format for CSV
    df_embeddings = pd.DataFrame({
        "ICD11_code": df["code"],
        "Vector": [_vector_to_string(vec) for vec in vectors],
    })

    # Print sizes
    print(f"Number of rows in resulting DataFrame: {len(df_embeddings)}")
    if len(vectors):
        print(f"Shape of the first vector: {vectors.iloc[0].shape}")

    # Save to CSV
    df_embeddings.to_csv(file_out, index=False)


In [None]:
# ICD11 code embeddings. This call fits its own FastText model on this file
# only, so the vectors are specific to this run.
fasttext(
    file_in="ICD11_preprocessed.csv",
    file_out="fasttext_ICD11_embeddings.csv",
)

Computing embeddings: 100%|█████████████| 13062/13062 [00:02<00:00, 6049.84it/s]


Number of rows in resulting DataFrame: 13062
Shape of the first vector: (768,)


In [None]:
# Merriam-Webster definition embeddings. This call fits its own FastText model
# on this file only, so the vectors are specific to this run.
fasttext(
    file_in="encyclopedia_sd_preprocessed.csv",
    file_out="fasttext_encyclopedia_embeddings.csv",
)

Computing embeddings: 100%|████████████████| 378/378 [00:00<00:00, 17070.21it/s]


Number of rows in resulting DataFrame: 378
Shape of the first vector: (768,)


In [None]:
# Non-medical prompt embeddings. This call fits its own FastText model on this
# file only, so the vectors are specific to this run.
fasttext(
    file_in="Non_medical_prompts_preprocessed.csv",
    file_out="fasttext_non_medical_prompts_embeddings.csv",
)

Computing embeddings: 100%|████████████████████| 10/10 [00:00<00:00, 307.77it/s]


Number of rows in resulting DataFrame: 10
Shape of the first vector: (768,)
