In [4]:
"""
Pre-train embeddings using gensim w2v implementation (CBOW by default)
"""
from gensim.models import Word2Vec
import csv

class ProcessedIter(object):
    def __init__(self, filename):
        self.filename = filename
    
    def __iter__(self):
        with open(self.filename, 'r', encoding='utf-8') as f:
            r = csv.reader(f)
            next(r)  # skip header
            for row in r:
                # text is in index 2: [subject_id, hadm_id, text, labels]
                yield row[2].split()

def word_embeddings(notes_file, output_dir, embedding_size=100, min_count=0, n_iter=5):
    """
    Train word2vec embeddings
    
    Args:
        notes_file: Path to training data CSV
        output_dir: Directory to save the model
        embedding_size: Dimension of embeddings
        min_count: Minimum word frequency
        n_iter: Number of training epochs
    
    Returns:
        Path to saved model
    """
    modelname = "processed_full.w2v"
    sentences = ProcessedIter(notes_file)
    
    # Initialize Word2Vec model
    model = Word2Vec(
        vector_size=embedding_size,
        min_count=min_count,
        workers=4,
        epochs=n_iter,
        sg=0  # 0 = CBOW, 1 = Skip-gram
    )
    
    print(f"Building word2vec vocab on {notes_file}...")
    model.build_vocab(sentences)
    print(f"Vocabulary size: {len(model.wv)}")
    
    print("Training...")
    # Need to recreate iterator for training
    sentences = ProcessedIter(notes_file)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs, report_delay=1)
    
    out_file = f'{output_dir}/{modelname}'
    print(f"Writing embeddings to {out_file}")
    model.save(out_file)
    
    return out_file


notes_file = 'mimicdata/mimic4_icd10/full_code/train_full.csv'
output_dir = 'mimicdata/mimic4_icd10/full_code'

w2v_file = word_embeddings(
    notes_file=notes_file,
    output_dir=output_dir,
    embedding_size=100,
    min_count=0,
    n_iter=5
)
print(f"\nWord embeddings saved to: {w2v_file}")

Building word2vec vocab on mimicdata/mimic4_icd10/full_code/train_full.csv...
Vocabulary size: 179129
Training...
Writing embeddings to mimicdata/mimic4_icd10/full_code/processed_full.w2v

Word embeddings saved to: mimicdata/mimic4_icd10/full_code/processed_full.w2v


In [5]:
"""
Extract word vectors from trained Word2Vec model for vocabulary
"""
from gensim.models import Word2Vec
from tqdm import tqdm

def gensim_to_embeddings(w2v_file, vocab_file, output_file):
    """
    Extract embeddings for words in vocabulary
    
    Args:
        w2v_file: Path to trained Word2Vec model
        vocab_file: Path to vocabulary file (one word per line)
        output_file: Path to save extracted embeddings
    """
    print(f"Loading Word2Vec model from {w2v_file}...")
    model = Word2Vec.load(w2v_file)
    
    print(f"Reading vocabulary from {vocab_file}...")
    vocab = []
    with open(vocab_file, 'r', encoding='utf-8') as vf:
        for line in vf:
            word = line.strip()
            if word:  # Skip empty lines
                vocab.append(word)
    
    print(f"Vocabulary size: {len(vocab)}")
    print(f"Extracting embeddings to {output_file}...")
    
    found = 0
    with open(output_file, 'w', encoding='utf-8') as of:
        for word in tqdm(vocab):
            if word in model.wv:
                vec = model.wv[word]
                of.write(word + ' ' + ' '.join(str(x) for x in vec) + '\n')
                found += 1
    
    print(f"Extracted embeddings for {found}/{len(vocab)} words")
    return output_file

if __name__ == "__main__":
    # Example usage
    w2v_file = 'mimicdata/mimic4_icd10/full_code/processed_full.w2v'
    vocab_file = 'mimicdata/mimic4_icd10/vocab.csv'
    output_file = 'mimicdata/mimic4_icd10/full_code/disch_full.embed'
    
    gensim_to_embeddings(w2v_file, vocab_file, output_file)

Loading Word2Vec model from mimicdata/mimic4_icd10/full_code/processed_full.w2v...
Reading vocabulary from mimicdata/mimic4_icd10/vocab.csv...
Vocabulary size: 69972
Extracting embeddings to mimicdata/mimic4_icd10/full_code/disch_full.embed...


100%|██████████████████████████████████████████████████████████████████████████████████████████| 69972/69972 [00:01<00:00, 38127.50it/s]

Extracted embeddings for 69972/69972 words





In [7]:
# Check the embeddings file
with open('mimicdata/mimic4_icd10/full_code/disch_full.embed', 'r') as f:
    lines = f.readlines()
    print(f"Total embeddings: {len(lines)}")
    print(f"First embedding:\n{lines[0][:200]}...")  # Show first 200 chars
    
    # Check dimensionality
    first_line = lines[0].strip().split()
    word = first_line[0]
    vector = first_line[1:]
    print(f"\nWord: {word}")
    print(f"Embedding dimension: {len(vector)}")

Total embeddings: 69972
First embedding:
name -8.202752 -0.38950026 0.13343066 3.8483334 -1.856634 3.5749114 -4.3719854 5.196245 0.68617827 -0.33850288 4.28877 1.0903579 -3.8043988 -0.2620326 -2.2970586 -4.132816 7.4117746 1.3347013 3.581652...

Word: name
Embedding dimension: 100


In [10]:
import os

def print_folder_structure(startpath, indent=0):
    for item in os.listdir(startpath):
        path = os.path.join(startpath, item)
        print('   ' * indent + '|-- ' + item)
        if os.path.isdir(path):
            print_folder_structure(path, indent + 1)

if __name__ == "__main__":
    current_directory = os.getcwd()
    print("Current Folder Structure:\n")
    print_folder_structure(current_directory)


Current Folder Structure:

|-- learn
   |-- interpret.py
   |-- models.py
   |-- tools.py
   |-- __init__.py
   |-- .ipynb_checkpoints
      |-- __init__-checkpoint.py
   |-- training.py
|-- persistence.py
|-- .DS_Store
|-- modelling
|-- Untitled.ipynb
|-- evaluation.py
|-- constants.py
|-- datasets.py
|-- data_mimic_IV_concat_note_label.py
|-- .ipynb_checkpoints
   |-- Untitled-checkpoint.ipynb
|-- mimicdata
   |-- .DS_Store
   |-- mimic4_icd10
      |-- full_code
         |-- .DS_Store
         |-- processed_full.w2v.wv.vectors.npy
         |-- vocab.csv
         |-- processed_full.w2v
         |-- dev_full.csv
         |-- train_full.csv
         |-- disch_full.embed
         |-- test_full.csv
         |-- processed_full.w2v.syn1neg.npy
         |-- .ipynb_checkpoints
            |-- dev_full-checkpoint.csv
      |-- .DS_Store
      |-- vocab.csv
      |-- .ipynb_checkpoints
      |-- top_50
         |-- top50_icd10_code_list.txt
         |-- .DS_Store
         |-- TOP_50_CODES.csv


In [9]:
# 1. Create dataproc folder in root
mkdir dataproc
touch dataproc/__init__.py

# 2. Create extract_wvs.py in dataproc folder
# (I'll provide the code below)

# 3. Move files from scripts/ to root directory
mv scripts/persistence.py .
mv scripts/evaluation.py .
mv scripts/constants.py .
mv scripts/datasets.py .

# 4. Your final structure should be:

SyntaxError: invalid syntax (3829211005.py, line 2)