In [1]:
import pandas as pd
import numpy as np
import sys
import json
import time
from scipy.sparse import coo_matrix
from sklearn.decomposition import TruncatedSVD

In [2]:
def read_singleton_dict(dictionary_dir):
    dic, index, unigram = {}, 0, []
    with open(dictionary_dir, 'r') as file:
        lines = file.readlines()
        for row in lines:
            line = row.split('\t')
            dic[line[0].strip()] = [int(line[1].strip()), index]
            unigram.append(index)
            index += 1
        return (dic, unigram)

In [3]:
path_singlets = '2_Singleton_Frequency_Counts/singlets_concepts_perBin_1d.txt'
singleton, unigram_table = read_singleton_dict(path_singlets)

In [4]:
def read_cofreq_dict(dictionary_dir):
    dic = {}
    with open(dictionary_dir, 'r') as file:
        lines = file.readlines()
        for row in lines:
            line = row.split('\t')
            dic[(line[0].strip(), line[1].strip()) ] = int(line[2].strip())
        return dic

In [5]:
path_cofreq = '1_Cofrequency_Counts/cofreqs_concepts_perBin_1d.txt'
cofreq = read_cofreq_dict(path_cofreq)

In [6]:
vocab = list(singleton.keys())

In [7]:
note_counts = int(7.334261*1.99*10**6)

In [8]:
def calc_SPPMI(word_context, k=1, SPPMI=True):
    word, context = word_context[0], word_context[1]
    word_counts, context_counts = singleton.get(word, [0])[0], singleton.get(context, [0])[0]
    word_context_counts = cofreq.get(word_context, 0)
    PMI = np.log((word_context_counts * note_counts)/(word_counts * context_counts))
    SPMI = PMI - np.log(k)
    SPPMI = max(SPMI, 0) if SPPMI else SPMI
    return SPPMI

In [9]:
def create_matrix(matrix, SPPMI=True):
    for word_context in cofreq.keys():
        SPPMI = calc_SPPMI(word_context, 1, SPPMI)
        word_index, context_index = singleton.get(word_context[0])[1], singleton.get(word_context[1])[1]
        matrix[word_index, context_index] = SPPMI

In [10]:
cofreq_matrix = np.empty((len(vocab), len(vocab)))
create_matrix(cofreq_matrix)

In [11]:
neg_cofreq_matrix = np.empty((len(vocab), len(vocab)))
create_matrix(neg_cofreq_matrix, False)

### SVD Factorization

In [14]:
word_vector_size, n_iter, random_state = 100, 7, 42
svd = TruncatedSVD(n_components=word_vector_size, n_iter=n_iter, random_state=random_state)

In [15]:
X = coo_matrix(cofreq_matrix)
X_transform = svd.fit_transform(X)

In [16]:
# print(svd.explained_variance_ratio_) 
print(svd.explained_variance_ratio_.sum())

0.6825433862664219


In [17]:
def get_embedding_dir(ts):
    base_dir = 'embeddings/'
    config_dir = base_dir + str(ts) + '_config' + '.txt'
    embedding_dir = base_dir + str(ts) + '_embedding' + '.txt'
    
    config = {'singlets_bin_category': path_singlets,
              'cofreq_bin_category': path_cofreq,
              'word_vector_size': word_vector_size,
              'n_iter': n_iter,
              'random_state': random_state,
              'embedding_dir': embedding_dir}
    
    with open(config_dir, 'w') as f:
        json.dump(config, f)
    return embedding_dir

def save_embedding(matrix, ts):
    embedding_dir = get_embedding_dir(ts)
    embedding_file = open(embedding_dir, "w")
    dictionary_keys = list(singleton.keys())
    line = str(len(dictionary_keys)) + ' ' + str(word_vector_size) + '\n'
    embedding_file.write(line)
    
    for i in dictionary_keys:
        cui, cui_index = i, singleton[i][1]
        weights = matrix[cui_index, :]
        line = str(cui) + ' ' + ' '.join(map(str, weights)) + '\n'
        embedding_file.write(line)
    embedding_file.close()
    print('Embedding file saved at ' + embedding_dir)

In [20]:
ts = int(time.time())
save_embedding(X_transform, ts) # after 

Embedding file saved at embeddings/1525323781_embedding.txt


### Evaluation