# Record Embedding

In [1]:
import pandas as pd
import numpy as np
import os
import calendar
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

## Embedding using fastText
Details here: https://fasttext.cc/

In [2]:
import warnings
# Silence every Python warning for the rest of the session. This runs before
# the fastText/gensim imports below — presumably to hide their import-time
# deprecation warnings (TODO confirm; a blanket 'ignore' also hides real ones).
warnings.filterwarnings('ignore')
import fastText
from fastText import train_unsupervised
import gensim
from gensim.models import FastText

Load the hospital dataset and shuffle its rows. Each tuple (row) is converted into one training record in the next cell.

In [3]:

# Load the cleaned hospital records. dtype=object stops pandas from inferring
# numeric types, so values such as ZipCode or PhoneNumber are kept as text.
# Alternative input kept for reference:
#df = pd.read_csv('truth_values_1100_tuples.csv',dtype=object, encoding='utf8')
df = pd.read_csv("clean_hospital_dataset_hc.csv", dtype=object, encoding='utf8')

# NOTE(review): a former `for i in range(0, 3): df.append(df)` loop was removed
# here. DataFrame.append returns a new frame and the result was discarded, so
# the loop never changed `df`; on pandas >= 2.0 it raises AttributeError because
# DataFrame.append no longer exists. If duplicating the rows was intended, use
# `df = pd.concat([df] * k, ignore_index=True)` explicitly.

# Shuffle the rows (sample 100% without replacement) and renumber the index.
df = df.sample(frac=1).reset_index(drop=True)
df.head(3)

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,State,ZipCode,CountyName,PhoneNumber,HospitalType,HospitalOwner,EmergencyService,Condition,MeasureCode,MeasureName,Score
0,10027,elba general hospital,987 drayton street,elba,al,36323,coffee,3348972257,acute care hospitals,voluntary non-profit - other,yes,heart attack,ami-3,heart attack patients given ace inhibitor or a...,empty
1,10039,huntsville hospital,101 sivley rd,huntsville,al,35801,madison,2562651000,acute care hospitals,government - hospital district or authority,yes,heart failure,hf-3,heart failure patients given ace inhibitor or ...,98%
2,10009,hartselle medical center,201 pine street northwest,hartselle,al,35640,morgan,2567736511,acute care hospitals,proprietary,yes,heart failure,hf-4,heart failure patients given smoking cessation...,100%


In [4]:
# Build the two training inputs from the records in `df`:
#   * data - one list of lower-cased cell values per record; this is what is
#            passed to gensim's FastText(...) further down.
#   * tmpf - a text file with one record per line, referenced through
#            tmpf.name by the train_unsupervised(...) call further down.
import tempfile

dfList = df.values.tolist()

# Lower-case every cell exactly once. str() guards against non-string cells
# (e.g. NaN, which pandas produces for blank CSV fields even with dtype=object);
# calling .lower() directly on those would raise AttributeError.
data = [[str(cell).lower() for cell in row] for row in dfList]

# fastText splits its input on whitespace, so each record is written as
# space-separated text. Writing the Python repr of the list (as before) embedded
# brackets, quotes and commas inside the tokens.
# delete=False keeps the file on disk after the `with` block so tmpf.name stays
# usable; remove it manually (os.remove(tmpf.name)) once training is done.
with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", newline="\n",
                                 delete=False) as tmpf:
    for record in data:
        tmpf.write(" ".join(record) + "\n")

In [None]:
# Train the fastText skip-gram model on the temp file written above, with the
# context window set to the number of columns in `df`.
# This call must run: `model_hosp` is used by FastTextNN(model_hosp) and
# model_hosp.get_word_vector(...) in later cells, and nothing else defines it,
# so leaving this commented out makes a fresh-kernel run fail with NameError.
model_hosp = train_unsupervised(input=tmpf.name, model='skipgram', ws=df.shape[1])

In [None]:
# Train gensim's FastText on `data` (one token list per record).
# The window equals the number of columns in `df`, i.e. the length of each
# record in `data`. min_count=1 keeps tokens that occur only once.
# NOTE(review): `iter` and `word_ngrams` are gensim 3.x keyword names — confirm
# the installed gensim version before upgrading.
context_window = df.shape[1]
model_hosp1 = FastText(
    data,
    sg=1,
    window=context_window,
    min_count=1,
    word_ngrams=1,
    iter=1000,
    workers=8,
)

from numpy import dot
from numpy.linalg import norm

class FastTextNN:
    """Brute-force cosine nearest-neighbour search over a fastText vocabulary.

    The wrapped model only needs to provide ``get_words()``,
    ``get_words(include_freq=True)``, ``get_dimension()`` and
    ``get_word_vector(word)``; those are the only calls made on it.
    """

    def __init__(self, ft_model, ft_matrix=None):
        """Cache the vocabulary, its frequencies and the word-vector matrix.

        ft_model:  the fastText model to query.
        ft_matrix: optional precomputed 2d array holding one row per entry of
                   ``ft_model.get_words()``, in that order. When omitted it is
                   built here by calling ``get_word_vector`` for every word.
        """
        self.ft_model = ft_model
        self.ft_words = ft_model.get_words()
        # get_words(include_freq=True) returns (words, frequencies); zip them
        # into a word -> frequency lookup used by nearest_words().
        self.word_frequencies = dict(zip(*ft_model.get_words(include_freq=True)))
        self.ft_matrix = ft_matrix
        if self.ft_matrix is None:
            self.ft_matrix = np.empty((len(self.ft_words), ft_model.get_dimension()))
            for i, word in enumerate(self.ft_words):
                self.ft_matrix[i, :] = ft_model.get_word_vector(word)

    def find_nearest_neighbor(self, query, vectors, n=10, cossims=None):
        """Rank the rows of ``vectors`` by cosine similarity to ``query``.

        query:   1d numpy array, the vector to search around.
        vectors: 2d numpy array, one candidate vector per row.
        n:       maximum number of neighbours to return.
        cossims: optional 1d array of precomputed dot products between each
                 row of ``vectors`` and ``query``. It is still divided by the
                 vector norms below, so pass raw dot products, not cosines.

        Returns a list of ``(row_index, cosine_similarity)`` pairs ordered from
        most to least similar. The single best-scoring row is always skipped,
        which drops the query itself when the query is one of the rows.
        NOTE(review): for a query that is not a row of ``vectors`` this skips
        its true nearest neighbour — confirm that is intended.
        Fewer than ``n`` pairs are returned when ``vectors`` has at most ``n``
        rows; an empty list is returned when nothing is left after the skip.
        """
        if cossims is None:
            cossims = np.matmul(vectors, query)

        # cos(q, v) = q.v / (|q| * |v|), computed for all rows at once.
        norms = np.sqrt((query**2).sum() * (vectors**2).sum(axis=1))
        cossims = cossims / norms

        # argpartition requires every kth index to be < len(cossims); clamp so
        # a request for more neighbours than there are rows no longer raises.
        k = min(n + 1, len(cossims))
        if k <= 1:
            return []
        # Passing range(k) as kth fully orders the first k positions.
        result_i = np.argpartition(-cossims, range(k))[1:k]
        return list(zip(result_i, cossims[result_i]))

    def nearest_words(self, word, n=15, word_freq=None):
        """Return ``(word, cosine_similarity)`` pairs for the neighbours of ``word``.

        word:      query string; its vector comes from ``get_word_vector``.
        n:         number of neighbours requested from find_nearest_neighbor.
        word_freq: when truthy, keep only neighbours whose vocabulary frequency
                   is >= this value. The filter runs after the ``n`` neighbours
                   are selected, so fewer than ``n`` pairs may come back.
        """
        result = self.find_nearest_neighbor(self.ft_model.get_word_vector(word), self.ft_matrix, n=n)

        if word_freq:
            return [(self.ft_words[r[0]], r[1]) for r in result if self.word_frequencies[self.ft_words[r[0]]] >= word_freq]
        else:
            return [(self.ft_words[r[0]], r[1]) for r in result]
# Wrap the fastText model so its vocabulary can be searched by cosine similarity.
nn = FastTextNN(model_hosp)
# Look up the neighbours of 'yxs' (presumably a deliberate misspelling of the
# dataset value "yes" — TODO confirm), keeping only words seen at least 15 times.
nn.nearest_words('yxs', n=15, word_freq=15)

In [None]:
# Display the fastText vector for "yas" (presumably a misspelling of "yes",
# used to check that an unseen spelling still gets a vector — TODO confirm).
probe_word = "yas"
model_hosp.get_word_vector(probe_word)

In [None]:
# Query neighbours through the model's KeyedVectors (`.wv`). Calling
# most_similar directly on the model is deprecated in gensim 3.x and removed
# in gensim 4.x; the `.wv` form works on both.
model_hosp1.wv.most_similar("a")

In [None]:
# Persist the trained gensim model to the working directory.
model_path = "HospitalFastText.w2v"
model_hosp1.save(model_path)