# Record Embedding

In [1]:
import pandas as pd
import numpy as np
import os
import calendar
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

## Embedding using fastText
Details here: https://fasttext.cc/

In [2]:
import warnings
warnings.filterwarnings('ignore')
import fastText
from fastText import train_unsupervised
import gensim
from gensim.models import FastText

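Load the dataset and shuffle its rows. Each tuple (row) is later converted into one "sentence" whose tokens are the row's attribute values.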

In [3]:

# Load the census records with every column kept as an object/string column
# (dtype=object), because the later cells call .lower() on each cell value.
# A different input file was used previously: 'truth_values_1100_tuples.csv'.
SHUFFLE_SEED = 42  # fixed seed so Restart & Run All shuffles the rows identically
df = pd.read_csv("adult.csv", dtype=object, encoding='utf8')
# frac=1 samples every row, i.e. shuffles the frame; the index is rebuilt afterwards.
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df.head(3)

Unnamed: 0.1,Unnamed: 0,age,workclass,fnlwgt,education,educationnum,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,nativecountry,income
0,12774,25,private,57635,assoc-voc,11,married-civ-spouse,sales,wife,white,female,0,0,42,united-states,>50k
1,32523,27,private,177398,hs-grad,9,never-married,other-service,unmarried,white,female,0,0,64,united-states,<=50k
2,12905,43,private,162108,bachelors,13,never-married,adm-clerical,not-in-family,white,female,0,0,40,united-states,<=50k


In [4]:
# Build the training corpus: every row becomes one "sentence" whose tokens are
# the lower-cased attribute values of that row.
import tempfile

dfList = df.values.tolist()
# str() guards against missing values: with dtype=object an empty CSV field is
# a float NaN, which has no .lower() method.
data = [[str(value).lower() for value in row] for row in dfList]

# Also write the corpus to a temp file for trainers that read from a path.
# One row per line with whitespace-separated tokens (the previous version wrote
# the Python list repr, so brackets/quotes/commas became part of the tokens).
# delete=False keeps the file on disk after the `with` block so that
# `tmpf.name` can still be handed to a trainer afterwards.
with tempfile.NamedTemporaryFile(delete=False) as tmpf:
    for tokens in data:
        tmpf.write((" ".join(tokens) + "\n").encode())

In [5]:
#model_hosp = train_unsupervised(input=tmpf.name, model='skipgram', ws=df.shape[1])

In [6]:
# Train a gensim FastText model on the row sentences.
#   sg=1                -> skip-gram training
#   window=df.shape[1]  -> context window as wide as the number of columns
#   min_count=1         -> no token is dropped for being rare
# NOTE(review): `iter` is the gensim 3.x keyword; gensim 4 renamed it to
# `epochs` - confirm which gensim version this notebook targets.
model_census = FastText(
    data,
    sg=1,
    window=df.shape[1],
    min_count=1,
    iter=500,
    workers=8,
)

from numpy import dot
from numpy.linalg import norm

class FastTextNN:
    """Brute-force cosine nearest-neighbour search over a fastText vocabulary.

    ``ft_model`` must provide ``get_words()``, ``get_words(include_freq=True)``,
    ``get_dimension()`` and ``get_word_vector(word)``; these are the only model
    methods this class calls.
    """

    def __init__(self, ft_model, ft_matrix=None):
        """Cache the vocabulary, word frequencies and the word-vector matrix.

        ft_model:  model object exposing the methods listed in the class docstring.
        ft_matrix: optional precomputed 2d array with one row per vocabulary
                   word (same order as ``ft_model.get_words()``). When omitted
                   it is built by querying the model for every word.
        """
        self.ft_model = ft_model
        self.ft_words = ft_model.get_words()
        # get_words(include_freq=True) yields (words, frequencies); zip them
        # into a word -> frequency lookup.
        self.word_frequencies = dict(zip(*ft_model.get_words(include_freq=True)))
        self.ft_matrix = ft_matrix
        if self.ft_matrix is None:
            self.ft_matrix = np.empty((len(self.ft_words), ft_model.get_dimension()))
            for i, word in enumerate(self.ft_words):
                self.ft_matrix[i, :] = ft_model.get_word_vector(word)

    def find_nearest_neighbor(self, query, vectors, n=10, cossims=None):
        """Return the ``n`` rows of ``vectors`` most similar to ``query``.

        query:   1d numpy array, the vector to find neighbours for.
        vectors: 2d numpy array, one candidate vector per row.
        n:       number of neighbours to return (fewer are returned when
                 ``vectors`` has fewer than ``n + 1`` rows).
        cossims: optional 1d array of precomputed dot products between each
                 row of ``vectors`` and ``query``; it is normalised here, so
                 it must not already be divided by the vector norms.

        Returns a list of ``(row_index, cosine_similarity)`` pairs ordered from
        most to least similar. The single best match is skipped, on the
        assumption that ``query`` itself is one of the rows of ``vectors``.
        """
        if n <= 0 or len(vectors) == 0:
            return []

        if cossims is None:
            # One matrix-vector product gives every dot product at once.
            cossims = np.matmul(vectors, query)

        # cos(q, v) = q.v / (|q| * |v|)
        norms = np.sqrt((query ** 2).sum() * (vectors ** 2).sum(axis=1))
        cossims = cossims / norms

        # Passing several kth positions makes argpartition put the top `top`
        # entries in sorted order; clamp so kth never exceeds the array length.
        top = min(n + 1, len(cossims))
        result_i = np.argpartition(-cossims, np.arange(top))[1:top]
        return list(zip(result_i, cossims[result_i]))

    def nearest_words(self, word, n=15, word_freq=None):
        """Return ``(neighbour_word, cosine_similarity)`` pairs for ``word``.

        word:      query word; its vector is taken from the model.
        n:         number of neighbours requested before frequency filtering.
        word_freq: when truthy, keep only neighbours whose recorded frequency
                   is at least this value (so fewer than ``n`` may be returned).
        """
        result = self.find_nearest_neighbor(
            self.ft_model.get_word_vector(word), self.ft_matrix, n=n
        )

        if word_freq:
            return [
                (self.ft_words[r[0]], r[1])
                for r in result
                if self.word_frequencies[self.ft_words[r[0]]] >= word_freq
            ]
        return [(self.ft_words[r[0]], r[1]) for r in result]
# `model_hosp` exists only if a model with that name has been trained in this
# kernel (the train_unsupervised line that would create it is commented out
# above). Guard the lookup so Restart & Run All does not stop with a NameError.
nn = FastTextNN(model_hosp) if "model_hosp" in globals() else None
# Kept as the final expression so the neighbours render as the cell output;
# evaluates to None (no output) when the model is unavailable.
nn.nearest_words('yxs', word_freq=15) if nn is not None else None

In [7]:
# Query a deliberately misspelled token: it is not one of the training values,
# so its neighbours show how the model places an unseen spelling.
# Uses the KeyedVectors API (`.wv`); the model-level `most_similar` shim is
# deprecated in gensim 3.x and absent in gensim 4.
model_census.wv.most_similar("some-colage".lower())

[('some-college', 0.9959443211555481),
 ('10', 0.8455513715744019),
 ('bachelors', 0.4885871708393097),
 ('10th', 0.4875778555870056),
 ('assoc-acdm', 0.4788847863674164),
 ('assoc-voc', 0.4679119288921356),
 ('masters', 0.46051493287086487),
 ('11th', 0.4583005905151367),
 ('7th-8th', 0.449377179145813),
 ('hs-grad', 0.42940622568130493)]

In [8]:
# Persist the trained census model to disk under a fixed file name.
census_model_path = "AdultFastText.w2v"
model_census.save(census_model_path)