In [None]:
#Required packages
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import re
import os
import numpy as np
from multiprocessing import cpu_count
from helper_files import *
import pickle
from scipy.linalg import svd
import pandas as pd

In [None]:
#Reads in text of reviews and makes a word2vec embedding of all the words
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) — confirm the gensim version this notebook is pinned to.
directory = "reviews_3mo"
data = read_reviews(directory) #We only want the review text
num_cores = cpu_count()
# min_count = 1 keeps every word, including those that occur only once, so
# presumably every keyword extracted later has a vector — verify.
model = Word2Vec(data, size = 50, window = 5, min_count = 1, workers = num_cores)
# The full model and the bare keyed vectors are written to separate files;
# later cells only load 'wordvecs_test.kv'.
model.save('word2vec_test.model')
model.wv.save('wordvecs_test.kv') #Save keyed vectors as well

In [None]:
#Functions for extracting keywords from a corpus
def gen_keywordValues(data):
    """Build a sentence-by-word count matrix for a tokenised corpus.

    Parameters
    ----------
    data : sequence of sequences of hashable tokens
        One inner sequence per sentence/review. Must support ``len()``.

    Returns
    -------
    (W, data, word_list)
        ``W`` is a float array of shape ``(len(data), len(word_list))`` where
        ``W[i, j]`` is the number of times ``word_list[j]`` occurs in
        ``data[i]``; ``data`` is returned unchanged; ``word_list`` holds the
        distinct words in order of first appearance.
    """
    # A dict gives O(1) membership tests and keeps first-seen order, so the
    # vocabulary order matches what the list-based scan would have produced.
    word_index = {}
    for sentence in data:
        for word in sentence:
            if word not in word_index:
                word_index[word] = len(word_index)

    W = np.zeros((len(data), len(word_index)))
    for i, sentence in enumerate(data):
        for word in sentence:
            # Column is the word's vocabulary index, not its position in the
            # sentence; repeated words accumulate into the same cell.
            W[i, word_index[word]] += 1

    word_list = list(word_index)
    return(W,data,word_list)

def keyword_extraction(data, t = 5, k = 2):
    """Return the ``t`` highest-weighted words of a tokenised corpus.

    The weights are the entries of the leading right singular vector of the
    matrix produced by ``gen_keywordValues``.

    Parameters
    ----------
    data : sequence of sequences of tokens
        One inner sequence per sentence/review.
    t : int
        Maximum number of keywords to return (clamped to the vocabulary size).
    k : int
        Accepted for backward compatibility; it does not influence the
        returned keywords.

    Returns
    -------
    list of (word, weight)
        At most ``t`` pairs, ordered by ascending weight (strongest keyword
        last). Empty when the corpus has no words or ``t`` is not positive.
    """
    W,sentences,word_list = gen_keywordValues(data)
    if not word_list:
        # svd() cannot factor an empty matrix; there is nothing to rank.
        return([])
    t = min(t, len(word_list))
    if t <= 0:
        # Guard against v_ind[-0:], which would return every word.
        return([])

    # Only the leading singular vector is needed, so skip building the full
    # square U and V matrices.
    _,s,v = svd(W, full_matrices = False)
    # scipy returns singular values in non-increasing order: row 0 is the
    # dominant component.
    v = v[0,:]
    # The sign of a singular vector is arbitrary. Orient it by the sign of its
    # sum so that round-off noise in near-zero entries cannot leave the whole
    # vector negative and invert the ranking.
    if v.sum() < 0: v = v*-1
    v_ind = np.argsort(v)
    return([(word_list[w], v[w]) for w in v_ind[-t:]])

In [None]:
#Reads in text of reviews and saves a pickle file for each business containing a list of their keywords and weights
directory = "reviews_3mo"
output_dir = "keywords"
results = read_dir(directory)
# Create the output folder up front so a fresh checkout does not fail on the
# first open(); exist_ok makes the cell safe to re-run.
os.makedirs(output_dir, exist_ok=True)
for r in results:
    # Each record carries the tokenised reviews at position 0 and the business
    # id at position 1.
    business_id = r[1]
    data = r[0]
    print("Starting File: "+business_id)
    keywords = keyword_extraction(data)
    filename = business_id + "_keywords.pkl"
    with open(os.path.join(output_dir, filename),'wb') as f:
        pickle.dump(keywords,f)

In [None]:
#Code for converting keywords to vector
def create_vec(filename,kv,weighted=True):
    """Combine the word vectors of a business's keywords into one vector.

    Parameters
    ----------
    filename : str
        Path to a pickle holding a list of ``(word, weight)`` pairs.
        NOTE: ``pickle.load`` can execute arbitrary code; only point this at
        files produced by this notebook.
    kv : mapping
        Anything indexable by word that yields a numeric vector (for example
        gensim ``KeyedVectors``).
    weighted : bool, default True
        When true, each word vector is scaled by its keyword weight after the
        weights are L2-normalised. Pass ``False`` for a plain, unweighted sum.

    Returns
    -------
    numpy.ndarray
        The combined vector, as a new array that does not alias ``kv``.
        An empty list is returned when the pickle holds no keywords.
    """
    with open(filename,'rb') as f:
        keywords = pickle.load(f)
    if not keywords:
        return([])

    weights = np.asarray([w[1] for w in keywords], dtype=float)
    norm = np.linalg.norm(weights)
    if weighted and norm > 0:
        weights = weights/norm
    else:
        # Either an unweighted sum was requested, or every weight is zero and
        # normalising would divide by zero: fall back to equal weights.
        weights = np.ones(len(keywords))

    vec = None
    for k, weight in zip(keywords, weights):
        # float() keeps the word vectors' own dtype instead of promoting to
        # the dtype of the weight array.
        term = float(weight) * np.asarray(kv[k[0]])
        # An explicit None check replaces any(vec), which mistook an all-zero
        # running sum for "nothing accumulated yet".
        vec = term if vec is None else vec + term
    return(vec)

def get_vecs(directory,kv_name):
    """Build a DataFrame of keyword embeddings, one column per business.

    Parameters
    ----------
    directory : str
        Folder containing ``<business_id>_keywords.pkl`` files.
    kv_name : str
        Path to the saved keyed vectors, loaded read-only via memory mapping.

    Returns
    -------
    pandas.DataFrame
        Columns are business ids (sorted by file name); each column holds the
        vector returned by ``create_vec`` for that business.
    """
    suffix = '_keywords.pkl'
    kv = KeyedVectors.load(kv_name, mmap = 'r')
    dic = {}
    # sorted() makes the column order deterministic across platforms.
    for filename in sorted(os.listdir(directory)):
        # Ignore anything that is not a keyword pickle (hidden files,
        # checkpoint folders, ...); previously find() returned -1 for these,
        # which chopped the last character off the name and unpickled the file.
        if not filename.endswith(suffix):
            continue
        business_id = filename[:-len(suffix)]
        file = os.path.join(directory, filename)
        dic[business_id] = create_vec(file,kv)
    df = pd.DataFrame.from_dict(dic)
    return(df)

In [None]:
# Build the business -> keyword-embedding table and pickle it to a path typed
# in by the user.
directory = "keywords"
kv_name = "wordvecs_test.kv"
df = get_vecs(directory,kv_name)
# NOTE(review): input() blocks a non-interactive "Restart & Run All". A later
# cell reads 'word2vec_keywordEmbeddings.pkl', so that is presumably the path
# expected here — confirm.
file = input('Please input path/filename for vector embedding dataframe file: ')
with open(file,'wb') as f:
    pickle.dump(df,f)

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [None]:
#Visualize word embeddings
word_vecs = KeyedVectors.load('wordvecs_test.kv')
# Materialise the vocabulary as a list: `vocab` is a dict, which has no
# .index() method (used by the plotting cell below). A list also gives the
# rows of X an explicit order.
words = list(word_vecs.vocab)
X = word_vecs[words]

In [None]:
# Project the word vectors to 2-D. t-SNE is stochastic, so the seed is fixed
# to make the layout reproducible across kernel restarts.
tsne = TSNE(n_components=2, random_state=0)
X_tsne = tsne.fit_transform(X)
# list(words) yields the vocabulary words whether `words` is a dict or a list,
# so every row is labelled with its word.
tsne_df = pd.DataFrame(X_tsne, index=list(words), columns=['x', 'y'])

In [None]:
word_list = ['japanese', 'pizza', 'sushi', 'italian', 'mexican', 'chinese', 'delicious', 'cheap', 'expensive', 'car',
             'bike', 'clean', 'quick', 'forever', 'wait', 'taste', 'burger', 'dog']
# tsne_df is indexed by word, so select rows by label rather than by position
# in the vocabulary. Words absent from the trained vocabulary are reported and
# skipped instead of aborting the whole plot.
missing_words = [w for w in word_list if w not in tsne_df.index]
if missing_words:
    print("Skipping words not in the vocabulary: " + ", ".join(missing_words))
plot_words = [w for w in word_list if w in tsne_df.index]
points = tsne_df.loc[plot_words, ['x', 'y']]

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(points['x'], points['y'])
for row in points.itertuples():
    # annotate() needs the full (x, y) position of the point being labelled,
    # taken from the t-SNE frame (not the keyword-embedding frame `df`).
    ax.annotate(row.Index, (row.x, row.y))
plt.show()

In [None]:
# For every probe word, print the word and then the result of most_similar()
# on the following line.
for word in word_list:
    sim = word_vecs.most_similar(positive = word)
    print(word, sim, sep = "\n")

In [None]:
#Get ratings for TSNE OF keyword embeddings
directory = 'reviews_3mo'
data = read_dir(directory)
# Each record holds the business id at position 1 and its rating at position 2;
# a repeated id keeps the rating of its last record.
ratings = {record[1]: float(record[2]) for record in data}

In [None]:
#Visualize tsne of keyword embeddings
# Pickle data is binary, so the file must be opened with 'rb'; text mode makes
# pickle.load fail on Python 3.
with open('word2vec_keywordEmbeddings.pkl', 'rb') as f:
    data = pickle.load(f)
cols = list(data.columns)
# The frame stores one business per column, while t-SNE treats each row as a
# sample. Transpose so that every business becomes one point and the number of
# points matches the number of rating colours below.
X = data[cols].T
c = [ratings[col] for col in cols]
# Fixed seed so the stochastic layout is reproducible.
tsne = TSNE(n_components = 2, random_state = 0)
X_tsne = tsne.fit_transform(X)
tsne_df1 = pd.DataFrame(X_tsne, index = cols, columns = ['x','y'])
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
plot = ax.scatter(tsne_df1['x'],tsne_df1['y'], c = c)
plt.colorbar(plot)
plt.show()