In [5]:
#Required packages
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import re
import os
import numpy as np
from multiprocessing import cpu_count
from helper_files import *
import pickle
from scipy.linalg import svd
import pandas as pd

In [None]:
#Reads in text of reviews and makes a word2vec embedding of all the words
# read_reviews is provided by the star-import of helper_files; presumably it returns
# an iterable of tokenized reviews (lists of words), the input Word2Vec expects -- TODO confirm.
directory = "reviews_3mo"
data = read_reviews(directory) #We only want the review text
num_cores = cpu_count() # One training worker per CPU reported by multiprocessing
# NOTE(review): the `size` keyword is the gensim 3.x spelling; gensim 4.x renamed it to
# `vector_size` -- confirm which gensim version this notebook is pinned to.
# min_count = 1 keeps every word in the vocabulary, so any word seen in the reviews can be looked up later.
model = Word2Vec(data, size = 50, window = 5, min_count = 1, workers = num_cores)
model.save('word2vec_test.model')
model.wv.save('wordvecs_test.kv') #Save keyed vectors as well

In [3]:
#Functions for extracting keywords from a corpus
def gen_keywordValues(data):
    """Build a review-by-word count matrix for a tokenized corpus.

    Parameters
    ----------
    data : sequence of sequences of hashable words
        One inner sequence per review. It is iterated twice and ``len(data)``
        is taken, so it must be a real sequence, not a one-shot generator.

    Returns
    -------
    tuple ``(W, data, word_list)``
        ``W`` is a float array of shape ``(len(data), len(word_list))`` where
        ``W[i, j]`` is the number of times ``word_list[j]`` occurs in review
        ``i``. ``data`` is returned unchanged. ``word_list`` holds the distinct
        words in order of first appearance.
    """
    # Map each distinct word to its column. A dict gives O(1) membership tests
    # and keeps first-appearance order.
    word_index = {}
    for review in data:
        for word in review:
            if word not in word_index:
                word_index[word] = len(word_index)
    word_list = list(word_index)

    W = np.zeros((len(data), len(word_list)))
    for i, review in enumerate(data):
        for word in review:
            # The column must be the word's vocabulary index, not its position
            # inside the review, so that W's columns line up with word_list.
            W[i, word_index[word]] += 1
    return (W, data, word_list)

def keyword_extraction(data, t = 5, k = 2):
    """Return the ``t`` highest-weighted words of a corpus.

    The review-by-word matrix from ``gen_keywordValues`` is decomposed with an
    SVD. Each word's weight is its component in the right singular vector that
    belongs to the largest singular value.

    Parameters
    ----------
    data : sequence of sequences of words
        Tokenized reviews, passed straight to ``gen_keywordValues``.
    t : int, optional
        Maximum number of keywords to return. It is clamped to the vocabulary
        size, and values <= 0 yield an empty list.
    k : int, optional
        Accepted for backward compatibility only; it does not influence the
        result (the original code clamped it and never used it).

    Returns
    -------
    list of ``(word, weight)`` tuples
        At most ``t`` entries in ascending order of weight, so the strongest
        keyword is last. Empty when the corpus has no words.
    """
    W, sentences, word_list = gen_keywordValues(data)
    t = min(t, len(word_list))
    if t <= 0:
        # Nothing to rank. This also avoids calling svd on an empty matrix and
        # the `[-0:]` slice, which would return every word.
        return []

    # Only the leading right singular vector is needed, so skip the full
    # (n x n) and (m x m) factors.
    _, _, vh = svd(W, full_matrices=False)
    # scipy returns singular values in non-increasing order, so row 0 of vh
    # belongs to the largest one.
    v = vh[0, :]

    # Singular vectors are only defined up to sign. Orient v so that its mass
    # is positive. Testing the sum instead of "every entry <= 0" still works
    # when rounding noise leaves a few tiny entries with the opposite sign.
    if v.sum() < 0:
        v = -v

    v_ind = np.argsort(v)
    return([(word_list[w], v[w]) for w in v_ind[-t:]])

In [4]:
#Reads in text of reviews and saves a pickle file for each business containing a list of their keywords and weights
directory = "reviews_3mo"
output_dir = "keywords"
# open() does not create missing directories, so make sure the target exists
# before the loop; otherwise a fresh checkout fails on the first business.
os.makedirs(output_dir, exist_ok=True)
# read_dir comes from the star-import of helper_files; each result is indexed as
# r[0] = review data and r[1] = business id.
results = read_dir(directory)
for r in results:
    business_id = r[1]
    data = r[0]
    print("Starting File: "+business_id)
    keywords = keyword_extraction(data)
    filename = business_id + "_keywords.pkl"
    with open(os.path.join(output_dir, filename),'wb') as f:
        pickle.dump(keywords,f)

Starting File: -1vfRrlnNnNJ5boOVghMPA
Starting File: -AD5PiuJHgdUcAK-Vxao2A
Starting File: -BS4aZAQm9u41YnB9MUASA
Starting File: -C8sSrFqaCxp51pyo-fQLQ
Starting File: -CbDQXiuKzPQ0_jiUz03aw
Starting File: -ed0Yc9on37RoIoG2ZgxBA
Starting File: -FcZY7a7qgxTUlTvwuyJnQ
Starting File: -K82LBrI3H0FVuhTbNDpRA
Starting File: -n8PaipyFGNUJE4xlnrOAg
Starting File: -RJ216TTIghZshCkUlD1WQ
Starting File: -wCtRhzWJ40Z4F8mmg7kWg
Starting File: -_TSaVr53qiEGqMkwyEMaQ
Starting File: 07AZL5XenCQ_-op_onKLdw
Starting File: 07jQarxKd_8J_AJHD5YBqQ
Starting File: 0bWLkWeIcT-EnrE7SiGEBA
Starting File: 0EgYXYjt2XJL4hlsKnzrcw
Starting File: 0FHfkDxKSeDuyAEeaY6X-Q
Starting File: 0Hcks5q-DgJbDlmEWolx4A
Starting File: 0InA3Ffj8MxyQV9hgsyYcQ
Starting File: 0jJfQEWEe-6ODl62XSynDw
Starting File: 0keht9nVTliinChbGWWkkg
Starting File: 0nlocywsFHQmEt8_xbHNiw
Starting File: 0NmTwqYEQiKErDv4a55obg
Starting File: 0qet57CmMA5qUm6gPFUTpg
Starting File: 0qPNeqLuKE88rKbIbSZDpg
Starting File: 0VjHFdczi6Nln_nn8bucJQ
Starting Fil

Starting File: BslEhCyzaQPfpHtDogb4hQ
Starting File: BuHbMNbCQzzYi-3Y5w1c4Q
Starting File: bUnm0-YK5HicLzkvMWwqjQ
Starting File: bvGdqJ-SeGdIHX-HEabwlg
Starting File: bw7LEtZ5ozq8qOBA6FowYA
Starting File: bxCKBd1qsafnKXlWrCK20g
Starting File: By7P2EBBvhqoSDj8PnQa8g
Starting File: bz69msmKwRjWQzN-XwyV6Q
Starting File: B_WggEKFq-ZFNui8CHPYvA
Starting File: c-NXKTJ0jrrusTPxJAUwvA
Starting File: C-u6Ywuq192icOblKm6oKg
Starting File: C7X8VCpbBwTP2lFR354EIA
Starting File: C9ImzBi5fn742ZcAYDww2A
Starting File: CBAnjb88MojpentaNPQWVA
Starting File: CeqWpwHBoaxwRcv5btnv6g
Starting File: cgZOP1ZUBOZmEQMSJ53oiQ
Starting File: CiYLq33nAyghFkUR15pP-Q
Starting File: ckke8MRDHVLJdNa0quxOEg
Starting File: Cl-xl1vTUwHeaGgBxzdTRA
Starting File: coBLz73uWQp1gbMQGkvM3g
Starting File: cOJ1uIVIHCiefUyWG2wDfw
Starting File: CoyeXg8FBsS_d20QzNIy-A
Starting File: Cpb142L849ilMzgiSR4a0w
Starting File: CqVdLcbyZzfAjeCD1qEBZQ
Starting File: cSUWEbmMyJAv87Yd9LnvUA
Starting File: Cv6U-7HrrkKuh-_ec7EktQ
Starting Fil

Starting File: iTCH7y2KLRMKMxdEkgUK1g
Starting File: ItqPtxnayraXSlBS0EMOgg
Starting File: iuaFpqtQOMgX5y56lIUrxg
Starting File: IVnGPHdTyu_GbLo9mXj98w
Starting File: iw9p7E4bMe1YW-APcFGjgQ
Starting File: IWN2heYitkg-D4UdqfxcMA
Starting File: IZivKqtHyz4-ts8KsnvMrA
Starting File: j2w6wlRmlaZaaErQus8ElQ
Starting File: J7EUrh1q3lgENaoTrHZl_w
Starting File: j7HO1YeMQGYo3KibMXZ5vg
Starting File: jBh399TajGcH28Zo2J1pHw
Starting File: JC7HF7jRVqULhTOTE2n6ug
Starting File: JcFyuyo0Rt6gDiVZ9yJ3VQ
Starting File: JhQvfo_WfX10TG9_7VO14g
Starting File: JhRxDdmZSGDG_u7cGxaYrw
Starting File: JJAwT9R6Fp2yyWHtWFA8uQ
Starting File: JjjlhlQ2LEaQMuAug4fDIA
Starting File: JJzAOl3kQLfhOzB_vjf3FQ
Starting File: jLxeBgWhLRbII2ACkgH1Sg
Starting File: jnroKk9qi0FR2bFSkxYZNg
Starting File: jPvn-C8kNUcyGfjUtOc_oA
Starting File: JRh14J_be0jl7Wbt412vDA
Starting File: jSqfGuZXwcKBMWV71y11Tg
Starting File: Jt28TYWanzKrJYYr0Tf1MQ
Starting File: jXyy2Vc-Dgp7QguQJSfnXw
Starting File: Jy6ru0Hz-9Yc5v006-YmUQ
Starting Fil

Starting File: RVt3dKDevhEs0gVMYeL6_w
Starting File: rVZfapnQgyVLsNWA583u-A
Starting File: RwMLuOkImBIqqYj4SSKSPg
Starting File: rYG3Bbsf3u2P4N4ZxQXyKw
Starting File: S-egzF2c7bRpCUNbZqV8dg
Starting File: s0vNtuLs1cwEqHTXKiS05w
Starting File: s2oTzh21fWMXy4YrSAq_wQ
Starting File: S6GcR_UsJhVkNoypqfx7Lw
Starting File: SAIrNOB4PtDA4gziNCucwg
Starting File: sba0fwlKAZFGWWtsf5Jbwg
Starting File: SbS0cnxFStrlidtwek6b6w
Starting File: SDtIdchX9WpKlH5Yst8kjQ
Starting File: Se-co1eXBho5A4YwMfKMIQ
Starting File: SeNOJ2zYHziptxLuiRINLg
Starting File: sf3kp0H13jZYEmS1A8Etcw
Starting File: SGPjYRy6x7R5ZlGa_nrskg
Starting File: sHsjBV5_kuuC2aU2PuSlUg
Starting File: shSYKGuSzXpctURCR9kCFQ
Starting File: sIr0Wp13Z_MqwgcNvF-5ew
Starting File: sJBem81QF7l_43aF7xtLYQ
Starting File: SJU-jRAZS0cXoBGUjX5GUg
Starting File: SnMUTGhY2nW2Ldb0QJAjAg
Starting File: sNVGdeOPeitJ3OWUQBINzQ
Starting File: sOYsxYYFl03PhHmz_rBDZQ
Starting File: SQEzO0MiY6-M1VwbYcbWtg
Starting File: SqxIx0KbTmCvUlOfkjamew
Starting Fil

Starting File: zK7sltLeRRioqYwgLiWUIA
Starting File: ZMIcuo-aQja0ITk4-TU8Jg
Starting File: ZnUBkQrpRQDk_Y0hPtlgNw
Starting File: znWHLW1pt19HzW1VY6KfCA
Starting File: ZoCCW3R_LEAzktU0rMbX8w
Starting File: ZOmf-3NN4Z59b2Fw6VAM7g
Starting File: zpoZ6WyQUYff18-z4ZU1mA
Starting File: zRqi6L1u-YmmVAHjeUbGMQ
Starting File: ZuNJelvkJD4wsXNBRg6t5w
Starting File: ZW5ReCY3B9_TekUYWhr7HA
Starting File: zwNC-Ow4eIMan2__bS9-rg
Starting File: zXBrfL_ohYMuBZWvF8EcSA
Starting File: zY6f_cFY8HTI3hLlJj3SOg
Starting File: _7kVnn5_19ckuFPS9PCrpQ
Starting File: _8mBVtt6yhkS0YtT1FGBNg
Starting File: _hML_SleB40KkUrMHOZ_-Q
Starting File: _ixV2SWDy7w8jzEAHp85qA
Starting File: _JbuDLG7SFEtiCVDT1M0CQ
Starting File: _PQ1cKw_ZtKrXNxCMgIeYA
Starting File: _R76PYZqTRvw840SP6s5GA
Starting File: _RxbfEugCvI93ByH2Bg_oA
Starting File: _sh6mIBWZis66mAjkjN8Qg
Starting File: _V0yJdpXrbdKzBDoVSJGWA
Starting File: _w5hBpkjHs5_Hv3pLeHtIw
Starting File: _y1Et7f4NE6D1P1GzphlIw


In [9]:
#Code for converting keywords to vector
def create_vec(filename, kv, weighted=True):
    """Combine a business's pickled keywords into a single embedding vector.

    Parameters
    ----------
    filename : str
        Path to a pickle file holding a list of ``(word, weight)`` pairs.
    kv : mapping from word to vector
        Anything indexable as ``kv[word]`` that returns a numeric vector, such
        as gensim ``KeyedVectors``. A word missing from ``kv`` raises whatever
        ``kv[word]`` raises.
    weighted : bool, optional
        When True (default) each word vector is scaled by its keyword weight
        after the weights are L2-normalised. The original code computed these
        normalised weights but never applied them. Pass False to get the plain
        unweighted sum it actually returned.

    Returns
    -------
    numpy.ndarray
        The (weighted) sum of the keyword vectors. For an empty keyword list a
        zero vector of length ``kv.vector_size`` is returned (length 0 if
        ``kv`` has no such attribute).
    """
    # NOTE: pickle.load can execute arbitrary code; only open files written by
    # this pipeline.
    with open(filename,'rb') as f:
        keywords = pickle.load(f)

    if not keywords:
        return np.zeros(getattr(kv, 'vector_size', 0))

    if weighted:
        weights = np.asarray([w[1] for w in keywords], dtype=float)
        norm = np.linalg.norm(weights)
        # Skip normalisation for all-zero weights to avoid dividing by zero.
        if norm > 0:
            weights = weights/norm
    else:
        weights = np.ones(len(keywords))

    # Accumulate into a fresh array so the caller never gets a view into kv's
    # own storage.
    vec = np.zeros(np.shape(kv[keywords[0][0]]))
    for k, weight in zip(keywords, weights):
        vec = vec + weight * kv[k[0]]
    return(vec)

def get_vecs(directory,kv_name):
    """Build a DataFrame of business embedding vectors from keyword pickles.

    Parameters
    ----------
    directory : str
        Folder containing ``<business_id>_keywords.pkl`` files.
    kv_name : str
        Path of the saved gensim ``KeyedVectors`` to load (memory-mapped
        read-only).

    Returns
    -------
    pandas.DataFrame
        One column per business id, holding the vector returned by
        ``create_vec`` for that business's keyword file.
    """
    suffix = '_keywords.pkl'
    kv = KeyedVectors.load(kv_name, mmap = 'r')
    dic = {}
    # os.listdir order is arbitrary; sorting makes the column order reproducible.
    for filename in sorted(os.listdir(directory)):
        # Skip anything that is not a keyword pickle. With str.find a
        # non-matching name returns -1, which silently dropped the last
        # character of the name and then tried to unpickle an unrelated file.
        if not filename.endswith(suffix):
            continue
        business_id = filename[:-len(suffix)]
        dic[business_id] = create_vec(os.path.join(directory, filename), kv)
    df = pd.DataFrame.from_dict(dic)
    return(df)

In [10]:
# Build one embedding vector per business from the pickled keyword files and
# save the resulting DataFrame to a user-chosen path.
directory = "keywords"
kv_name = "wordvecs_test.kv"
df = get_vecs(directory,kv_name)
# input() waits for interactive entry, so this cell cannot finish in an unattended "Run All".
file = input('Please input path/filename for vector embedding dataframe file: ')
with open(file,'wb') as f:
    pickle.dump(df,f)

NameError: name 'raw_input' is not defined