In [1]:
import cPickle as pickle
import os, sys, multiprocessing, time
import tensorflow as tf
sys.path.append('/Users/matt.meng/dev/word2dev_model')
from graph_model import word2vec
from model_utils import create_local_model_path, create_local_log_path


In [13]:
def build_word2vec_model(model_name='word2vec', use_gpu=False):
    """Restore a trained word2vec model configured for evaluation.

    Args:
        model_name: name handed to create_local_model_path and
            create_local_log_path to locate the model under
            ~/local_tensorflow_content. Defaults to 'word2vec' so the
            notebook's argument-less call keeps working.
            NOTE(review): the default is inferred from the restore path shown
            in the notebook output -- confirm it is the intended model.
        use_gpu: when True, build a session config that caps per-process GPU
            memory at 50%. When False (the default, and the previous
            hard-coded behaviour) hide every CUDA device and run on CPU.

    Returns:
        The word2vec instance built with restore_model=True and eval_mode=True.

    Side effects:
        With use_gpu=False this sets os.environ['CUDA_VISIBLE_DEVICES'] to
        '-1' for the whole process.
    """
    num_threads = 2 * multiprocessing.cpu_count() - 1
    common_path = os.path.join(os.path.expanduser("~"), 'local_tensorflow_content')

    model_config = {
        'model_name': model_name,
        'restore_model': True,
        'eval_mode': True,
    }

    if use_gpu:
        model_config['sess_config'] = tf.ConfigProto(
            log_device_placement=False,
            gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.5))
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # the only way to completely not use GPU
        model_config['sess_config'] = tf.ConfigProto(intra_op_parallelism_threads=num_threads)

    model_config['model_path'] = create_local_model_path(common_path, model_name)
    model_config['log_path'] = create_local_log_path(common_path, model_name)

    return word2vec(**model_config)

def collect_key_from_pickle_file(titles_pickle_file, title_key):
    """Load a pickled mapping from the home directory and return one entry.

    Args:
        titles_pickle_file: pickle file path, joined onto the user's home
            directory.
        title_key: key to look up in the unpickled object.

    Returns:
        The value stored under title_key.
    """
    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    home_dir = os.path.expanduser("~")
    source_path = os.path.join(home_dir, titles_pickle_file)
    with open(source_path, 'rb') as handle:
        return pickle.load(handle)[title_key]


def collect_multi_keys_from_pickle_file(titles_pickle_file, key_dict):
    """Load a pickled mapping from the home directory and pick several entries.

    Args:
        titles_pickle_file: pickle file path, joined onto the user's home
            directory.
        key_dict: maps each output name to the key under which the value is
            stored in the pickle.

    Returns:
        A new dict holding, for every output name in key_dict, the value
        found in the pickle under the corresponding stored key.
    """
    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    source_path = os.path.join(os.path.expanduser("~"), titles_pickle_file)
    with open(source_path, 'rb') as handle:
        stored = pickle.load(handle)

    return {name: stored[stored_key] for name, stored_key in key_dict.items()}


class ProcessedTitle(object):
    """A tokenised title with its metadata, readable text and embeddings.

    Attributes:
        index_title: the token ids making up the title.
        url: value supplied by the caller for this title.
        pageView: value supplied by the caller for this title.
        title: the token ids mapped through reverse_token_dict and joined
            with single spaces.
        max_vector, min_vector, mean_vector: the three values returned by the
            model's predict(); None until create_word2vec_embeddings is called.
    """

    # Token-id -> word lookup shared by all instances. It must be assigned
    # (create_title_dict_with_word2vec does this) before constructing titles.
    reverse_token_dict = None
    # Stand-in used in `title` for token ids absent from reverse_token_dict,
    # so one unknown id no longer makes the join fail on None.
    UNKNOWN_TOKEN = '<UNK>'

    def __init__(self, index_title, url, pageView):
        """Store the raw fields and build the readable title.

        Raises:
            AttributeError: if ProcessedTitle.reverse_token_dict has not been
                assigned yet.
        """
        if ProcessedTitle.reverse_token_dict is None:
            raise AttributeError(
                "ProcessedTitle.reverse_token_dict must be set before creating titles")
        self.index_title = index_title
        self.url = url
        self.pageView = pageView
        lookup = ProcessedTitle.reverse_token_dict.get
        self.title = " ".join(
            lookup(token, ProcessedTitle.UNKNOWN_TOKEN) for token in self.index_title)
        # Defined up front so the attributes exist before embeddings are computed.
        self.max_vector = None
        self.min_vector = None
        self.mean_vector = None

    def create_word2vec_embeddings(self, word2vec_model):
        """Run the model on this title's token ids and keep the three results.

        Args:
            word2vec_model: object whose predict(index_title) returns a
                (max_vector, min_vector, mean_vector) triple.
        """
        self.max_vector, self.min_vector, self.mean_vector = \
            word2vec_model.predict(self.index_title)
        

def create_title_dict_with_word2vec(content_dict, model):
    processed_titles = []
    cur_time = time.time()
    fixed_couner = 1000
    title_limit = 10000
    ProcessedTitle.reverse_token_dict = content_dict['reverse_token_dict']
    for i in xrange(title_limit):
    #for i in xrange(len(content_dict['titles'])):
        title = ProcessedTitle(index_title=content_dict['titles'][i], 
                               url=content_dict['url'][i], 
                               pageView=content_dict['pageView'][i])
        title.create_word2vec_embeddings(model)
        processed_titles.append(title)
        if i != 0 and i % fixed_couner == 0:
            print "processing {} titles using {:.2f} seconds".format(fixed_couner, time.time()-cur_time)
            cur_time = time.time()
    return processed_titles


In [3]:
titles_pickle_file = 'lemmanized_no_stop_words_scrambled_titles.pkl'

# Each mapping pairs the name used in this notebook with the key stored in the pickle.
# NOTE(review): 'pageViw' looks misspelled but is presumably the key as saved -- confirm before renaming.
expected_keys = dict(titles='titles',
                     url='url',
                     pageView='pageViw',
                     reverse_token_dict='reverse_token_dict')
# Same mapping, except the titles are read from the 'target_titles' entry.
lemmatized_expected_keys = dict(expected_keys, titles='target_titles')

content_dict = collect_multi_keys_from_pickle_file(titles_pickle_file, lemmatized_expected_keys)

In [4]:
# build_word2vec_model needs the model name; the restore log recorded for this
# cell points at the 'word2vec' model directory.
word2vec_model = build_word2vec_model('word2vec')

INFO:tensorflow:Restoring parameters from /Users/matt.meng/local_tensorflow_content/word2vec/models-1500
restore trained models from /Users/matt.meng/local_tensorflow_content/word2vec
restore model from step:  1500


In [5]:
# Wrap the loaded titles in ProcessedTitle objects and attach their embeddings.
# Slow: the recorded run took roughly 30 seconds per 1000 titles.
processed_titles = create_title_dict_with_word2vec(content_dict, word2vec_model)

processing 1000 titles using 29.10 seconds
processing 1000 titles using 30.48 seconds
processing 1000 titles using 27.58 seconds
processing 1000 titles using 27.48 seconds
processing 1000 titles using 29.54 seconds
processing 1000 titles using 31.50 seconds
processing 1000 titles using 36.84 seconds
processing 1000 titles using 32.05 seconds
processing 1000 titles using 29.59 seconds


In [6]:
# Rank the titles from highest to lowest pageView; processed_titles itself is left unsorted.
sorted_titles = sorted(processed_titles, key=lambda x: x.pageView, reverse=True)

In [12]:
# Position (0-based) in the page-view ranking of the title to inspect.
index = 10
# Show that title's readable text and its max_vector embedding.
print sorted_titles[index].title
print sorted_titles[index].max_vector

trump humiliate include john mccain
[ -2.58553941e-02   1.22627921e-01   1.15935944e-01  -5.43628726e-03
   1.08333386e-01   8.71216878e-02  -8.59051506e-05   1.53111294e-01
  -6.82163984e-02   8.53824541e-02  -2.91144568e-02   1.24870621e-01
   1.47591799e-01   1.33838326e-01  -7.23624676e-02  -6.17905147e-02
  -3.55449095e-02  -6.23437613e-02  -1.95119996e-02   1.13402098e-01
  -5.27736917e-02   1.35000676e-01   8.80233049e-02   9.40252393e-02
   1.20826170e-01   5.60505837e-02   1.38159335e-01   8.52120072e-02
   1.05636202e-01   1.23074263e-01  -2.69965511e-02   1.33987054e-01
   1.21170469e-01   1.54107377e-01   1.18402325e-01   6.94712549e-02
  -1.11002603e-03  -7.90434889e-03   1.48462862e-01   3.74956131e-02
   8.38991776e-02   7.41552189e-02  -7.84530938e-02   9.15036188e-04
   5.75002469e-02   1.00514486e-01   1.53303489e-01   1.29521772e-01
   8.82143527e-02  -5.18808514e-03   1.20590270e-01   9.73786041e-02
   1.26380906e-01   7.26668462e-02   8.70950967e-02   1.09721191e-0

In [None]:
# Readable text of the third title in the original (unsorted) order.
processed_titles[2].title

In [None]:
# Display the restored model object.
word2vec_model

In [None]:
# Load the processed-titles pickle directly to inspect its raw contents.
# Note: this rebinds titles_pickle_file, which an earlier cell set to the scrambled-titles file.
titles_pickle_file = 'lemmanized_no_stop_words_processed_titles.pkl'
pickle_file_path = os.path.join(os.path.expanduser("~"), titles_pickle_file)

# NOTE: pickle.load can execute arbitrary code; only use on trusted files.
with open(pickle_file_path, 'rb') as input_stream:
    data = pickle.load(input_stream)

# List the available entries, then pull out the ones of interest.
print data.keys()
titles = data['titles']
reverse_token_dict = data['reverse_token_dict']
title_urls = data['url']
# 'pageViw' is the key spelling used when reading this file.
title_pageViews = data['pageViw']

In [None]:
# Number of entries under 'titles' in the pickle loaded above.
print len(titles)

In [None]:
# Peek at the first five entries of `titles`.
titles[:5]

#### work on the scrambled data

In [None]:
# Load the scrambled-titles pickle in full (not just selected keys) for inspection.
titles_pickle_file = 'lemmanized_no_stop_words_scrambled_titles.pkl'
pickle_file_path = os.path.join(os.path.expanduser("~"), titles_pickle_file)

# NOTE: pickle.load can execute arbitrary code; only use on trusted files.
with open(pickle_file_path, 'rb') as input_stream:
    tmp_data = pickle.load(input_stream)


In [None]:
# Names under which the selected pickle entries are held in content_dict.
content_dict.keys()

In [None]:
# All keys present in the scrambled-titles pickle.
print tmp_data.keys()

In [None]:
# Number of entries under 'target_titles' in the scrambled-titles pickle.
print len(tmp_data['target_titles'])

In [None]:
# Peek at the first five 'target_titles' entries.
tmp_data['target_titles'][:5]