In [80]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import random
import os
import string
import requests
import collections
import io
import tarfile
import urllib.request
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
sess = tf.Session()
import pickle

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [67]:
# define hyper paramaters

batch_size=50  #一个训练批次的大小
embedding_size=200 #将word 映射成1*200 的vector
vocabulary_size=1000 #词汇库大小
generations=50000    #train iteration
print_loss_every=500  #每500次打印一次loss
num_sampled=int(batch_size/2) #
window_size=2  #n-skipGram n=2
stops=stopwords.words("english")  #不具备有用信息的词汇
print_valid_every = 2000
valid_words = [ 'love', 'hate', 'silly', 'sad']

In [40]:
def load_movie_data():
  pos_data =list(open('./temp/rt-polarity.pos','r',encoding='Windows-1252').readlines())
  neg_data = list(open("./temp/rt-polarity.neg",'r',encoding='Windows-1252').readlines())
  texts = pos_data + neg_data
  target = [1]*len(pos_data) + [0]*len(neg_data)     
  return texts,target        
texts, target = load_movie_data()

'its scenes and sensibility are all more than familiar , but it exudes a kind of nostalgic spy-movie charm and , at the same time , is so fresh and free of the usual thriller nonsense that it all seems to be happening for the first time . \n'

In [42]:
def normalize_text(texts, stops):
 texts = [x.lower() for x in texts]
 texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]
 texts = [''.join(c for c in x if c not in '0123456789') for x in texts]
 texts = [' '.join([word for word in x.split() if word not in (stops)]) for x in texts]
 texts = [' '.join(x.split()) for x in texts]
 return(texts)
texts = normalize_text(texts, stops)

#去除所有长度小于2的sentence
target = [target[ix] for ix, x in enumerate(texts) if len(x.split()) > 2]
texts = [x for x in texts if len(x.split()) > 2]


In [58]:
def build_dictionary(sentences, vocabulary_size):
    split_sentences = [s.split() for s in sentences] 
    words = [x  for sublist in split_sentences for x in sublist]
    count = [['RARE', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size-1)) 
    #找到出现频率为top （vocabulary_size） 单词
    word_dict = {} #字典，单词出现的index
    for word, word_count in count:
      word_dict[word] = len(word_dict)
    return(word_dict)
word_dictionary = build_dictionary(texts, vocabulary_size)

In [59]:
#sentence=>[indexs of word]
def text_to_numbers(sentences, word_dict):
 # Initialize the returned data
 data = []
 for sentence in sentences:
   sentence_data = []
   # For each word, either use selected index or rare word index
   for word in sentence:
     if word in word_dict:
        word_ix = word_dict[word]
     else:
        word_ix = 0
     sentence_data.append(word_ix)
   data.append(sentence_data)
 return(data)
split_sentences = [s.split() for s in texts] 
text_data = text_to_numbers(split_sentences, word_dictionary)

In [60]:
#cliche 不在vocabulary 中
valid_examples = [word_dictionary[x]  if x  in word_dictionary.keys() else 0 for x in valid_words]

In [74]:
def generate_batch_data(sentences, batch_size, window_size):
 batch_data = []
 label_data = []
 while len(batch_data) < batch_size:
   rand_sentence = np.random.choice(sentences)

   window_sequences = [
         rand_sentence[max((ix-window_size),0):(ix+window_size+1)]
         for ix, x in enumerate(rand_sentence)
                      ] 
    
   label_indices = [ix if ix<window_size else window_size 
                  for ix,x in enumerate(window_sequences)]

   batch_and_labels = [
       (x[y], x[:y] + x[(y+1):]) 
       for x,y in zip(window_sequences, label_indices)]

   tuple_data = [(x, y_) for x,y in batch_and_labels for y_ in y]
   
   batch, labels = [list(x) for x in zip(*tuple_data)]
   #zip(*tuple_data) 
   batch_data.extend(batch[:batch_size])  #装入bacth中
   label_data.extend(labels[:batch_size]) 
   
 batch_data = batch_data[:batch_size] #再次整理
 label_data = label_data[:batch_size]
 
 batch_data = np.array(batch_data)
 label_data = np.transpose(np.array([label_data]))
 return batch_data,label_data

In [75]:
#define model

embeddings=tf.Variable(tf.random_uniform([vocabulary_size,embedding_size],-1.0,1))
x_inputs = tf.placeholder(tf.int32, shape=[batch_size])
y_target = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

embed = tf.nn.embedding_lookup(embeddings, x_inputs)



In [76]:
#nce loss
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / np.sqrt(embedding_size)),dtype=tf.float32)
nce_biases = tf.Variable(tf.zeros([vocabulary_size]),dtype=tf.float32)
loss=tf.reduce_mean(tf.nn.nce_loss(
    weights=nce_weights,
    biases=nce_biases,
    inputs=embed,
    labels=y_target,
    num_sampled=num_sampled,
    num_classes=vocabulary_size
))

In [77]:
#通过计算余弦相似度来验证embedding后的vector
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
normalized_embeddings = embeddings / norm #标准化embedding
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

In [78]:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)
init = tf.initialize_all_variables()
sess.run(init)

In [79]:
loss_vec = []
loss_x_vec = []
for i in range(generations):
 batch_inputs, batch_labels = generate_batch_data(text_data, batch_size, window_size)
 feed_dict = {x_inputs : batch_inputs, y_target : batch_labels}
 # Run the train step
 sess.run(optimizer, feed_dict=feed_dict)
 # Return the loss
 if (i+1) % print_loss_every == 0:
     loss_val = sess.run(loss, feed_dict=feed_dict)
     loss_vec.append(loss_val)
     loss_x_vec.append(i+1)
     print("Loss at step {} : {}".format(i+1, loss_val))



Loss at step 500 : 3.081111431121826
Loss at step 1000 : 4.658157825469971
Loss at step 1500 : 3.3214972019195557
Loss at step 2000 : 3.5877716541290283
Loss at step 2500 : 3.472334623336792
Loss at step 3000 : 4.921047687530518
Loss at step 3500 : 3.141298770904541
Loss at step 4000 : 4.358353614807129
Loss at step 4500 : 3.316549301147461
Loss at step 5000 : 4.208366870880127
Loss at step 5500 : 4.06997013092041
Loss at step 6000 : 2.798509120941162
Loss at step 6500 : 3.2893781661987305
Loss at step 7000 : 4.089358806610107
Loss at step 7500 : 5.819401741027832
Loss at step 8000 : 3.426023244857788
Loss at step 8500 : 3.009683132171631
Loss at step 9000 : 3.3406569957733154
Loss at step 9500 : 2.933220148086548
Loss at step 10000 : 3.3348610401153564
Loss at step 10500 : 3.4028470516204834
Loss at step 11000 : 3.164668083190918
Loss at step 11500 : 3.6683080196380615
Loss at step 12000 : 2.7962236404418945
Loss at step 12500 : 3.6083927154541016
Loss at step 13000 : 2.81741857528686

In [81]:
model_checkpoint_path = os.path.join("temp/movie_data/",'cbow_movie_embeddings.ckpt')
saver = tf.train.Saver({"embeddings": embeddings})
saver.restore(sess, model_checkpoint_path)
with open(os.path.join("temp/movie_data/",'movie_vocab.pkl'), 'wb') as f:
  pickle.dump(word_dictionary, f)
model_checkpoint_path = os.path.join(os.getcwd(),"temp/movie_data/",'cbow_movie_embeddings.ckpt')
save_path = saver.save(sess, model_checkpoint_path)



Instructions for updating:
Use standard file APIs to check for files with this prefix.


ValueError: The passed save_path is not a valid checkpoint: temp/movie_data/cbow_movie_embeddings.ckpt