<a href="https://colab.research.google.com/github/pjhool/Deep-Learning-with-Keras/blob/master/Word_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from keras.preprocessing.text import *
from keras.preprocessing.sequence import skipgrams

text = "I love green eggs and ham ."

tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

word2id = tokenizer.word_index
id2word = {v:k for k, v in word2id.items()}

wids = [word2id[w] for w in text_to_word_sequence(text)]
pairs, labels = skipgrams(wids, len(word2id))
print(len(pairs), len(labels))
for i in range(10):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
        id2word[pairs[i][0]], pairs[i][0], 
        id2word[pairs[i][1]], pairs[i][1],
        labels[i]))

Using TensorFlow backend.


56 56
(love (2), green (3)) -> 1
(i (1), i (1)) -> 0
(love (2), eggs (4)) -> 0
(green (3), green (3)) -> 0
(i (1), eggs (4)) -> 1
(green (3), green (3)) -> 0
(and (5), and (5)) -> 0
(ham (6), love (2)) -> 1
(eggs (4), and (5)) -> 0
(and (5), ham (6)) -> 1


In [0]:

# visualize Word Embedding 

# https://github.com/dennybritz/deeplearning-papernotes/blob/master/notes/document-embedding-with-pv.md 


In [4]:
# download coprus 
! mkdir ../data 
!wget http://mattmahoney.net/dc/text8.zip   
!unzip text8.zip 
!cp text8 ../data

--2019-03-11 10:54:45--  http://mattmahoney.net/dc/text8.zip
Resolving mattmahoney.net (mattmahoney.net)... 67.195.197.75
Connecting to mattmahoney.net (mattmahoney.net)|67.195.197.75|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31344016 (30M) [application/zip]
Saving to: ‘text8.zip.1’


2019-03-11 10:54:59 (2.16 MB/s) - ‘text8.zip.1’ saved [31344016/31344016]

Archive:  text8.zip
  inflating: text8                   


In [7]:
!ls -al ../data

total 97668
drwxr-xr-x 2 root root      4096 Mar 11 10:55 .
drwxr-xr-x 1 root root      4096 Mar 11 10:54 ..
-rw-r--r-- 1 root root 100000000 Mar 11 10:55 text8


In [6]:
from gensim.models import word2vec
import os
import logging

class Text8Sentences(object):
    def __init__(self, fname, maxlen):
        self.fname = fname
        self.maxlen = maxlen
        
    def __iter__(self):
        with open(os.path.join(DATA_DIR, "text8"), "rb") as ftext:
            text = ftext.read().split(" ")
            words = []
            for word in text:
                if len(words) >= self.maxlen:
                    yield words
                    words = []
                words.append(word)
            yield words

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

DATA_DIR = "../data/"
sentences = Text8Sentences(os.path.join(DATA_DIR, "text8"), 50)
model = word2vec.Word2Vec(sentences, size=300, min_count=30)

print("""model.most_similar("woman")""")
print(model.most_similar("woman"))
#[('child', 0.7057571411132812),
# ('girl', 0.702182412147522),
# ('man', 0.6846336126327515),
# ('herself', 0.6292711496353149),
# ('lady', 0.6229539513587952),
# ('person', 0.6190367937088013),
# ('lover', 0.6062309741973877),
# ('baby', 0.5993420481681824),
# ('mother', 0.5954475402832031),
# ('daughter', 0.5871444940567017)]
 
print("""model.most_similar(positive=["woman", "king"], negative=["man"], topn=10)""")
print(model.most_similar(positive=['woman', 'king'], 
                         negative=['man'], 
                         topn=10))
#[('queen', 0.6237582564353943),
# ('prince', 0.5638638734817505),
# ('elizabeth', 0.5557916164398193),
# ('princess', 0.5456407070159912),
# ('throne', 0.5439794063568115),
# ('daughter', 0.5364126563072205),
# ('empress', 0.5354889631271362),
# ('isabella', 0.5233952403068542),
# ('regent', 0.520746111869812),
# ('matilda', 0.5167444944381714)]                         
                         
print("""model.similarity("girl", "woman")""")
print(model.similarity("girl", "woman"))
print("""model.similarity("girl", "man")""")
print(model.similarity("girl", "man"))
print("""model.similarity("girl", "car")""")
print(model.similarity("girl", "car"))
print("""model.similarity("bus", "car")""")
print(model.similarity("bus", "car"))
#model.similarity("girl", "woman")
#0.702182479574
#model.similarity("girl", "man")
#0.574259909834
#model.similarity("girl", "car")
#0.289332921793
#model.similarity("bus", "car")
#0.483853497748

2019-03-11 10:55:26,986 : INFO : collecting all words and their counts


TypeError: ignored

In [0]:
# dataset  Kaggle UMICH SI650 - Sentiment Classification 
#   https://www.kaggle.com/c/si650winter11 

In [0]:



from keras.layers.core import Dense, Dropout, SpatialDropout1D
from keras.layers.convolutional import Conv1D
from keras.layers.embeddings import Embedding
from keras.layers.pooling import GlobalMaxPooling1D
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import collections
import matplotlib.pyplot as plt
import nltk
import numpy as np

np.random.seed(42)

INPUT_FILE = "../data/umich-sentiment-train.txt"
VOCAB_SIZE = 5000
EMBED_SIZE = 100
NUM_FILTERS = 256
NUM_WORDS = 3
BATCH_SIZE = 64
NUM_EPOCHS = 20

counter = collections.Counter()
fin = open(INPUT_FILE, "rb")
maxlen = 0
for line in fin:
    _, sent = line.strip().split("\t")
    words = [x.lower() for x in nltk.word_tokenize(sent)]
    if len(words) > maxlen:
        maxlen = len(words)
    for word in words:
        counter[word] += 1
fin.close()

word2index = collections.defaultdict(int)
for wid, word in enumerate(counter.most_common(VOCAB_SIZE)):
    word2index[word[0]] = wid + 1
vocab_sz = len(word2index) + 1
index2word = {v:k for k, v in word2index.items()}
    
xs, ys = [], []
fin = open(INPUT_FILE, "rb")
for line in fin:
    label, sent = line.strip().split("\t")
    ys.append(int(label))
    words = [x.lower() for x in nltk.word_tokenize(sent)]
    wids = [word2index[word] for word in words]
    xs.append(wids)
fin.close()
X = pad_sequences(xs, maxlen=maxlen)
Y = np_utils.to_categorical(ys)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, 
                                                random_state=42)
print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)

model = Sequential()
model.add(Embedding(vocab_sz, EMBED_SIZE, input_length=maxlen))
model.add(SpatialDropout1D(Dropout(0.2)))
model.add(Conv1D(filters=NUM_FILTERS, kernel_size=NUM_WORDS, activation="relu"))
model.add(GlobalMaxPooling1D())
model.add(Dense(2, activation="softmax"))

model.compile(optimizer="adam", loss="categorical_crossentropy",
              metrics=["accuracy"])
history = model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS,
                    validation_data=(Xtest, Ytest))              

# plot loss function
plt.subplot(211)
plt.title("accuracy")
plt.plot(history.history["acc"], color="r", label="train")
plt.plot(history.history["val_acc"], color="b", label="validation")
plt.legend(loc="best")

plt.subplot(212)
plt.title("loss")
plt.plot(history.history["loss"], color="r", label="train")
plt.plot(history.history["val_loss"], color="b", label="validation")
plt.legend(loc="best")

plt.tight_layout()
plt.show()

# evaluate model
score = model.evaluate(Xtest, Ytest, verbose=1)
print("Test score: {:.3f}, accuracy: {:.3f}".format(score[0], score[1]))