<a href="https://colab.research.google.com/github/placibo/udemy-natural-language-processing-with-deep-learning-in-python/blob/master/Section_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from __future__ import print_function,division
from future.utils import iteritems
from builtins import range

In [0]:
from gensim.models import KeyedVectors

In [0]:
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

In [0]:
# Load the downloaded GoogleNews archive as word2vec-format binary vectors.
word_vectors = KeyedVectors.load_word2vec_format(
    fname='./GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [0]:
def find_analogies(w1,w2,w3):
  """Print the word completing the analogy ``w1 - w2 = ? - w3``.

  Queries the module-level ``word_vectors`` with ``w1`` and ``w3`` as positive
  terms and ``w2`` as the negative term, then reports the first entry of the
  returned list.
  """
  candidates = word_vectors.most_similar(negative=[w2], positive=[w1, w3])
  best_match = candidates[0][0]
  print("%s - %s = %s - %s" % (w1, w2, best_match, w3))

In [0]:
def nearest_neighbors(w):
  """Print every word that ``word_vectors.most_similar`` returns for ``w``.

  A header line naming ``w`` is printed first, followed by one tab-indented
  line per returned (word, score) pair; the score itself is not shown.
  """
  similar = word_vectors.most_similar(positive=[w])
  print("neighbour of: %s" %w)
  for neighbour, _score in similar:
    print("\t%s" % neighbour)

In [0]:
# Demo: which word completes "king - man = ? - woman"?
find_analogies('king', 'man', 'woman')

In [0]:
# Demo: list the words most similar to "Harvard".
nearest_neighbors(w='Harvard')

In [0]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

In [5]:
!wget https://www.cs.umb.edu/~smimarog/textmining/datasets/r8-train-all-terms.txt

--2019-08-28 15:52:08--  https://www.cs.umb.edu/~smimarog/textmining/datasets/r8-train-all-terms.txt
Resolving www.cs.umb.edu (www.cs.umb.edu)... 158.121.106.224
Connecting to www.cs.umb.edu (www.cs.umb.edu)|158.121.106.224|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3354424 (3.2M) [text/plain]
Saving to: ‘r8-train-all-terms.txt’


2019-08-28 15:52:10 (4.09 MB/s) - ‘r8-train-all-terms.txt’ saved [3354424/3354424]



In [6]:
!wget https://www.cs.umb.edu/~smimarog/textmining/datasets/r8-test-all-terms.txt

--2019-08-28 15:52:13--  https://www.cs.umb.edu/~smimarog/textmining/datasets/r8-test-all-terms.txt
Resolving www.cs.umb.edu (www.cs.umb.edu)... 158.121.106.224
Connecting to www.cs.umb.edu (www.cs.umb.edu)|158.121.106.224|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1195261 (1.1M) [text/plain]
Saving to: ‘r8-test-all-terms.txt’


2019-08-28 15:52:14 (1.94 MB/s) - ‘r8-test-all-terms.txt’ saved [1195261/1195261]



In [0]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

In [8]:
!unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [0]:
# Both splits are read as tab-separated files without a header row, then given
# explicit column names.
train = pd.read_csv('r8-train-all-terms.txt', sep='\t', header=None)
train.columns = ['label', 'content']

test = pd.read_csv('./r8-test-all-terms.txt', sep='\t', header=None)
test.columns = ['label', 'content']

In [0]:
class GloveVectorizer:
  """Sentence vectorizer that averages pre-trained GloVe word embeddings.

  The embedding file is parsed once in the constructor; ``transform`` then maps
  each sentence to the mean of the vectors of its known tokens.

  Attributes:
    word2vec: dict mapping each word to its float32 vector.
    embedding: array stacking every parsed vector in file order, shape (V, D).
    word2idx: dict mapping each word to its position in the file.
    V: number of vectors loaded.
    D: dimensionality of each vector.
  """
  def __init__(self, path='glove.6B.50d.txt'):
    """Parse a GloVe-format text file (``word v1 v2 ... vD`` per line).

    Args:
      path: location of the embedding file. Defaults to 'glove.6B.50d.txt'
        in the working directory.

    Raises:
      ValueError: if the file contains no word vectors.
    """
    # io.open accepts `encoding` on both Python 2 and 3; the notebook imports
    # __future__/builtins compatibility shims, so stay compatible with both.
    import io

    print('Loading word vectors')
    word2vec = {}
    embedding = []
    idx2word = []
    # Read explicitly as UTF-8 so parsing does not depend on the platform's
    # default encoding.
    with io.open(path, encoding='utf-8') as f:
      for line in f:
        values = line.split()
        if not values:
          # Blank line (e.g. a trailing newline): nothing to parse.
          continue
        word = values[0]
        vec = np.asarray(values[1:],dtype='float32')
        word2vec[word] = vec
        embedding.append(vec)
        idx2word.append(word)
    print('Found %s word vectors.' % len(word2vec))

    if not embedding:
      # Fail with a clear message instead of an opaque shape-unpacking error.
      raise ValueError('No word vectors found in %s' % path)

    self.word2vec = word2vec
    self.embedding = np.array(embedding)
    self.word2idx = {v:k for k,v in enumerate(idx2word)}
    self.V, self.D = self.embedding.shape

  def fit(self,data):
    """No-op: the embeddings are pre-trained, so there is nothing to learn."""
    pass

  def transform(self,data):
    """Return one averaged embedding per sentence.

    Each sentence is lower-cased and split on whitespace; tokens missing from
    the vocabulary are ignored. A sentence with no known token keeps an
    all-zero row and is counted in the printed summary.

    Args:
      data: sized iterable of strings.

    Returns:
      Array of shape (len(data), D).
    """
    X = np.zeros((len(data),self.D))
    emptycount = 0
    for n, sentence in enumerate(data):
      vecs = [
        self.word2vec[word]
        for word in sentence.lower().split()
        if word in self.word2vec
      ]
      if vecs:
        X[n] = np.array(vecs).mean(axis=0)
      else:
        emptycount += 1
    print("Number of samples with no words found: %s / %s" % (emptycount,len(data)))
    return X

  def fit_transform(self,data):
    """Call ``fit`` then ``transform`` on the same data."""
    self.fit(data)
    return self.transform(data)

In [0]:
class Word2VecVectorizer:
  """Sentence vectorizer that averages pre-trained word2vec embeddings.

  Attributes:
    word_vectors: object providing ``get_vector(word)`` and ``vector_size``
      (a gensim ``KeyedVectors`` when loaded by the constructor).
    D: dimensionality of each word vector.
  """
  def __init__(self, word_vectors=None,
               path='./GoogleNews-vectors-negative300.bin.gz'):
    """Load the word vectors, or reuse ones that are already in memory.

    Args:
      word_vectors: optional pre-loaded vectors. Passing them skips the load
        from disk, which would otherwise be repeated for every instance.
      path: word2vec binary file to load when ``word_vectors`` is None.
    """
    if word_vectors is None:
      print('Loading in word vectors...')
      word_vectors = KeyedVectors.load_word2vec_format(path, binary=True)
      print("Finished loading in word vectors")
    self.word_vectors = word_vectors
    self.D = self.word_vectors.vector_size

  def fit(self,data):
    """No-op: the embeddings are pre-trained, so there is nothing to learn."""
    pass

  def transform(self,data):
    """Return one averaged embedding per sentence.

    Each sentence is split on whitespace without lower-casing (presumably
    because the GoogleNews vocabulary is case-sensitive -- confirm). Tokens
    whose lookup raises KeyError are ignored. A sentence with no known token
    keeps an all-zero row and is counted in the printed summary.

    Args:
      data: sized iterable of strings.

    Returns:
      Array of shape (len(data), D).
    """
    # Read the dimensionality from the model itself rather than probing a
    # specific word, which fails for any vocabulary lacking that word.
    self.D = self.word_vectors.vector_size

    X = np.zeros((len(data),self.D))
    emptycount = 0
    for n, sentence in enumerate(data):
      vecs = []
      for word in sentence.split():
        try:
          vecs.append(self.word_vectors.get_vector(word))
        except KeyError:
          # Out-of-vocabulary token: deliberately skipped.
          pass
      if vecs:
        X[n] = np.array(vecs).mean(axis=0)
      else:
        emptycount += 1
    print("Number of samples with no word found %s / %s" % (emptycount,len(data)))
    return X

  def fit_transform(self,data):
    """Call ``fit`` then ``transform`` on the same data."""
    self.fit(data)
    return self.transform(data)

In [0]:
# Word2vec features for the training split: one averaged vector per document.
vectorizer = Word2VecVectorizer()
Ytrain = train['label']
Xtrain = vectorizer.fit_transform(train['content'])

In [0]:
# The test split is only transformed, never fitted on, keeping the usual
# fit-on-train / transform-on-test separation.
Xtest = vectorizer.transform(test.content)
Ytest = test.label

In [14]:
# A fixed random_state makes the forest, and therefore the printed scores,
# reproducible across kernel restarts.
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(Xtrain,Ytrain)
print("train score: ",model.score(Xtrain,Ytrain))
print("test score: ",model.score(Xtest,Ytest))

train score:  0.9992707383773929
test score:  0.9392416628597533


In [15]:
# GloVe features for the training split: one averaged vector per document.
vectorizer = GloveVectorizer()
Ytrain = train['label']
Xtrain = vectorizer.fit_transform(train['content'])

Loading word vectors
Found 400000 word vectors.
Number of samples with no words found: 0 / 5485


In [16]:
# The test split is only transformed, never fitted on, keeping the usual
# fit-on-train / transform-on-test separation.
Xtest = vectorizer.transform(test.content)
Ytest = test.label

Number of samples with no words found: 0 / 2189


In [17]:
# A fixed random_state makes the forest, and therefore the printed scores,
# reproducible across kernel restarts.
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(Xtrain,Ytrain)
print("train score: ",model.score(Xtrain,Ytrain))
print("test score: ",model.score(Xtest,Ytest))

train score:  0.9992707383773929
test score:  0.9346733668341709
