In [2]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from util import get_wikipedia_data
from nlp_class2.util import find_analogies
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.manifold import TSNE

# Find and Assess Word Vectors using TF-IDF and t-SNE

In [33]:
def find_analogies_TFIDF_tSNE():
    analogies_to_try = (
        ('king', 'man', 'woman'),
        ('france', 'paris', 'london'),
        ('france', 'paris', 'rome'),
        ('paris', 'france', 'italy'),
    )

    sentences, word2idx = get_wikipedia_data(
        n_files=3, n_vocab=2000, by_paragraph=True)

    notfound = False
    for word_list in analogies_to_try:
        for word in word_list:
            if word not in word2idx:
                notfound = True
    if notfound:
        exit()

    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    print("V:", V, "N:", N)

    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print("finished getting raw counts")

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A.T).T
    A = A.toarray()

    idx2word = {v: k for k, v in word2idx.items()}

    tsne = TSNE()
    Z = tsne.fit_transform(A)

    tsne2 = TSNE(n_components=3)
    We = tsne.fit_transform(A)

    plt.scatter(Z[:, 0], Z[:, 1])
    for i in range(V):
        try:
            plt.annotate(s=idx2word[i].encode(
                'utf-8').decode('utf-8'), xy=(Z[i, 0], Z[i, 1]))
        except:
            pass
    plt.draw()


    for word_list in analogies_to_try:
        w1, w2, w3 = word_list
        find_analogies(w1, w2, w3, We, word2idx, idx2word)
    plt.show()

# Pre-Trained Word Vectors from GloVe

In [3]:
from sklearn.metrics.pairwise import pairwise_distances

In [5]:
print('Loading word vectors...')
word2vec = {}
embedding = []
idx2word = []
with open('./large_files/glove.6B/glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.array(values[1:], dtype='float32')
        word2vec[word] = vec
        embedding.append(vec)
        idx2word.append(word)
print('Found %s word vectors.' % len(word2vec))
embedding = np.array(embedding)
V, D = embedding.shape

Loading word vectors...
Found 400000 word vectors.


In [6]:
V, D, len(idx2word)

(400000, 50, 400000)

In [9]:
def find_analogies_GloVe(w1,w2,w3):
    for w in (w1, w2, w3):
        if w not in word2vec:
            print("{} not in word2vec".format(w))

    king = word2vec[w1]
    man = word2vec[w2]
    woman = word2vec[w3]
    v0 = king-man+woman
    distances = pairwise_distances(v0.reshape(1,D), embedding, metric='cosine')
    distances = distances.reshape(V)
    idxs = distances.argsort()[:4]
    for idx in idxs:
        word = idx2word[idx]
        if  word not in (w1,w2,w3):
            best_word = word
            break
    print(w1, "-", w2, "=", best_word, "-", w3)

In [10]:
find_analogies_GloVe('king', 'man', 'woman')
find_analogies_GloVe('france', 'paris', 'london')
find_analogies_GloVe('france', 'paris', 'rome')
find_analogies_GloVe('paris', 'france', 'italy')
find_analogies_GloVe('france', 'french', 'english')
find_analogies_GloVe('japan', 'japanese', 'chinese')
find_analogies_GloVe('japan', 'japanese', 'italian')
find_analogies_GloVe('japan', 'japanese', 'australian')
find_analogies_GloVe('december', 'november', 'june')
find_analogies_GloVe('miami', 'florida', 'texas')
find_analogies_GloVe('einstein', 'scientist', 'painter')
find_analogies_GloVe('china', 'rice', 'bread')
find_analogies_GloVe('man', 'woman', 'she')
find_analogies_GloVe('man', 'woman', 'aunt')
find_analogies_GloVe('man', 'woman', 'sister')
find_analogies_GloVe('man', 'woman', 'wife')
find_analogies_GloVe('man', 'woman', 'actress')
find_analogies_GloVe('man', 'woman', 'mother')
find_analogies_GloVe('heir', 'heiress', 'princess')
find_analogies_GloVe('nephew', 'niece', 'aunt')
find_analogies_GloVe('france', 'paris', 'tokyo')
find_analogies_GloVe('france', 'paris', 'beijing')
find_analogies_GloVe('february', 'january', 'november')
find_analogies_GloVe('france', 'paris', 'rome')
find_analogies_GloVe('paris', 'france', 'italy')


king - man = queen - woman
france - paris = britain - london
france - paris = italy - rome
paris - france = rome - italy
france - french = england - english
japan - japanese = china - chinese
japan - japanese = italy - italian
japan - japanese = australia - australian
december - november = july - june
miami - florida = houston - texas
einstein - scientist = matisse - painter
china - rice = chinese - bread
man - woman = he - she
man - woman = uncle - aunt
man - woman = brother - sister
man - woman = friend - wife
man - woman = actor - actress
man - woman = father - mother
heir - heiress = queen - princess
nephew - niece = uncle - aunt
france - paris = japan - tokyo
france - paris = china - beijing
february - january = october - november
france - paris = italy - rome
paris - france = rome - italy


In [17]:

def nearest_neighbors_GloVe(w, n=5):
    print(w)
    if w not in word2vec:
        print("{} not in word2vec".format(w))
    v=word2vec[w]
    distance = pairwise_distances(v.reshape(1,D),embedding,metric='cosine').reshape(V)
    idxs = distance.argsort()[1:n+1]
    for idx in idxs:
         print("\t%s" % idx2word[idx])
    print()

In [18]:
nearest_neighbors_GloVe('king')
nearest_neighbors_GloVe('france')
nearest_neighbors_GloVe('japan')
nearest_neighbors_GloVe('einstein')
nearest_neighbors_GloVe('woman')
nearest_neighbors_GloVe('nephew')
nearest_neighbors_GloVe('february')
nearest_neighbors_GloVe('rome')

king
	prince
	queen
	ii
	emperor
	son

france
	french
	belgium
	paris
	spain
	netherlands

japan
	japanese
	china
	korea
	tokyo
	taiwan

einstein
	relativity
	bohr
	physics
	heisenberg
	freud

woman
	girl
	man
	mother
	her
	boy

nephew
	cousin
	brother
	grandson
	son
	uncle

february
	october
	december
	january
	august
	september

rome
	naples
	venice
	italy
	turin
	pope



# Pre-Trained Word Vectors from Word2vec

In [19]:
from gensim.models import KeyedVectors

In [21]:
word_vectors=KeyedVectors.load_word2vec_format('./large_files/GoogleNews-vectors-negative300.bin',binary=True)

In [22]:
def find_analogies_word2vec(w1,w2,w3):
    r = word_vectors.most_similar(positive=[w1,w3],negative=[w2])
    print("%s - %s = %s - %s" % (w1, w2, r[0][0], w3))

In [23]:
find_analogies_word2vec('king', 'man', 'woman')
find_analogies_word2vec('france', 'paris', 'london')
find_analogies_word2vec('france', 'paris', 'rome')
find_analogies_word2vec('paris', 'france', 'italy')
find_analogies_word2vec('france', 'french', 'english')
find_analogies_word2vec('japan', 'japanese', 'chinese')
find_analogies_word2vec('japan', 'japanese', 'italian')
find_analogies_word2vec('japan', 'japanese', 'australian')
find_analogies_word2vec('december', 'november', 'june')
find_analogies_word2vec('miami', 'florida', 'texas')
find_analogies_word2vec('einstein', 'scientist', 'painter')
find_analogies_word2vec('china', 'rice', 'bread')
find_analogies_word2vec('man', 'woman', 'she')
find_analogies_word2vec('man', 'woman', 'aunt')
find_analogies_word2vec('man', 'woman', 'sister')
find_analogies_word2vec('man', 'woman', 'wife')
find_analogies_word2vec('man', 'woman', 'actress')
find_analogies_word2vec('man', 'woman', 'mother')
find_analogies_word2vec('heir', 'heiress', 'princess')
find_analogies_word2vec('nephew', 'niece', 'aunt')
find_analogies_word2vec('france', 'paris', 'tokyo')
find_analogies_word2vec('france', 'paris', 'beijing')
find_analogies_word2vec('february', 'january', 'november')
find_analogies_word2vec('france', 'paris', 'rome')
find_analogies_word2vec('paris', 'france', 'italy')

king - man = queen - woman
france - paris = england - london
france - paris = italy - rome
paris - france = lohan - italy
france - french = england - english
japan - japanese = tibet - chinese
japan - japanese = italy - italian
japan - japanese = queensland - australian
december - november = september - june
miami - florida = dallas - texas
einstein - scientist = jude - painter
china - rice = dinnerware - bread
man - woman = he - she
man - woman = uncle - aunt
man - woman = brother - sister
man - woman = son - wife
man - woman = actor - actress
man - woman = father - mother
heir - heiress = prince - princess
nephew - niece = uncle - aunt
france - paris = japan - tokyo
france - paris = chinese - beijing
february - january = april - november
france - paris = italy - rome
paris - france = lohan - italy


In [32]:
def nearest_neighbors_word2vec(w,n=5):
    print(w)
    r = word_vectors.most_similar(positive=[w])
    count=0
    for word,score in r:
        if count<=5:
            print("\t%s" % word)
            count+=1
        else:
            break

In [33]:
nearest_neighbors_word2vec('king')
nearest_neighbors_word2vec('france')
nearest_neighbors_word2vec('japan')
nearest_neighbors_word2vec('einstein')
nearest_neighbors_word2vec('woman')
nearest_neighbors_word2vec('nephew')
nearest_neighbors_word2vec('february')
nearest_neighbors_word2vec('rome')

king
	kings
	queen
	monarch
	crown_prince
	prince
	sultan
france
	spain
	french
	germany
	europe
	italy
	england
japan
	japanese
	tokyo
	america
	europe
	germany
	chinese
einstein
	nikki
	lmfao
	albert
	armstrong
	joan
	becky
woman
	man
	girl
	teenage_girl
	teenager
	lady
	teenaged_girl
nephew
	son
	uncle
	brother
	grandson
	cousin
	father
february
	january
	april
	september
	december
	july
	october
rome
	athens
	albert
	holmes
	donnie
	italy
	toni


# Text Classification

In [41]:
import pandas as pd
train=pd.read_csv('./large_files/r8-train-all-terms.txt',sep='\t',header=None)
test=pd.read_csv('./large_files/r8-test-all-terms.txt',sep='\t',header=None)
train.columns=['label','content']
test.columns=['label','content']

In [42]:
train.head()

Unnamed: 0,label,content
0,earn,champion products ch approves stock split cham...
1,acq,computer terminal systems cpml completes sale ...
2,earn,cobanco inc cbco year net shr cts vs dlrs net ...
3,earn,am international inc am nd qtr jan oper shr lo...
4,earn,brown forman inc bfd th qtr net shr one dlr vs...


In [53]:
class GloveVectorizer(object):
    def __init__(self):
        print('Loading word vectors...')
        self.word2vec = {}
        self.embedding = []
        self.idx2word = []
        with open('./large_files/glove.6B/glove.6B.50d.txt', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                vec = np.array(values[1:], dtype='float32')
                self.word2vec[word] = vec
                self.embedding.append(vec)
                self.idx2word.append(word)
        print('Found %s word vectors.' % len(self.word2vec))
        self.embedding = np.array(self.embedding)
        self.V, self.D = self.embedding.shape
        self.word2idx={v:k for k,v in enumerate(idx2word)}
        
    def fit(self,data):
        pass
    
    def transform(self,data):
        X=np.zeros((len(data),self.D))
        n=0
        emptycount=0
        for sentence in data:
            tokens=sentence.lower().split()
            vecs=[]
            for word in tokens:
                if word in self.word2vec:
                    vec=self.word2vec[word]
                    vecs.append(vec)
            if len(vecs)>0:
                vecs=np.array(vecs)
                X[n]=vecs.mean(axis=0)
            else:
                emptycount+=1
            n+=1
        print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
        return X
        
    def fit_transform(self,data):
        self.fit(data)
        return self.transform(data)

In [54]:
class Word2vecVectorizer(object):
    def __init__(object):
        print("Loading in word vectors...")
        self.word_vectors=KeyedVectors.load_word2vec_format('./large_files/GoogleNews-vectors-negative300.bin',binary=True)
        print("Finished loading in word vectors")
        
    def fit(self,data):
        pass
    
    def transform(self,data):
        v=self.word_vectors.get_vector('king')
        self.D=v.shape[0]
        X=np.zeros((len(data),self.D))
        n=0
        emptycount=0
        for sentence in data:
            tokens=sentence.lowe().split()
            vecs=[]
            for word in tokens:
                if word in self.word_vectors:
                    vec = self.word_vectors.get_vector(word)
                    vecs.append(vec)
            if len(vecs)>0:
                X[n]=np.array(vecs).mean(axis=0)
            else:
                emptycount+=1
            n+=1
            print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
        return X
        
    def fit_transform(self,data):
        self.fit(data)
        return self.transform(data)

In [55]:
vectorizer_glove=GloveVectorizer()
X_train = vectorizer_glove.fit_transform(train.content)
y_train = train.label
X_test = vectorizer_glove.fit_transform(test.content)
y_test = test.label

Loading word vectors...
Found 400000 word vectors.
Numer of samples with no words found: 0 / 5485
Numer of samples with no words found: 0 / 2189


In [85]:
vectorizer_w2v=GloveVectorizer()
X_train2 = vectorizer_w2v.fit_transform(train.content)
y_train2 = train.label
X_test2 = vectorizer_w2v.fit_transform(test.content)
y_test2 = test.label

Loading word vectors...
Found 400000 word vectors.
Numer of samples with no words found: 0 / 5485
Numer of samples with no words found: 0 / 2189


In [81]:
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier

In [83]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(X_train,y_train)
print("train score: ",rf.score(X_train,y_train))
print("test score: ",rf.score(X_test,y_test))

train score:  0.9992707383773929
test score:  0.9328460484239379


In [86]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(X_train2,y_train2)
print("train score: ",rf.score(X_train2,y_train2))
print("test score: ",rf.score(X_test2,y_test2))

train score:  0.9992707383773929
test score:  0.9328460484239379


In [84]:
et = ExtraTreesClassifier(n_estimators=200)
et.fit(X_train,y_train)
print("train score: ",et.score(X_train,y_train))
print("test score: ",et.score(X_test,y_test))

train score:  0.9992707383773929
test score:  0.9378711740520785


In [87]:
et = ExtraTreesClassifier(n_estimators=200)
et.fit(X_train2,y_train2)
print("train score: ",et.score(X_train2,y_train2))
print("test score: ",et.score(X_test2,y_test2))

train score:  0.9992707383773929
test score:  0.9337597076290544


# 