In [1]:
from __future__ import unicode_literals
from __future__ import print_function

import gensim
import logging
import numpy as np

# Emit INFO-level log records (timestamp : level : message) so training and
# evaluation progress is visible in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
from gensim.models.word2vec import Text8Corpus, Word2Vec

In [3]:
def _tally_sections(sections):
    """Return ``(correct, total)`` answer counts summed over ``sections``."""
    correct = sum(len(section['correct']) for section in sections)
    incorrect = sum(len(section['incorrect']) for section in sections)
    return correct, correct + incorrect


def _percentage(correct, total):
    """Return ``correct / total`` as a percentage, or 0.0 when ``total`` is 0."""
    return 100 * float(correct) / total if total else 0.0


def print_accuracy(model, questions_file, num_bits=0, num_semantic_sections=5):
    """Print semantic and syntactic analogy accuracy of (optionally quantized) vectors.

    The model's word vectors are copied, quantized with
    ``model.wv.quantize_vectors(num_bits=num_bits)``, evaluated with
    ``model.accuracy(questions_file)``, and finally put back from the copy.
    The restore step runs in a ``finally`` clause, so a failing evaluation
    (e.g. a missing questions file) does not leave the model quantized.

    NOTE(review): ``init_sims(replace=True)`` is also applied to the restored
    vectors, exactly as before; it presumably L2-normalizes them in place, so
    the "restored" vectors may be unit-length rather than the raw trained
    ones -- confirm against the gensim version in use.

    Parameters
    ----------
    model
        Object exposing ``wv.vectors``, ``wv.quantize_vectors``,
        ``init_sims`` and ``accuracy``.
    questions_file
        Path of the analogy questions file passed to ``model.accuracy``.
    num_bits : int
        Value forwarded to ``quantize_vectors``.
    num_semantic_sections : int
        Number of leading result sections counted as "semantic"; the
        remaining ones (except the last, aggregate section) are counted as
        "syntactic".

    Returns
    -------
    None
        Results are printed to stdout.
    """
    print('Evaluating...\n')
    orig_vectors = np.copy(model.wv.vectors)
    try:
        model.wv.quantize_vectors(num_bits=num_bits)
        model.init_sims(replace=True)
        acc = model.accuracy(questions_file)
    finally:
        # Always undo the quantization, even if evaluation raised.
        model.wv.vectors = orig_vectors
        model.init_sims(replace=True)

    # The last entry is the aggregate over all sections; leave it out so
    # that answers are not counted twice.
    sections = acc[:-1]

    sem_correct, sem_total = _tally_sections(sections[:num_semantic_sections])
    sem_acc = _percentage(sem_correct, sem_total)
    print('\nSemantic: {:d}/{:d}, Accuracy: {:.2f}%'.format(sem_correct, sem_total, sem_acc))

    syn_correct, syn_total = _tally_sections(sections[num_semantic_sections:])
    syn_acc = _percentage(syn_correct, syn_total)
    print('Syntactic: {:d}/{:d}, Accuracy: {:.2f}%\n'.format(syn_correct, syn_total, syn_acc))


In [4]:
# Training corpus, read from the text8 file in the working directory.
text8_path = './text8'
corpus = Text8Corpus(text8_path)

# No quantization (baseline)

In [5]:
# Baseline model trained without quantization (num_bits=0).
# `size` and `iter` are kept identical to the quantized runs further down
# (size=128, iter=10) so that the accuracies are directly comparable; this
# also matches the recorded training log of this cell, which reports
# "128 dimensions".
model = Word2Vec(
    sentences=corpus,
    iter=10,
    sg=0,  # CBOW
    size=128,
    alpha=0.05,
    min_alpha=0.0001,
    window=10,
    min_count=5,
    negative=12,
    sample=1e-4,
    num_bits=0
)

2018-03-31 18:43:45,411 : INFO : collecting all words and their counts
2018-03-31 18:43:45,415 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-03-31 18:43:59,508 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences
2018-03-31 18:43:59,509 : INFO : Loading a fresh vocabulary
2018-03-31 18:44:00,081 : INFO : effective_min_count=5 retains 71290 unique words (28% of original 253854, drops 182564)
2018-03-31 18:44:00,082 : INFO : effective_min_count=5 leaves 16718844 word corpus (98% of original 17005207, drops 286363)
2018-03-31 18:44:00,412 : INFO : deleting the raw counts dictionary of 253854 items
2018-03-31 18:44:00,449 : INFO : sample=0.0001 downsamples 341 most-common words
2018-03-31 18:44:00,451 : INFO : downsampling leaves estimated 9386181 word corpus (56.1% of prior 16718844)
2018-03-31 18:44:00,857 : INFO : estimated required memory for 71290 words and 128 dimensions: 108645960 bytes
2018-03-31 18:44:00,858 

In [6]:
# Baseline model, vectors evaluated as trained (num_bits=0).
print_accuracy(
    model=model,
    questions_file='./datasets/questions-words.txt',
    num_bits=0,
)

2018-03-31 18:49:18,774 : INFO : precomputing L2-norms of word weight vectors


Evaluating...



2018-03-31 18:49:21,489 : INFO : capital-common-countries: 75.7% (383/506)
2018-03-31 18:49:26,994 : INFO : capital-world: 53.4% (775/1452)
2018-03-31 18:49:28,010 : INFO : currency: 20.5% (55/268)
2018-03-31 18:49:34,064 : INFO : city-in-state: 39.3% (618/1571)
2018-03-31 18:49:35,233 : INFO : family: 71.6% (219/306)
2018-03-31 18:49:38,119 : INFO : gram1-adjective-to-adverb: 17.1% (129/756)
2018-03-31 18:49:39,265 : INFO : gram2-opposite: 19.6% (60/306)
2018-03-31 18:49:44,015 : INFO : gram3-comparative: 58.0% (731/1260)
2018-03-31 18:49:45,939 : INFO : gram4-superlative: 36.8% (186/506)
2018-03-31 18:49:49,673 : INFO : gram5-present-participle: 32.1% (318/992)
2018-03-31 18:49:54,834 : INFO : gram6-nationality-adjective: 84.2% (1154/1371)
2018-03-31 18:49:59,854 : INFO : gram7-past-tense: 33.4% (445/1332)
2018-03-31 18:50:03,576 : INFO : gram8-plural: 59.4% (589/992)
2018-03-31 18:50:06,076 : INFO : gram9-plural-verbs: 34.6% (225/650)
2018-03-31 18:50:06,078 : INFO : total: 48.0% (5


Semantic: 2050/4103, Accuracy: 49.96%
Syntactic: 3837/8165, Accuracy: 46.99%



### Quantize original vectors to 1 bit and measure accuracy (should drop)

In [7]:
# Baseline model, vectors quantized with num_bits=1 before evaluation.
print_accuracy(
    model=model,
    questions_file='./datasets/questions-words.txt',
    num_bits=1,
)

2018-03-31 18:50:06,972 : INFO : precomputing L2-norms of word weight vectors


Evaluating...



2018-03-31 18:50:09,306 : INFO : capital-common-countries: 26.3% (133/506)
2018-03-31 18:50:14,036 : INFO : capital-world: 14.3% (207/1452)
2018-03-31 18:50:14,893 : INFO : currency: 0.4% (1/268)
2018-03-31 18:50:19,829 : INFO : city-in-state: 8.1% (128/1571)
2018-03-31 18:50:20,784 : INFO : family: 21.9% (67/306)
2018-03-31 18:50:23,172 : INFO : gram1-adjective-to-adverb: 4.5% (34/756)
2018-03-31 18:50:24,140 : INFO : gram2-opposite: 5.6% (17/306)
2018-03-31 18:50:28,122 : INFO : gram3-comparative: 23.3% (294/1260)
2018-03-31 18:50:29,723 : INFO : gram4-superlative: 6.9% (35/506)
2018-03-31 18:50:32,819 : INFO : gram5-present-participle: 10.3% (102/992)
2018-03-31 18:50:37,117 : INFO : gram6-nationality-adjective: 33.5% (459/1371)
2018-03-31 18:50:41,312 : INFO : gram7-past-tense: 9.1% (121/1332)
2018-03-31 18:50:44,426 : INFO : gram8-plural: 12.0% (119/992)
2018-03-31 18:50:46,452 : INFO : gram9-plural-verbs: 8.8% (57/650)
2018-03-31 18:50:46,453 : INFO : total: 14.5% (1774/12268)
20


Semantic: 536/4103, Accuracy: 13.06%
Syntactic: 1238/8165, Accuracy: 15.16%



### Quantize original vectors to 2 bits and measure accuracy (should drop)

In [8]:
# Baseline model, vectors quantized with num_bits=2 before evaluation.
print_accuracy(
    model=model,
    questions_file='./datasets/questions-words.txt',
    num_bits=2,
)

Evaluating...



2018-03-31 18:50:47,407 : INFO : precomputing L2-norms of word weight vectors
2018-03-31 18:50:49,718 : INFO : capital-common-countries: 26.9% (136/506)
2018-03-31 18:50:54,208 : INFO : capital-world: 14.4% (209/1452)
2018-03-31 18:50:55,045 : INFO : currency: 0.4% (1/268)
2018-03-31 18:50:59,966 : INFO : city-in-state: 8.2% (129/1571)
2018-03-31 18:51:00,910 : INFO : family: 23.2% (71/306)
2018-03-31 18:51:03,237 : INFO : gram1-adjective-to-adverb: 4.9% (37/756)
2018-03-31 18:51:04,194 : INFO : gram2-opposite: 5.6% (17/306)
2018-03-31 18:51:08,115 : INFO : gram3-comparative: 22.7% (286/1260)
2018-03-31 18:51:09,674 : INFO : gram4-superlative: 6.5% (33/506)
2018-03-31 18:51:12,738 : INFO : gram5-present-participle: 10.4% (103/992)
2018-03-31 18:51:16,969 : INFO : gram6-nationality-adjective: 33.4% (458/1371)
2018-03-31 18:51:21,083 : INFO : gram7-past-tense: 8.9% (118/1332)
2018-03-31 18:51:24,129 : INFO : gram8-plural: 12.2% (121/992)
2018-03-31 18:51:26,157 : INFO : gram9-plural-verb


Semantic: 546/4103, Accuracy: 13.31%
Syntactic: 1228/8165, Accuracy: 15.04%



# Quantize with 1 bit

In [9]:
# Train a CBOW model with num_bits=1 (the 1-bit run of this notebook).
model = Word2Vec(
    sentences=corpus,
    # model shape and vocabulary
    sg=0,  # CBOW
    size=128,
    window=10,
    min_count=5,
    # training schedule and sampling
    iter=10,
    alpha=0.05,
    min_alpha=0.0001,
    negative=12,
    sample=1e-4,
    # quantization setting under test
    num_bits=1,
)

2018-03-31 18:51:26,861 : INFO : collecting all words and their counts
2018-03-31 18:51:26,867 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-03-31 18:51:38,752 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences
2018-03-31 18:51:38,753 : INFO : Loading a fresh vocabulary
2018-03-31 18:51:39,370 : INFO : effective_min_count=5 retains 71290 unique words (28% of original 253854, drops 182564)
2018-03-31 18:51:39,371 : INFO : effective_min_count=5 leaves 16718844 word corpus (98% of original 17005207, drops 286363)
2018-03-31 18:51:39,642 : INFO : deleting the raw counts dictionary of 253854 items
2018-03-31 18:51:39,681 : INFO : sample=0.0001 downsamples 341 most-common words
2018-03-31 18:51:39,682 : INFO : downsampling leaves estimated 9386181 word corpus (56.1% of prior 16718844)
2018-03-31 18:51:40,028 : INFO : estimated required memory for 71290 words and 128 dimensions: 108645960 bytes
2018-03-31 18:51:40,031 

#### Trained with 1-bit quantized loss; accuracy of non-quantized vectors

In [10]:
# 1-bit-trained model, vectors evaluated without quantization (num_bits=0).
print_accuracy(
    model=model,
    questions_file='./datasets/questions-words.txt',
    num_bits=0,
)

2018-03-31 18:56:34,814 : INFO : precomputing L2-norms of word weight vectors


Evaluating...



2018-03-31 18:56:37,794 : INFO : capital-common-countries: 49.0% (248/506)
2018-03-31 18:56:44,756 : INFO : capital-world: 36.8% (535/1452)
2018-03-31 18:56:46,912 : INFO : currency: 12.3% (33/268)
2018-03-31 18:56:54,735 : INFO : city-in-state: 17.3% (272/1571)
2018-03-31 18:56:56,577 : INFO : family: 40.2% (123/306)
2018-03-31 18:57:00,330 : INFO : gram1-adjective-to-adverb: 13.0% (98/756)
2018-03-31 18:57:01,758 : INFO : gram2-opposite: 8.8% (27/306)
2018-03-31 18:57:06,842 : INFO : gram3-comparative: 32.5% (410/1260)
2018-03-31 18:57:09,008 : INFO : gram4-superlative: 18.0% (91/506)
2018-03-31 18:57:12,959 : INFO : gram5-present-participle: 24.2% (240/992)
2018-03-31 18:57:18,423 : INFO : gram6-nationality-adjective: 57.0% (781/1371)
2018-03-31 18:57:24,417 : INFO : gram7-past-tense: 19.4% (258/1332)
2018-03-31 18:57:29,219 : INFO : gram8-plural: 43.0% (427/992)
2018-03-31 18:57:32,819 : INFO : gram9-plural-verbs: 16.2% (105/650)
2018-03-31 18:57:32,821 : INFO : total: 29.7% (3648/


Semantic: 1211/4103, Accuracy: 29.51%
Syntactic: 2437/8165, Accuracy: 29.85%



#### Trained with 1-bit quantized loss; accuracy of 1-bit quantized vectors (should be ~ the same)

In [11]:
# 1-bit-trained model, vectors quantized with num_bits=1 before evaluation.
print_accuracy(
    model=model,
    questions_file='./datasets/questions-words.txt',
    num_bits=1,
)

2018-03-31 18:57:33,933 : INFO : precomputing L2-norms of word weight vectors


Evaluating...



2018-03-31 18:57:36,642 : INFO : capital-common-countries: 31.0% (157/506)
2018-03-31 18:57:41,268 : INFO : capital-world: 22.4% (325/1452)
2018-03-31 18:57:42,162 : INFO : currency: 6.0% (16/268)
2018-03-31 18:57:47,319 : INFO : city-in-state: 15.3% (240/1571)
2018-03-31 18:57:48,275 : INFO : family: 16.0% (49/306)
2018-03-31 18:57:50,755 : INFO : gram1-adjective-to-adverb: 5.7% (43/756)
2018-03-31 18:57:51,767 : INFO : gram2-opposite: 6.2% (19/306)
2018-03-31 18:57:56,446 : INFO : gram3-comparative: 14.8% (186/1260)
2018-03-31 18:57:58,028 : INFO : gram4-superlative: 3.8% (19/506)
2018-03-31 18:58:01,183 : INFO : gram5-present-participle: 10.4% (103/992)
2018-03-31 18:58:05,425 : INFO : gram6-nationality-adjective: 26.6% (365/1371)
2018-03-31 18:58:10,199 : INFO : gram7-past-tense: 6.2% (83/1332)
2018-03-31 18:58:14,084 : INFO : gram8-plural: 19.5% (193/992)
2018-03-31 18:58:16,208 : INFO : gram9-plural-verbs: 11.2% (73/650)
2018-03-31 18:58:16,213 : INFO : total: 15.3% (1871/12268)



Semantic: 787/4103, Accuracy: 19.18%
Syntactic: 1084/8165, Accuracy: 13.28%



# Quantize with 2 bits

In [12]:
# Train a CBOW model with num_bits=2 (the 2-bit run of this notebook).
model = Word2Vec(
    sentences=corpus,
    # model shape and vocabulary
    sg=0,  # CBOW
    size=128,
    window=10,
    min_count=5,
    # training schedule and sampling
    iter=10,
    alpha=0.05,
    min_alpha=0.0001,
    negative=12,
    sample=1e-4,
    # quantization setting under test
    num_bits=2,
)

2018-03-31 18:58:17,093 : INFO : collecting all words and their counts
2018-03-31 18:58:17,097 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-03-31 18:58:35,025 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences
2018-03-31 18:58:35,029 : INFO : Loading a fresh vocabulary
2018-03-31 18:58:36,019 : INFO : effective_min_count=5 retains 71290 unique words (28% of original 253854, drops 182564)
2018-03-31 18:58:36,020 : INFO : effective_min_count=5 leaves 16718844 word corpus (98% of original 17005207, drops 286363)
2018-03-31 18:58:36,346 : INFO : deleting the raw counts dictionary of 253854 items
2018-03-31 18:58:36,424 : INFO : sample=0.0001 downsamples 341 most-common words
2018-03-31 18:58:36,428 : INFO : downsampling leaves estimated 9386181 word corpus (56.1% of prior 16718844)
2018-03-31 18:58:36,923 : INFO : estimated required memory for 71290 words and 128 dimensions: 108645960 bytes
2018-03-31 18:58:36,925 

#### Trained with 2-bit quantized loss; accuracy of non-quantized vectors

In [13]:
# 2-bit-trained model, vectors evaluated without quantization (num_bits=0).
print_accuracy(
    model=model,
    questions_file='./datasets/questions-words.txt',
    num_bits=0,
)

2018-03-31 19:05:47,216 : INFO : precomputing L2-norms of word weight vectors


Evaluating...



2018-03-31 19:05:51,222 : INFO : capital-common-countries: 56.1% (284/506)
2018-03-31 19:05:58,197 : INFO : capital-world: 53.0% (770/1452)
2018-03-31 19:06:00,030 : INFO : currency: 19.4% (52/268)
2018-03-31 19:06:07,086 : INFO : city-in-state: 33.7% (530/1571)
2018-03-31 19:06:08,353 : INFO : family: 60.5% (185/306)
2018-03-31 19:06:11,768 : INFO : gram1-adjective-to-adverb: 11.6% (88/756)
2018-03-31 19:06:13,166 : INFO : gram2-opposite: 10.1% (31/306)
2018-03-31 19:06:19,133 : INFO : gram3-comparative: 41.5% (523/1260)
2018-03-31 19:06:21,651 : INFO : gram4-superlative: 29.1% (147/506)
2018-03-31 19:06:26,332 : INFO : gram5-present-participle: 23.1% (229/992)
2018-03-31 19:06:34,839 : INFO : gram6-nationality-adjective: 67.3% (923/1371)
2018-03-31 19:06:40,370 : INFO : gram7-past-tense: 27.2% (362/1332)
2018-03-31 19:06:46,014 : INFO : gram8-plural: 51.6% (512/992)
2018-03-31 19:06:49,909 : INFO : gram9-plural-verbs: 24.9% (162/650)
2018-03-31 19:06:49,911 : INFO : total: 39.1% (479


Semantic: 1821/4103, Accuracy: 44.38%
Syntactic: 2977/8165, Accuracy: 36.46%



#### Trained with 2-bit quantized loss; accuracy of 2-bit quantized vectors

In [14]:
# 2-bit-trained model, vectors quantized with num_bits=2 before evaluation.
print_accuracy(
    model=model,
    questions_file='./datasets/questions-words.txt',
    num_bits=2,
)

Evaluating...



2018-03-31 19:06:51,118 : INFO : precomputing L2-norms of word weight vectors
2018-03-31 19:06:55,350 : INFO : capital-common-countries: 24.9% (126/506)
2018-03-31 19:07:00,970 : INFO : capital-world: 18.9% (275/1452)
2018-03-31 19:07:02,711 : INFO : currency: 5.6% (15/268)
2018-03-31 19:07:09,411 : INFO : city-in-state: 16.1% (253/1571)
2018-03-31 19:07:10,489 : INFO : family: 33.3% (102/306)
2018-03-31 19:07:13,694 : INFO : gram1-adjective-to-adverb: 6.9% (52/756)
2018-03-31 19:07:15,159 : INFO : gram2-opposite: 5.2% (16/306)
2018-03-31 19:07:19,199 : INFO : gram3-comparative: 27.1% (341/1260)
2018-03-31 19:07:20,804 : INFO : gram4-superlative: 16.4% (83/506)
2018-03-31 19:07:24,911 : INFO : gram5-present-participle: 13.6% (135/992)
2018-03-31 19:07:30,012 : INFO : gram6-nationality-adjective: 47.3% (649/1371)
2018-03-31 19:07:35,133 : INFO : gram7-past-tense: 14.8% (197/1332)
2018-03-31 19:07:38,612 : INFO : gram8-plural: 24.8% (246/992)
2018-03-31 19:07:40,649 : INFO : gram9-plural


Semantic: 771/4103, Accuracy: 18.79%
Syntactic: 1757/8165, Accuracy: 21.52%

