piskvorky · menshikh-iv · May 25, 2017 · May 13, 2017 · May 19, 2017 · May 19, 2017
diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py
@@ -140,6 +140,7 @@ class FastText(Word2Vec):
 
     def initialize_word_vectors(self):
         self.wv = FastTextKeyedVectors()
+        self.new_format = False  # set to True by load_model_params when the .bin header starts with the magic number
 
     @classmethod
     def train(cls, ft_path, corpus_file, output_file=None, model='cbow', size=100, alpha=0.025, window=5, min_count=5,
@@ -256,7 +257,14 @@ def load_binary_data(self, model_binary_file):
             self.load_vectors(f)
 
     def load_model_params(self, file_handle):
-        (dim, ws, epoch, minCount, neg, _, loss, model, bucket, minn, maxn, _, t) = self.struct_unpack(file_handle, '@12i1d')
+        magic, v= self.struct_unpack(file_handle, '@2i')
+        if magic == 793712314:  # newer format 
+            self.new_format = True
+            dim, ws, epoch, minCount, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(file_handle, '@12i1d')
+        else:  # older format
+            dim = magic
+            ws = v
+            epoch, minCount, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(file_handle, '@10i1d')
         # Parameters stored by [Args::save](https://github.com/facebookresearch/fastText/blob/master/src/args.cc)
         self.size = dim
         self.window = ws
@@ -270,26 +278,34 @@ def load_model_params(self, file_handle):
         self.wv.max_n = maxn
         self.sample = t
 
-    def load_dict(self, file_handle):
-        (vocab_size, nwords, _) = self.struct_unpack(file_handle, '@3i')
+    def load_dict(self, file_handle, encoding='utf8'):  # `encoding`: codec used to decode the vocab words
+        vocab_size, nwords, _ = self.struct_unpack(file_handle, '@3i')
         # Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
         assert len(self.wv.vocab) == nwords, 'mismatch between vocab sizes'
-        assert len(self.wv.vocab) == vocab_size, 'mismatch between vocab sizes'
-        ntokens, = self.struct_unpack(file_handle, '@q')
+        if len(self.wv.vocab) != vocab_size:  # tolerated (seen with wiki.fr): warn instead of failing
+            logger.warning("If you are loading any model other than pretrained vector wiki.fr, ")
+            logger.warning("Please report to gensim or fastText.")
+        ntokens, = self.struct_unpack(file_handle, '@1q')
+        if self.new_format:
+            pruneidx_size, = self.struct_unpack(file_handle, '@q')  # read to advance past the field; unused here
         for i in range(nwords):
             word_bytes = b''
             char_byte = file_handle.read(1)
             # Read vocab word
             while char_byte != b'\x00':
                 word_bytes += char_byte
                 char_byte = file_handle.read(1)
-            word = word_bytes.decode('utf8')
-            count, _ = self.struct_unpack(file_handle, '@ib')
-            _ = self.struct_unpack(file_handle, '@i')
-            assert self.wv.vocab[word].index == i, 'mismatch between gensim word index and fastText word index'
-            self.wv.vocab[word].count = count
+            word = word_bytes.decode(encoding)
+            count, _ = self.struct_unpack(file_handle, '@qb')  # second field is read and discarded
+            if word in self.wv.vocab:
+                # Words present in the bin file but absent from the vec file are skipped;
+                # this handles the vocab_size mismatch between vec and bin files (ref: wiki.fr).
+                assert self.wv.vocab[word].index == i, 'mismatch between gensim word index and fastText word index'
+                self.wv.vocab[word].count = count
 
     def load_vectors(self, file_handle):
+        if self.new_format:
+            _ = self.struct_unpack(file_handle,'@?')
         num_vectors, dim = self.struct_unpack(file_handle, '@2q')
         # Vectors stored by [Matrix::save](https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc)
         assert self.size == dim, 'mismatch between model sizes'

diff --git a/gensim/test/test_data/lee_fasttext_new.bin b/gensim/test/test_data/lee_fasttext_new.bin