Add new model for question answer matching #35

Merged
merged 12 commits on May 10, 2019
13 changes: 12 additions & 1 deletion ModelConf.py
@@ -13,7 +13,7 @@

from losses.BaseLossConf import BaseLossConf
#import traceback
from settings import LanguageTypes, ProblemTypes, TaggingSchemes, SupportedMetrics, PredictionTypes, DefaultPredictionFields
from utils.common_utils import log_set, prepare_dir
from utils.exceptions import ConfigurationError
import numpy as np
@@ -58,6 +58,7 @@ def load_from_file(self, conf_path):
raise ConfigurationError("%s is not a valid JSON file, please check your JSON format!" % conf_path)

self.tool_version = self.get_item(['tool_version'])
self.language = self.get_item(['language'], default='english').lower()
self.problem_type = self.get_item(['inputs', 'dataset_type']).lower()
if ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging:
self.tagging_scheme = self.get_item(['inputs', 'tagging_scheme'], default=None, use_default=True)
@@ -321,6 +322,12 @@ def load_from_file(self, conf_path):
self.unicode_fix = 'unicode_fix' in self.__text_preprocessing
self.remove_stopwords = 'remove_stopwords' in self.__text_preprocessing

# tokenizer: choose a language-appropriate default
if self.language == 'chinese':
self.tokenizer = self.get_item(['training_params', 'tokenizer'], default='jieba')
else:
self.tokenizer = self.get_item(['training_params', 'tokenizer'], default='nltk')

if self.extra_feature:
if self.DBC2SBC:
logging.warning("Detect the extra feature %s, set the DBC2sbc is False." % ''.join(list(feature_all-formal_feature)))
@@ -439,6 +446,10 @@ def check_conf(self):
assert self.predict_data_path is not None, "Please define predict_data_path"
assert os.path.isfile(self.predict_data_path), "Prediction data %s does not exist!" % self.predict_data_path

# check language types
SUPPORTED_LANGUAGES = set(LanguageTypes._member_names_)
assert self.language in SUPPORTED_LANGUAGES, "Language type %s is not supported now. Supported types: %s" % (self.language, ",".join(SUPPORTED_LANGUAGES))

# check problem types
SUPPORTED_PROBLEMS = set(ProblemTypes._member_names_)
assert self.problem_type in SUPPORTED_PROBLEMS, "Problem type %s is not supported now. Supported types: %s" % (self.problem_type, ",".join(SUPPORTED_PROBLEMS))
23 changes: 18 additions & 5 deletions README.md
@@ -1,7 +1,13 @@
# ***NeuronBlocks*** - Building Your NLP DNN Models Like Playing Lego

[![language](https://img.shields.io/badge/language-en%20%7C%20中文-brightgreen.svg)](#language-supported)
[![python](https://img.shields.io/badge/python-3.6%20%7C%203.7-blue.svg)](https://www.python.org)
[![pytorch](https://img.shields.io/badge/pytorch-0.4%20%7C%201.x-orange.svg)](https://pytorch.org)
[![license](https://img.shields.io/badge/license-MIT-green.svg)](https://opensource.org/licenses/MIT)

[Simplified Chinese](README_zh_CN.md)


# Table of Contents
* [Overview](#Overview)
* [Get Started in 60 Seconds](#Get-Started-in-60-Seconds)
@@ -23,6 +29,10 @@ NeuronBlocks consists of two major components: ***Block Zoo*** and ***Model Zoo*

<img src="https://i.imgur.com/LMD0PFQ.png" width="300">

### <span id="language-supported">Languages Supported</span>
- English
- Chinese

### NLP Tasks Supported
- Sentence Classification
- Sentiment Analysis
@@ -53,11 +63,15 @@ Users can either pick existing models (config files) in *Model Zoo* to start mod
```bash
pip install -r requirements.txt
```

3. Install PyTorch (*NeuronBlocks supports **PyTorch 0.4.1** and above*).

For **Linux**, run the following command:
```bash
pip install "torch>=0.4.1"
```

For **Windows**, we suggest installing PyTorch via *Conda* by following the [PyTorch instructions](https://pytorch.org/get-started/locally/).


## <span id="quick-start">Quick Start</span>
@@ -110,7 +124,6 @@ NeuronBlocks operates in an open model. It is designed and developed by **STCA N
Anyone who is familiar with these areas is highly encouraged to contribute code.
* Knowledge Distillation for Model Compression: distilling knowledge from heavy models such as BERT and the OpenAI Transformer. Teacher-student knowledge distillation is a common method for model compression.
* Multi-Lingual Support
* NER Model Support
* Multi-Task Training Support

17 changes: 12 additions & 5 deletions README_zh_CN.md
@@ -1,5 +1,10 @@
# ***NeuronBlocks*** - Building Your NLP DNN Models Like Playing Lego

[![language](https://img.shields.io/badge/language-en%20%7C%20中文-brightgreen.svg)](#language-supported)
[![python](https://img.shields.io/badge/python-3.6%20%7C%203.7-blue.svg)](https://www.python.org)
[![pytorch](https://img.shields.io/badge/pytorch-0.4%20%7C%201.x-orange.svg)](https://pytorch.org)
[![license](https://img.shields.io/badge/license-MIT-green.svg)](https://opensource.org/licenses/MIT)

[English version](README.md)

# Table of Contents
@@ -25,6 +30,9 @@ NeuronBlocks consists of two major components: ***Block Zoo*** and ***Model Zoo***

<img src="https://i.imgur.com/LMD0PFQ.png" width="300">

### <span id="language-supported">支持的语言</span>
- English
- 中文

### NLP Tasks Supported
- Sentence Classification
@@ -46,7 +54,7 @@ NeuronBlocks consists of two major components: ***Block Zoo*** and ***Model Zoo***
# Getting Started
## Installation

*Note: NeuronBlocks supports **Python 3.6** and above*

1. Clone this project:
```bash
@@ -58,13 +66,13 @@ NeuronBlocks consists of two major components: ***Block Zoo*** and ***Model Zoo***
pip install -r requirements.txt
```

3. Install PyTorch (*NeuronBlocks supports **PyTorch 0.4.1** and above*):

For **Linux**, run the following command:
```bash
pip install "torch>=0.4.1"
```
For **Windows**, we suggest installing PyTorch via *Conda* by following the [official PyTorch installation guide](https://pytorch.org/get-started/locally/).


## Quick Start
@@ -115,7 +123,6 @@ NeuronBlocks operates in an open model. It is designed and developed by **Microsoft STCA NLP Group**
## Ongoing Work
* Model compression: knowledge distillation for heavy models such as BERT and the OpenAI Transformer. Teacher-student knowledge distillation is a common method for model compression.
* Multi-lingual support
* NER model support
* Multi-task training support

40 changes: 37 additions & 3 deletions Tutorial.md
@@ -6,6 +6,7 @@
* [Quick Start](#quick-start)
* [How to Design Your NLP Model](#design-model)
* [Define the Model Configuration File](#define-conf)
* [Chinese Support](#chinese-support)
* [Visualize Your Model](#visualize)
* [Model Zoo for NLP Tasks](#model-zoo)
* [Task 1: Text Classification](#task-1)
@@ -18,6 +19,7 @@
2. [Compression for Text Matching Model](#task-6.2)
3. [Compression for Slot Filling Model](#task-6.3)
4. [Compression for MRC Model](#task-6.4)
* [Task 7: Chinese Sentiment Analysis](#task-7)
* [Advanced Usage](#advanced-usage)
* [Extra Feature Support](#extra-feature)
* [Learning Rate Decay](#lr-decay)
@@ -80,6 +82,8 @@ Take *[PROJECTROOT/model_zoo/demo/conf.json](./model_zoo/demo/conf.json)* as an
The sample data lies in *[PROJECTROOT/dataset/demo/](./dataset/demo/)*.

The architecture of the configuration file is:

- **language**. [optional, default: English] Define the language type here first; English and Chinese are supported now (a minimal config sketch follows at the end of this section).
- **inputs**. This part defines the input configuration.
- ***use_cache***. If *use_cache* is true, the toolkit builds a cache on the first run so that subsequent training runs start faster.
- ***dataset_type***. Declare the task type here. Currently, we support classification, regression and so on.
@@ -145,6 +149,7 @@ The architecture of the configuration file is:
- ***batch_num_to_show_results***. [necessary for training] During the training process, show the results every batch_num_to_show_results batches.
- ***max_epoch***. [necessary for training] The maximum number of epochs to train.
- ***valid_times_per_epoch***. [optional for training, default: 1] Define how many times to conduct validation per epoch. Usually we validate once after each epoch, but for a very large corpus it is better to validate multiple times per epoch so as not to miss the best state of the model. The default value is 1.
- ***tokenizer***. [optional] Define the tokenizer here. Currently we support 'nltk' and 'jieba'; by default, 'nltk' is used for English and 'jieba' for Chinese.
- **architecture**. Define the model architecture. The node is a list of layers (blocks) in block_zoo to represent a model. The supported layers of this toolkit are given in [block_zoo overview](https://microsoft.github.io/NeuronBlocks).

- ***Embedding layer***. The first layer of this example (as shown below) defines the embedding layer, which is composed of one type of embedding, "word" (word embedding), whose dimension is 300. If you specify pre-trained embeddings in *inputs/data_paths/pre_trained_emb*, you need to keep this dimension consistent with the dimension of the pre-trained embeddings.
@@ -195,8 +200,14 @@ The architecture of the configuration file is:
*Tips: The [optional] and [necessary] marks indicate whether the corresponding node in the configuration file is optional or necessary for training/test/prediction. If there is no mark, the node is always necessary. In practice, it is most convenient to prepare one configuration file that contains all the settings for training, test and prediction.*
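
To make the structure above concrete, here is a minimal sketch of a configuration skeleton assembled from the nodes described in this section. It is an illustration rather than a runnable config: the values are placeholders, and the exact shape of the *architecture* entry is inferred from the Embedding description above, so consult *[model_zoo/demo/conf.json](./model_zoo/demo/conf.json)* for an authoritative, complete example.

```json
{
  "tool_version": "1.1.0",
  "language": "English",
  "inputs": {
    "use_cache": true,
    "dataset_type": "classification",
    "data_paths": {
      "pre_trained_emb": "dataset/GloVe/glove.840B.300d.txt"
    }
  },
  "training_params": {
    "tokenizer": "nltk",
    "batch_num_to_show_results": 10,
    "max_epoch": 3,
    "valid_times_per_epoch": 1
  },
  "architecture": [
    {
      "layer": "Embedding",
      "conf": {
        "word": {
          "cols": ["sentence_text"],
          "dim": 300
        }
      }
    }
  ]
}
```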


### <span id="chinese-support">Chinese Support</span>

When using Chinese data, set *language* in the JSON config to 'Chinese'. By default, Chinese text is tokenized with the jieba tokenizer. For an example, see [Task 7: Chinese Sentiment Analysis](#task-7).

In addition, pre-trained Chinese word vectors are supported. First download word vectors from [Chinese Word Vectors](https://github.com/Embedding/Chinese-Word-Vectors#pre-trained-chinese-word-vectors) and decompress them (e.g. with *bunzip2*), then place them in a directory (e.g. *dataset/chinese_word_vectors/*). Finally, remember to define *inputs/data_paths/pre_trained_emb* in the JSON config.
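
Putting these two points together, a minimal sketch of the Chinese-specific nodes might look like the following. The *pre_trained_emb* path is a hypothetical placeholder (point it at wherever you saved the downloaded vectors), and *tokenizer* can be omitted entirely, since 'jieba' is already the default when *language* is 'Chinese':

```json
{
  "language": "Chinese",
  "inputs": {
    "data_paths": {
      "pre_trained_emb": "dataset/chinese_word_vectors/sgns.wiki.word"
    }
  },
  "training_params": {
    "tokenizer": "jieba"
  }
}
```

As noted in the Embedding layer description above, keep the word embedding *dim* in *architecture* consistent with the dimension of the downloaded vectors.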

## <span id="visualize">Visualize Your Model</span>

### <span id="visualize">Visualize Your Model</span>

A model visualizer is provided for visualization and configuration correctness checking, please refer to [Model Visualizer README](./model_visualizer/README.md).

@@ -266,12 +277,12 @@ Question answer matching is a crucial subtask of the question answering problem,
2. Train question answer matching model.
```bash
cd PROJECT_ROOT
python train.py --conf_path=model_zoo/nlp_tasks/question_answer_matching/conf_question_answer_matching_bilstm_match_attention.json
```
3. Test your model.
```bash
cd PROJECT_ROOT
python test.py --conf_path=model_zoo/nlp_tasks/question_answer_matching/conf_question_answer_matching_bilstm_match_attention.json
```

*Tips: you can try different models by running different JSON config files.*
@@ -285,6 +296,7 @@ Question answer matching is a crucial subtask of the question answering problem,
CNN (NeuronBlocks) | 0.747
BiLSTM (NeuronBlocks) | 0.767
BiLSTM+Attn (NeuronBlocks) | 0.754
BiLSTM+Match Attention (NeuronBlocks) | 0.785

*Tips: the model file and the train log file can be found in the JSON config file's outputs/save_base_dir after training finishes.*

@@ -501,6 +513,28 @@ This task is to train a query-passage regression model to learn from a heavy tea
#### <span id="task-6.3">6.3: Compression for Slot Filling Model (ongoing)</span>
#### <span id="task-6.4">6.4: Compression for MRC (ongoing)</span>

### <span id="task-7">Task 7: Chinese Sentiment Analysis</span>

Here is an example of a sentiment analysis task on Chinese data.

- ***Dataset***

*PROJECT_ROOT/dataset/chinese_sentiment_analysis* contains sample data for Chinese sentiment analysis.

- ***Usage***

1. Train Chinese sentiment analysis model.
```bash
cd PROJECT_ROOT
python train.py --conf_path=model_zoo/nlp_tasks/chinese_sentiment_analysis/conf_chinese_sentiment_analysis_bilstm.json
```
2. Test your model.
```bash
cd PROJECT_ROOT
python test.py --conf_path=model_zoo/nlp_tasks/chinese_sentiment_analysis/conf_chinese_sentiment_analysis_bilstm.json
```
*Tips: you can try different models by running different JSON config files. The model file and the train log file can be found in the JSON config file's outputs/save_base_dir after training finishes.*


## <span id="advanced-usage">Advanced Usage</span>
