# Comparison of jieba, spaCy, and NLTK

1. Core Features & Focus
* jieba: Chinese text segmentation (exact/full/search modes), POS tagging, keyword extraction. Optimized for **Chinese text processing**.
* spaCy: Tokenization, POS tagging, dependency parsing, NER, text classification, lemmatization. Industrial-strength **multilingual NLP**.
* NLTK: Tokenization, POS tagging, parsing, stemming, corpus tools, classic NLP algorithms. **Education and research**.

2. Language Support
* jieba: **Chinese only**.
* spaCy: Chinese is supported but requires a dedicated Chinese model; 20+ other languages are available (English, German, French, etc.).
* NLTK: Limited Chinese support (needs external tools such as the Stanford Segmenter); supports many other languages via third-party resources.

### Summary
* jieba: Best for Chinese text segmentation with high speed and simplicity.
* spaCy: Ideal for production-ready multilingual NLP with deep learning integration.
* NLTK: Perfect for education and algorithm experimentation.

Choose based on your project's language, scale, and complexity! 🚀

In [4]:
import re
import nltk
import pickle
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


# nltk.download('punkt')
# nltk.download('stopwords')


def clean_text(text):
    """Normalize one raw review string into a list of stemmed tokens.

    Processing steps, in order:

    1. Replace HTML-like tags (``<...>``) with a space.
    2. Drop every character that is not an ASCII letter, an ASCII digit
       or whitespace.
    3. Lowercase the text.
    4. Emit the ``'<NUM>'`` marker for each run of digits.
    5. Tokenize the remaining text with ``word_tokenize``, remove English
       stop words, Porter-stem each word and discard stems of length 1.

    Digits have to survive step 2 for the number marker to be produced at
    all; stripping them first would make the replacement a no-op. The
    marker is appended directly to the output so that it is not
    lowercased, split by the tokenizer, or altered by the stemmer.

    Args:
        text: Raw input string (may contain HTML markup).

    Returns:
        A list of token strings; digit runs appear as ``'<NUM>'``.
    """
    num_token = '<NUM>'

    # 1. Strip HTML tags.
    text = re.sub(r'<[^>]+>', ' ', text)

    # 2. Keep only letters, digits and whitespace.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # 3. Lowercase.
    text = text.lower()

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    tokens = []
    # Splitting on a capturing group yields alternating pieces:
    # even positions are ordinary text, odd positions are the digit runs.
    for position, piece in enumerate(re.split(r'([0-9]+)', text)):
        if position % 2:
            # 4. A digit run becomes the number marker, emitted verbatim.
            tokens.append(num_token)
            continue

        # 5. Tokenize, drop stop words, stem, and filter single characters.
        for word in word_tokenize(piece):
            if word in stop_words:
                continue
            stem = stemmer.stem(word)
            if len(stem) > 1:
                tokens.append(stem)

    return tokens


def build_vocab(dataset_path, min_freq=5, vocab_path='vocab.txt'):
    """Build word/index lookup tables from a text file with one review per line.

    Every line of ``dataset_path`` is passed through ``clean_text`` and the
    resulting tokens are counted. Tokens seen at least ``min_freq`` times are
    kept, sorted, and placed after the special tokens ``'<PAD>'`` (index 0)
    and ``'<UNK>'`` (index 1).

    Args:
        dataset_path: Path of the UTF-8 input file, one review per line.
        min_freq: Minimum number of occurrences for a token to be kept.
        vocab_path: Where to write the human-readable vocabulary, one word
            per line in index order. Defaults to ``'vocab.txt'``; pass
            ``None`` to skip writing the file.

    Returns:
        A ``(word2idx, idx2word)`` tuple of dicts mapping each vocabulary
        word to its index and back.
    """
    # Count how often each cleaned token occurs in the dataset.
    word_freq = defaultdict(int)
    with open(dataset_path, 'r', encoding='utf-8') as f:
        for line in f:
            for token in clean_text(line):
                word_freq[token] += 1

    # Drop rare tokens; sorting keeps the index assignment deterministic.
    filtered_words = sorted(
        word for word, freq in word_freq.items() if freq >= min_freq
    )

    # Special tokens come first so they get the lowest indices.
    special_tokens = ['<PAD>', '<UNK>']
    vocab = special_tokens + filtered_words

    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = {idx: word for word, idx in word2idx.items()}

    # Also save a readable plain-text version of the vocabulary.
    if vocab_path is not None:
        with open(vocab_path, 'w', encoding='utf-8') as f:
            f.writelines(f"{word}\n" for word in vocab)

    return word2idx, idx2word


# Example usage: build the vocabulary from 'sample.positive.txt'.
word2idx, idx2word = build_vocab('sample.positive.txt')