In [73]:
import jieba
import re
import jieba.posseg as pseg

# Demonstration of inference in HMM. 
`./data/initial_vector.txt` contains initial probability  $\pi$   
`./data/emit_probability.txt` contains the transition probabilities $A$  
`./data/emit_probability.txt` contains the emission probability $p(y_{t+1}|y_t)$  
This is a program for recognizing chinese organization, the calculation of above probability is based on `People's daily`


In [114]:
import jieba
class OrgRecognize:
    def __init__(self):
        self.hidden_states = ["A", "B", "C", "D","F","G","I","J","K","L","M","P","S","W","X","Z"]
        self.initial_vector = self.load_initial_vector()
        self.transision_matrix = self.load_transition_matrix(hidden_states=self.hidden_states)
        self.emission_matrix = self.load_emission_matrix(hidden_states=self.hidden_states)

    def load_patterns(self):
        """
        organization pattern
        :return: list all patterns
        """
        result = []
        with open("./data/nt.pattern.txt", "r") as file:
            datas = file.readlines()
            for line in datas:
                result.append(line.strip())
        return result

    def load_transition_matrix(self,hidden_states):
        """
        Load transition matrix
        :return: dict：key first state，value dict--key is next state，value is the corresponding probability
        """
        result = {x: {} for x in hidden_states}
        with open("./data/transition_probability.txt","r") as file:
            datas = file.readlines()
            for line in datas:
                split_line = line.strip().split(",")
                result[split_line[0]][split_line[1]] =  split_line[2]
        return result
    
    def load_initial_vector(self):
        """
        Load initial probabilities
        :return: dict：key is the hidden state，value is the probability
        """
        result = {}
        with open("./data/initial_vector.txt","r") as file:
            datas = file.readlines()
            for line in datas:
                split_line = line.strip().split(",")
                result[split_line[0]] =  split_line[2]
        return result

    def load_emission_matrix(self,hidden_states):
        """
        Load emission matrix
        :param hidden_states: list of hidden state
        :return: dict：key is the hidden state，value is a dict key is observable var，value is the probability
        """
        result = {x:{} for x in hidden_states}
        with open("./data/emit_probability.txt","r") as file:
            datas = file.readlines()
            for line in datas:
                split_line = line.strip().split(",")
                result[split_line[0]][split_line[1]] = split_line[2]
        return  result
    def get_observed_states(self,sentence):
        return sentence

    def viterbi(self, input_sentence):
        """
        Inference the best hidden state sequence
        input_setence: observable tokens
        :return: best state sequence
        """
        hidden_states=self.hidden_states
        initial_probability=self.initial_vector
        transition_probability=self.transision_matrix
        emit_probability=self.emission_matrix
        observed_states = self.get_observed_states(sentence=input_sentence)
        self.observed_states = observed_states
        result = []
        compute_recode = [] #记录每一次的计算结果
        #初始化
        tmp_result = {}
        for state in hidden_states:
            if observation[0] in emit_probability[state] :
                tmp_result[state] = eval(initial_probability[state])*eval(emit_probability[state][observation[0]])
            else:
                tmp_result[state] = 0
        compute_recode.append(tmp_result)

        #对于之后的词语，继续计算
        for index,word in enumerate(observation[1:]):
            tmp_result = {}
            for current_state in hidden_states:
                #取最大值：上一次的所有状态(x)*转移到当前状态（current_state）*发射概率
                if word in emit_probability[current_state]:
                    tmp_result[current_state] = max([compute_recode[index][x]*eval(transition_probability[x][current_state])*
                                                              eval(emit_probability[current_state][word]) for x in hidden_states])
                else:
                    tmp_result[current_state] = 0
            compute_recode.append(tmp_result)

        #返回概率最大的标签序列
        tag_sequence = []
        for recode in compute_recode:
            tag_sequence.append(max(recode, key=recode.get))
        return tag_sequence
    def get_organization(self, observation, sequence, patterns):
        """
        得到识别的机构名
        :param observation: 单词序列
        :param sequence: 标注序列
        :param patterns: 模式串
        :return: list，机构名
        """
        org_indices = []  # 存放机构名的索引
        orgs = [] # 存放机构名字符串
        tag_sequence_str = "".join(tag_sequence)  # 转为字符串
        for pattern in patterns:
            if pattern in tag_sequence_str:
                start_index = (tag_sequence_str.index(pattern))
                end_index = start_index + len(pattern)
                org_indices.append([start_index,end_index])
        if len(org_indices)!=0:
            for start,end in org_indices:
                orgs.append("".join(observation[start:end]))
        return orgs

    
def ner(orgrecog, setence):
    sentence = ["始##始"]
    sentence.extend(list(jieba.cut(sentence_str)))
    sentence.append("末##末")
    print(sentence)
    tag_sequence = orgrecog.viterbi(sentence)
    print( tag_sequence )
    patterns = orgrecog.load_patterns()
    results = orgrecog.get_organization(sentence,tag_sequence,patterns)
    
    if len(results) == 0:
        print ("未识别到机构名")
        print (tag_sequence)
    else:
        print('===识别的实体===')
        for result in results:
            print (result)
    print('-'*80)
            
if __name__ == '__main__':
    orgrecog = OrgRecognize()  
    # Note the corpus is too old, so some new organization cannot be recognized
    sentence_str = "人民日报出版社头版头条"
    ner(orgrecog, sentence_str)
    sentence_str = "新中国成立啦"
    ner(orgrecog, sentence_str)
    sentence_str = "中国移动研究所在京召开大会"
    ner(orgrecog, sentence_str)


['始##始', '人民日报', '出版社', '头版头条', '末##末']
['S', 'I', 'D', 'A', 'A']
===识别的实体===
人民日报出版社
--------------------------------------------------------------------------------
['始##始', '新', '中国', '成立', '啦', '末##末']
['S', 'I', 'D', 'A', 'A']
===识别的实体===
新中国
--------------------------------------------------------------------------------
['始##始', '中国移动', '研究所', '在', '京', '召开大会', '末##末']
['S', 'I', 'D', 'A', 'A']
===识别的实体===
中国移动研究所
--------------------------------------------------------------------------------


# Demonstration of CRF for sequence labeling
reference: [link](http://www.albertauyeung.com/post/python-sequence-labelling-with-crf/)  
The task is to identify country names, the code include training and test.

In [135]:
import codecs
import numpy as np
import nltk
import pycrfsuite
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Read data file and parse the XML
with codecs.open("n3-collection/reuters.xml", "r", "utf-8") as infile:
    soup = bs(infile, "html5lib")

docs = []
for elem in soup.find_all("document"):
    texts = []

    # Loop through each child of the element under "textwithnamedentities"
    for c in elem.find("textwithnamedentities").children:
        if type(c) == Tag:
            if c.name == "namedentityintext":
                label = "N"  # part of a named entity
            else:
                label = "I"  # irrelevant word
            for w in c.text.split(" "):
                if len(w) > 0:
                    texts.append((w, label))
    docs.append(texts)


data = []
for i, doc in enumerate(docs):

    # Obtain the list of tokens in the document
    tokens = [t for t, label in doc]

    # Perform POS tagging
    tagged = nltk.pos_tag(tokens)

    # Take the word, POS tag, and its label
    data.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])


def word2features(doc, i):
    word = doc[i][0]
    postag = doc[i][1]

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag
    ]

    # Features for words that are not
    # at the beginning of a document
    if i > 0:
        word1 = doc[i-1][0]
        postag1 = doc[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.isdigit=%s' % word1.isdigit(),
            '-1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'beginning of a document'
        features.append('BOS')

    # Features for words that are not 
    # at the end of a document
    if i < len(doc)-1:
        word1 = doc[i+1][0]
        postag1 = doc[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.isdigit=%s' % word1.isdigit(),
            '+1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'end of a document'
        features.append('EOS')

    return features

# A function for extracting features in documents
def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

# A function fo generating the list of labels for each document
def get_labels(doc):
    return [label for (token, postag, label) in doc]


X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

trainer = pycrfsuite.Trainer(verbose=False)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,  

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf.model')

# Generate predictions
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Let's take a look at a random sample in the testing set
i = 12
outs = []
for x, y in zip(y_pred[i], [x[1].split("=")[1] for x in X_test[i]]):
    outs.append("%s (%s)" % (y, x))
print(' || '.join(outs))

# Create a mapping of labels to indices
labels = {"N": 1, "I": 0}

# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Print out the classification report
print(classification_report(
    truths, predictions,
    target_names=["I", "N"]))

blocker (N) || energy (N) || corp (N) || said (I) || an (I) || offering (I) || of (I) || 20 (I) || mln (I) || common (I) || shares (I) || is (I) || underway (I) || at (I) || 2.625 (I) || dlrs (I) || per (I) || share (I) || through (I) || underwriters (I) || led (I) || by (I) || drexel (N) || burnham (N) || lambert (N) || inc (N) || and (I) || alex. (N) || brown (N) || and (N) || sons (N) || inc (N) || absb. (I) || the (I) || company (I) || is (I) || offering (I) || 19.7 (I) || mln (I) || shares (I) || and (I) || shareholders (I) || the (I) || rest. (I) || before (I) || the (I) || offering (I) || it (I) || had (I) || about (I) || 33.6 (I) || mln (I) || shares (I) || outstanding. (I)
              precision    recall  f1-score   support

           I       0.98      0.98      0.98      2805
           N       0.85      0.89      0.87       437

   micro avg       0.96      0.96      0.96      3242
   macro avg       0.92      0.93      0.92      3242
weighted avg       0.96      0.96    

# Demonstration of PLSA
Here we provide a implementation of PLSA from [link](https://github.com/isnowfy/plsa.git)

In [141]:

import math
import operator
import random
import gzip
import sys
import marshal
from functools import reduce

def cos_sim(p, q):
    sum0 = sum(map(lambda x:x*x, p))
    sum1 = sum(map(lambda x:x*x, q))
    sum2 = sum(map(lambda x:x[0]*x[1], zip(p, q)))
    return sum2/(sum0**0.5)/(sum1**0.5)

def _rand_mat(sizex, sizey):
    ret = []
    for i in range(sizex):
        ret.append([])
        for _ in range(sizey):
            ret[-1].append(random.random())
        norm = sum(ret[-1])
        for j in range(sizey):
            ret[-1][j] /= norm
    return ret

class Plsa:

    def __init__(self, corpus, topics=2):
        self.topics = topics
        self.corpus = corpus
        self.docs = len(corpus)
        self.each = list(map(sum, map(lambda x:x.values(), corpus)))
        self.words = max(reduce(operator.add, map( lambda x:list(x.keys()), corpus)))+1
        self.likelihood = 0
        self.zw = _rand_mat(self.topics, self.words)
        self.dz = _rand_mat(self.docs, self.topics)
        self.dw_z = None
        self.p_dw = []
        self.beta = 0.8

    def save(self, fname, iszip=True):
        d = {}
        for k, v in self.__dict__.items():
            if hasattr(v, '__dict__'):
                d[k] = v.__dict__
            else:
                d[k] = v
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        if not iszip:
            marshal.dump(d, open(fname, 'wb'))
        else:
            f = gzip.open(fname, 'wb')
            f.write(marshal.dumps(d))
            f.close()

    def load(self, fname, iszip=True):
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        if not iszip:
            d = marshal.load(open(fname, 'rb'))
        else:
            try:
                f = gzip.open(fname, 'rb')
                d = marshal.loads(f.read())
            except IOError:
                f = open(fname, 'rb')
                d = marshal.loads(f.read())
            f.close()
        for k, v in d.items():
            if hasattr(self.__dict__[k], '__dict__'):
                self.__dict__[k].__dict__ = v
            else:
                self.__dict__[k] = v

    def _cal_p_dw(self):
        self.p_dw = []
        for d in range(self.docs):
            self.p_dw.append({})
            for w in self.corpus[d]:
                tmp = 0
                for _ in range(self.corpus[d][w]):
                    for z in range(self.topics):
                        tmp += (self.zw[z][w]*self.dz[d][z])**self.beta
                self.p_dw[-1][w] = tmp

    def _e_step(self):
        self._cal_p_dw()
        self.dw_z = []
        for d in range(self.docs):
            self.dw_z.append({})
            for w in self.corpus[d]:
                self.dw_z[-1][w] = []
                for z in range(self.topics):
                    self.dw_z[-1][w].append(((self.zw[z][w]*self.dz[d][z])**self.beta)/self.p_dw[d][w])

    def _m_step(self):
        for z in range(self.topics):
            self.zw[z] = [0]*self.words
            for d in range(self.docs):
                for w in self.corpus[d]:
                    self.zw[z][w] += self.corpus[d][w]*self.dw_z[d][w][z]
            norm = sum(self.zw[z])
            for w in range(self.words):
                self.zw[z][w] /= norm
        for d in range(self.docs):
            self.dz[d] = [0]*self.topics
            for z in range(self.topics):
                for w in self.corpus[d]:
                    self.dz[d][z] += self.corpus[d][w]*self.dw_z[d][w][z]
            for z in range(self.topics):
                self.dz[d][z] /= self.each[d]

    def _cal_likelihood(self):
        self.likelihood = 0
        for d in range(self.docs):
            for w in self.corpus[d]:
                self.likelihood += self.corpus[d][w]*math.log(self.p_dw[d][w])

    def train(self, max_iter=100):
        cur = 0
        for i in range(max_iter):
            print ('%d iter' % i)
            self._e_step()
            self._m_step()
            self._cal_likelihood()
            print ('likelihood %f ' % self.likelihood)
            if cur != 0 and abs((self.likelihood-cur)/cur) < 1e-8:
                break
            cur = self.likelihood

    def inference(self, doc, max_iter=100):
        doc = dict(filter(lambda x:x[0]<self.words, doc.items()))
        words = sum(doc.values())
        ret = []
        for i in range(self.topics):
            ret.append(random.random())
        norm = sum(ret)
        for i in range(self.topics):
            ret[i] /= norm
        tmp = 0
        for _ in range(max_iter):
            p_dw = {}
            for w in doc:
                p_dw[w] = 0
                for _ in range(doc[w]):
                    for z in range(self.topics):
                        p_dw[w] += (ret[z]*self.zw[z][w])**self.beta
            # e setp
            dw_z = {}
            for w in doc:
                dw_z[w] = []
                for z in range(self.topics):
                    dw_z[w].append(((self.zw[z][w]*ret[z])**self.beta)/p_dw[w])
            # m step
            ret = [0]*self.topics
            for z in range(self.topics):
                for w in doc:
                    ret[z] += doc[w]*dw_z[w][z]
            for z in range(self.topics):
                ret[z] /= words
            # cal likelihood
            likelihood = 0
            for w in doc:
                likelihood += doc[w]*math.log(p_dw[w])
            if tmp != 0 and abs((likelihood-tmp)/tmp) < 1e-8:
                break
            tmp = likelihood
        return ret

    def post_prob_sim(self, docd, q):
        sim = 0
        for w in docd:
            tmp = 0
            for z in range(self.topics):
                tmp += self.zw[z][w]*q[z]
            sim += docd[w]*math.log(tmp)
        return sim

######### unittest #################################



def test_train():
    corpus = [{0:2,3:5},{0:5,2:1},{1:2,4:5}]
    p = Plsa(corpus)
    p.train()
    assert cos_sim(p.dz[0], p.dz[1])>cos_sim(p.dz[0], p.dz[2])
    assert p.post_prob_sim(p.corpus[0], p.dz[1])>p.post_prob_sim(p.corpus[0], p.dz[2])

def test_inference():
    corpus = [{0:2,3:5},{0:5,2:1},{1:2,4:5}]
    p = Plsa(corpus)
    p.train()
    z = p.inference({0:4, 6:7})
    assert abs(cos_sim(p.dz[0], p.dz[1])-cos_sim(p.dz[0], z))<1e-8
test_train()
test_inference()


0 iter
likelihood 2.123584 
1 iter
likelihood -12.995615 
2 iter
likelihood -11.750363 
3 iter
likelihood -10.234484 
4 iter
likelihood -8.967368 
5 iter
likelihood -8.107269 
6 iter
likelihood -7.478987 
7 iter
likelihood -7.054530 
8 iter
likelihood -6.857159 
9 iter
likelihood -6.811286 
10 iter
likelihood -6.807646 
11 iter
likelihood -6.807588 
12 iter
likelihood -6.807588 
13 iter
likelihood -6.807588 
0 iter
likelihood 4.392996 
1 iter
likelihood -14.747961 
2 iter
likelihood -14.542116 
3 iter
likelihood -14.353453 
4 iter
likelihood -14.086266 
5 iter
likelihood -13.601759 
6 iter
likelihood -12.697529 
7 iter
likelihood -11.290563 
8 iter
likelihood -9.747169 
9 iter
likelihood -8.513129 
10 iter
likelihood -7.593646 
11 iter
likelihood -7.028783 
12 iter
likelihood -6.834483 
13 iter
likelihood -6.808457 
14 iter
likelihood -6.807592 
15 iter
likelihood -6.807588 
16 iter
likelihood -6.807588 


# Demonstration of LDA  for text analysis

Analyze the topic in chapters of 《Journey to the West》  

In [115]:
def load_weicheng(filename='story.txt'):
    lines = open(filename).readlines()
    books = []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        if line.startswith('第') and line.endswith('章'):
            books.append(line)
        else:
            books[-1] += line
    for i in range(len(books)):
        books[i] = ' '.join(jieba.cut(books[i]))
    return books

def load_honglou(filename='honglou.txt'):
    lines = open(filename).readlines()
    book = [[]]
    for line in lines:
        line = line.strip()
        if not line:
            continue
        if '书香屋' in line:
            book[-1].pop()
            book.append([])
        else:
            book[-1].append(line)
    return [' '.join(jieba.cut(''.join(chp))) for chp in book]

def load_xiyou(filename='xiyouji_wuchengen.txt'):
    
    def _is_sep(line):
        pat = re.compile(u'第.{1,3}回')
        if pat.search(line):
            return True
        return False
    
    lines = open(filename).readlines()
    book = [[]]
    for line in lines:
        line = line.strip()
        if not line:
            continue
        if _is_sep(line):
            book.append([])
        else:
            book[-1].append(line)
    print(len(book))
    book = [''.join(chp) for chp in book if len(chp)]
    book_noun = []
    for chp in book:
        words = pseg.cut(chp)
        book_noun.append( ' '.join([word for word, flag in words if flag[0] == 'n']))
        # book_noun.append( ' '.join(jieba.cut(chp)))
    return book_noun

In [117]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx) + " ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]) )


documents = load_xiyou()
print()
no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 10


# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=20, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

no_top_words = 20
print('\nLDA')
display_topics(lda, tf_feature_names, no_top_words)

122






LDA
Topic 0:三藏 师父 行者 形容 大圣 这里 一个 八戒 意思 后来 不知 唐僧 那里 不是 所以 古代 小龙 土地 故事 甚么
Topic 1:行者 八戒 师父 三藏 一个 大圣 沙僧 妖精 那里 怎么 唐僧 我们 不知 不是 菩萨 和尚 呆子 两个 只见 三个
Topic 2:意思 这里 佛教 道教 形容 后来 古代 一种 传说 称为 故事 比喻 神仙 所以 一个 这是 就是 地方 又称 认为
Topic 3:行者 八戒 师父 三藏 一个 唐僧 沙僧 怎么 那里 我们 大圣 和尚 妖精 不知 两个 菩萨 甚么 长老 不是 国王
Topic 4:菩萨 太宗 御弟 南无 女王 袈裟 玄奘 三藏 长老 唐王 太师 圣僧 法师 行者 真经 锡杖 唐僧 取经 师父 徒弟
Topic 5:大圣 菩萨 行者 天王 悟空 玉帝 一个 猴王 如来 不知 太宗 只见 那里 如何 两个 太子 大王 陛下 哪吒 龙王
Topic 6:行者 菩萨 一个 这里 八戒 甚么 师父 唐僧 称为 就是 不知 袈裟 乃是 后来 猴王 两个 原来 那个 一种 今日
Topic 7:八戒 国王 公主 三藏 师父 长老 唐僧 行者 菩萨 沙僧 两个 徒弟 和尚 陛下 如何 意思 不敢 不知 驸马 一个
Topic 8:光蕊 玄奘 丞相 婆婆 母亲 龙王 我儿 和尚 打死 一个 师父 今日 夫人 只见 父母 长老 唐王 孩儿 夜叉 报仇
Topic 9:八戒 行者 菩萨 师父 一个 佛教 大圣 意思 这里 三藏 后来 悟空 怎么 取经 甚么 不知 不是 故事 形容 沙僧
