In [1]:
import random

In [39]:
# Toy grammar for movie comments. Each non-empty line is one rule written as
# `name = alternative | alternative`; symbols inside an alternative are
# separated by spaces. `others` refers to itself, so it can expand into one or
# more `other` phrases.
douban_comment = '''
comment = prefix this compare others evaluate
prefix = 平心而论 | 实话实说
this = 这部电影
compare = 比 | 相比
others = other others | other
other = 前作 | 国外版本 | 其他版本
evaluate = 好多了 | 差很多
'''

In [37]:
# Second toy grammar, in the same `name = alternative | alternative` format,
# whose start symbol is `answer`.
apple_genius = '''
answer = prefix action suffix 
prefix = 我们给您
action = 换个新的 | 送修 
suffix = 请出示您的保修卡
'''

In [7]:
def get_generation_by_gram(grammar_str: str, target, stmt_split='=', or_split='|'):
    """Parse a textual grammar and generate one random sentence from it.

    Args:
        grammar_str: Grammar text with one rule per line, written as
            ``name <stmt_split> alt1 <or_split> alt2 ...``.
        target: Name of the rule to start expanding from.
        stmt_split: Separator between a rule name and its expression.
        or_split: Separator between the alternatives of one expression.

    Returns:
        The string produced by ``generate`` for ``target``.

    Raises:
        ValueError: If a non-blank line does not contain ``stmt_split``.
    """
    rules = dict()  # key is the rule name, value is the list of alternatives
    for line in grammar_str.split('\n'):
        # Skip blank lines, including lines that only contain whitespace.
        if not line.strip():
            continue
        # Split on the FIRST separator only, so the expression itself may
        # contain the separator character.
        stmt, sep, expr = line.partition(stmt_split)
        if not sep:
            raise ValueError(
                'grammar line has no %r separator: %r' % (stmt_split, line)
            )
        rules[stmt.strip()] = expr.split(or_split)

    return generate(rules, target=target)

In [5]:
def generate(grammar_rule, target):
    """Recursively expand ``target`` into a string using ``grammar_rule``.

    Args:
        grammar_rule: Mapping from a rule name to a list of alternative
            expansions, each a whitespace-separated string of symbols.
        target: Symbol to expand.

    Returns:
        ``target`` itself when it is not a key of ``grammar_rule``; otherwise
        the concatenation (no separator) of the expansions of every symbol in
        one randomly chosen alternative.
    """
    # A symbol without a rule is a terminal and is emitted verbatim.
    if target not in grammar_rule:
        return target

    chosen = random.choice(grammar_rule[target])
    pieces = []
    for symbol in chosen.split():
        pieces.append(generate(grammar_rule, target=symbol.strip()))
    return ''.join(pieces)

In [40]:
# Generate one random sentence from the movie-comment grammar, starting at `comment`.
get_generation_by_gram(douban_comment, target='comment', stmt_split='=')

'平心而论这部电影比其他版本好多了'

In [41]:
# Generate one random sentence from the second grammar, starting at `answer`.
get_generation_by_gram(apple_genius, target='answer', stmt_split='=')

'我们给您换个新的请出示您的保修卡'

In [59]:
def generate_n(grammar_str, target, n):
    """Generate ``n`` random sentences from a grammar, printing each one.

    Args:
        grammar_str: Grammar text accepted by ``get_generation_by_gram``.
        target: Name of the rule to start expanding from.
        n: Number of sentences to generate.

    Returns:
        A list with the ``n`` generated sentences, in generation order.
    """
    results = []
    for _ in range(n):
        # Generate once and reuse the value: generation is random, so a second
        # call would store a different sentence from the one printed.
        sentence = get_generation_by_gram(grammar_str, target)
        print(sentence)
        results.append(sentence)
    return results

In [42]:
# Print five random sentences from the movie-comment grammar.
generate_n(douban_comment, 'comment', 5)

实话实说这部电影相比其他版本前作国外版本好多了
实话实说这部电影相比前作前作好多了
平心而论这部电影比其他版本差很多
平心而论这部电影比前作国外版本国外版本国外版本国外版本好多了
平心而论这部电影相比其他版本国外版本国外版本国外版本差很多


In [25]:
import jieba

In [33]:
def cut(comments):
    """Tokenize every comment with jieba and return one flat token list.

    Args:
        comments: Iterable of comment texts (for example a list or a pandas
            Series). Entries that are not ``str``/``bytes`` (such as NaN for
            a missing comment) are printed and skipped.

    Returns:
        A list of all tokens from all comments, in order, with the
        punctuation/whitespace tokens in ``stop_words`` removed.
    """
    stop_words = {',','.','。','，','《', '》', '!','！','?','？',':','：','','~','…',' ','、'}
    result = []
    # Iterate over the values directly instead of indexing by position, so any
    # iterable works regardless of how it is indexed.
    for sentence in comments:
        # Check the type explicitly rather than hiding every possible error
        # behind a bare `except:`.
        if not isinstance(sentence, (str, bytes)):
            print(sentence)
            continue
        result.extend(
            word for word in jieba.cut(sentence) if word not in stop_words
        )
    return result

In [21]:
import pandas as pd
# Read the movie-comment CSV directly from its GitHub URL (needs network access).
file = pd.read_csv('https://raw.githubusercontent.com/Computing-Intelligence/datasource/master/movie_comments.csv')
# Preview the first rows of the loaded frame.
file.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,link,name,comment,star
0,1,https://movie.douban.com/subject/26363254/,战狼2,吴京意淫到了脑残的地步，看了恶心想吐,1
1,2,https://movie.douban.com/subject/26363254/,战狼2,首映礼看的。太恐怖了这个电影，不讲道理的，完全就是吴京在实现他这个小粉红的英雄梦。各种装备轮...,2
2,3,https://movie.douban.com/subject/26363254/,战狼2,吴京的炒作水平不输冯小刚，但小刚至少不会用主旋律来炒作…吴京让人看了不舒服，为了主旋律而主旋...,2
3,4,https://movie.douban.com/subject/26363254/,战狼2,凭良心说，好看到不像《战狼1》的续集，完虐《湄公河行动》。,4
4,5,https://movie.douban.com/subject/26363254/,战狼2,中二得很,1


In [34]:
# Flat token list built from the `comment` column; cut() prints any entry it
# could not tokenize.
TOKENS = cut(file['comment'])

nan
nan


In [35]:
from collections import Counter
# Single-token (1-gram) frequency table over the whole corpus.
words_count = Counter(TOKENS)
# Show the 20 most frequent tokens.
words_count.most_common(20)

[('的', 328305),
 ('了', 102408),
 ('是', 72749),
 ('我', 49980),
 ('都', 36283),
 ('很', 34681),
 ('看', 34366),
 ('电影', 33753),
 ('也', 32055),
 ('和', 31316),
 ('在', 31243),
 ('不', 28449),
 ('有', 27965),
 ('就', 25662),
 ('人', 24218),
 ('好', 23231),
 ('啊', 20859),
 ('还', 17504),
 ('一个', 17343),
 ('你', 17229)]

In [43]:
# Every pair of adjacent tokens, joined without a separator, in corpus order.
_2_gram_words = [
    first + second for first, second in zip(TOKENS, TOKENS[1:])
]

In [44]:
# Frequency table of the concatenated adjacent-token pairs (2-grams).
_2_gram_word_counts = Counter(_2_gram_words)

In [45]:
def get_gram_count(word, wc):
    """Return the count of ``word`` in ``wc``, with a fallback for unseen words.

    Args:
        word: The token (or concatenated n-gram) to look up.
        wc: Mapping from token to count, e.g. a ``collections.Counter``.

    Returns:
        ``wc[word]`` when ``word`` is present. Otherwise the smallest count
        stored in ``wc``, or ``1`` when ``wc`` is empty.
    """
    if word in wc:
        return wc[word]
    # Unseen word: fall back to the smallest observed count. min() yields the
    # same value as most_common()[-1][-1] without sorting the whole table on
    # every lookup, and `default` keeps an empty table from raising.
    return min(wc.values(), default=1)

In [56]:
def two_gram_model(sentence):
    """Score ``sentence`` with the 2-gram language model.

    The sentence is tokenized with jieba and tokens listed in ``ignored`` are
    dropped. For every adjacent pair ``(word, next_word)`` the ratio
    ``count(word + next_word) / count(next_word)`` is computed through
    ``get_gram_count``; the score is the product of these ratios.

    Returns:
        The product of the pair ratios, or ``1`` when fewer than two tokens
        remain after filtering.
    """
    ignored = {',','.','。','，','《', '》', '!','！','?','？',':','：','','~','…',' ','、'}

    words = []
    for piece in jieba.cut(sentence):
        if piece in ignored:
            continue
        words.append(piece)

    score = 1
    # Walk the sentence as (token, following token) pairs.
    for current, following in zip(words, words[1:]):
        pair_count = get_gram_count(current + following, _2_gram_word_counts)
        # The denominator is the count of the FOLLOWING token.
        single_count = get_gram_count(following, words_count)
        score *= pair_count / single_count

    return score

In [58]:
# Score one of the sentences generated earlier from the movie-comment grammar.
two_gram_model('实话实说这部电影相比前作前作好多了')

5.105302276553972e-14

In [60]:
def generate_best(grammar_str, target, n):
    """Generate ``n`` sentences and return the one scored highest.

    Args:
        grammar_str: Grammar text accepted by ``generate_n``.
        target: Name of the rule to start expanding from.
        n: Number of candidate sentences to generate.

    Returns:
        The candidate with the largest ``two_gram_model`` score; on ties, the
        candidate generated first.
    """
    scored = [
        (candidate, two_gram_model(candidate))
        for candidate in generate_n(grammar_str, target, n)
    ]
    # Stable descending sort: equal scores keep their generation order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    best_sentence, _ = scored[0]
    return best_sentence

In [61]:
# Generate five candidate comments and keep the one the 2-gram model scores highest.
generate_best(douban_comment, 'comment', 5)

实话实说这部电影相比前作差很多
平心而论这部电影相比其他版本好多了
平心而论这部电影相比国外版本前作好多了
实话实说这部电影比前作其他版本好多了
平心而论这部电影比其他版本其他版本前作差很多


'平心而论这部电影相比其他版本好多了'