In [2]:
from janome.tokenizer import Tokenizer

t = Tokenizer()
sentences = [
    'おいしいビールを飲む', 'コーヒーを飲む', 'おいしいクラフトビールを買う'
]

words_list = []
for sentence in sentences:
    words_list.append(t.tokenize(sentence, wakati=True))
words_list

[['おいしい', 'ビール', 'を', '飲む'],
 ['コーヒー', 'を', '飲む'],
 ['おいしい', 'クラフト', 'ビール', 'を', '買う']]

In [3]:
unique_words = []
for words in words_list:
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
unique_words

['おいしい', 'ビール', 'を', '飲む', 'コーヒー', 'クラフト', '買う']

In [4]:
bow_list = []
for words in words_list:
    bag_of_words = []
    for unique_word in unique_words:
        num = words.count(unique_word)
        bag_of_words.append(num)
    bow_list.append(bag_of_words)
bow_list

[[1, 1, 1, 1, 0, 0, 0], [0, 0, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 1, 1]]

In [5]:
from math import log
num_of_sentences = len(sentences)
idf = []
for i in range(len(unique_words)):
    count = 0
    for bow in bow_list:
        if bow[i] > 0:
            count += 1
    idf.append(log((num_of_sentences + 1) / (count + 1)))
idf

[0.28768207245178085,
 0.28768207245178085,
 0.0,
 0.28768207245178085,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453]

In [6]:
bow = bow_list[1]
num_of_words = sum(bow)
tfidf = []
for i, value in enumerate(bow):
    tf = value / num_of_words
    tfidf.append(tf * (idf[i] + 1))
tfidf

[0.0,
 0.0,
 0.3333333333333333,
 0.42922735748392693,
 0.5643823935199818,
 0.0,
 0.0]

In [7]:
from janome.tokenizer import Tokenizer

BEGIN = '__BEGIN__'
END = '__END__'

sentence = 'おいしいビールを飲もう'

t = Tokenizer()
words = t.tokenize(sentence, wakati=True)
words = [BEGIN] + words + [END]

three_words_list = []
for i in range(len(words) - 2):
    three_words_list.append(words[i:i+3])
three_words_list

[['__BEGIN__', 'おいしい', 'ビール'],
 ['おいしい', 'ビール', 'を'],
 ['ビール', 'を', '飲も'],
 ['を', '飲も', 'う'],
 ['飲も', 'う', '__END__']]

In [14]:
from collections import Counter

def get_three_words_list(sentence):
    """文章を３単語の組にして返す"""
    t = Tokenizer()
    words = t.tokenize(sentence, wakati=True)
    words = [BEGIN] + words + [END]
    three_words_list = []
    for i in range(len(words) - 2):
        three_words_list.append(tuple(words[i:i+3]))
    return three_words_list
    
sentences = ['おいしいビールを飲もう', 'ビールを飲もう', 'おいしいビールは生']
three_words_list = []
for sentence in sentences:
    three_words_list += get_three_words_list(sentence)
three_words_count = Counter(three_words_list)
three_words_count

Counter({('__BEGIN__', 'おいしい', 'ビール'): 2,
         ('おいしい', 'ビール', 'を'): 1,
         ('ビール', 'を', '飲も'): 2,
         ('を', '飲も', 'う'): 2,
         ('飲も', 'う', '__END__'): 2,
         ('__BEGIN__', 'ビール', 'を'): 1,
         ('おいしい', 'ビール', 'は'): 1,
         ('ビール', 'は', '生'): 1,
         ('は', '生', '__END__'): 1})

In [9]:
def generate_markov_dict(three_words_count):
    """マルコフ連鎖での文章生成用の辞書データを生成する"""
    markov_dict = {}
    for three_words, count in three_words_count.items():
        two_words = three_words[:2]
        next_word = three_words[2]
        if two_words not in markov_dict:
            markov_dict[two_words] = {'words': [], 'weights': []}
        markov_dict[two_words]['words'].append(next_word)
        markov_dict[two_words]['weights'].append(count)
    return markov_dict
markov_dict = generate_markov_dict(three_words_count)
markov_dict

{('__BEGIN__', 'おいしい'): {'words': ['ビール'], 'weights': [2]},
 ('おいしい', 'ビール'): {'words': ['を', 'は'], 'weights': [1, 1]},
 ('ビール', 'を'): {'words': ['飲も'], 'weights': [2]},
 ('を', '飲も'): {'words': ['う'], 'weights': [2]},
 ('飲も', 'う'): {'words': ['__END__'], 'weights': [2]},
 ('__BEGIN__', 'ビール'): {'words': ['を'], 'weights': [1]},
 ('ビール', 'は'): {'words': ['生'], 'weights': [1]},
 ('は', '生'): {'words': ['__END__'], 'weights': [1]}}

In [10]:
sentence = 'あえいうえおあお'
d = {}
for char in sentence:
    if char in d:
        d[char] += 1
    else:
        d[char] = 1
d

{'あ': 2, 'え': 2, 'い': 1, 'う': 1, 'お': 2}

In [11]:
from collections import defaultdict
sentence = 'あえいうえおあお'
dd = defaultdict(int)
for char in sentence:
    dd[char] += 1
dd

defaultdict(int, {'あ': 2, 'え': 2, 'い': 1, 'う': 1, 'お': 2})

In [16]:
from collections import defaultdict

def get_first_word_and_count(three_words_count):
    """最初の単語を選択するための辞書データを作成する"""
    first_word_count = defaultdict(int)
    
    for three_words, count in three_words_count.items():
        if three_words[0] == BEGIN:
            next_word = three_words[1]
            first_word_count[next_word] += count
    return first_word_count

get_first_word_and_count(three_words_count)

defaultdict(int, {'おいしい': 2, 'ビール': 1})

In [17]:
def get_first_words_weights(three_words_count):
    """最初の単語と重みのリストを作成する"""
    first_word_count = get_first_word_and_count(three_words_count)
    words = []
    weights = []
    for word, count in first_word_count.items():
        words.append(word)
        weights.append(count)
    
    return words, weights

first_words, first_weights = get_first_words_weights(three_words_count)
first_words, first_weights

(['おいしい', 'ビール'], [2, 1])

In [18]:
import random

def generate_text(first_words, first_weights, markov_dict):
    """入力された辞書データを元に文章を生成する"""
    first_word = random.choices(first_words, weights=first_weights[0])
    generate_words = [BEGIN, first_word]
    while True:
        pair = tuple(generate_words[-2:])
        words = markov_dict[pair]['words']
        weights = markov_dict[pair]['weights']
        next_word = random.choices(words, weights=weights)[0]
        if next_word == END:
            break
        generate_words.append(next_word)
    return '' .join(generate_words[1:])

In [19]:
for _ in range(5):
    text = generate_text(first_words, first_weights, markov_dict)
    print(text)

TypeError: 'int' object is not iterable