## 文のベクトル表現

### 例題2.5

In [1]:
from collections import Counter
from numpy.linalg import norm
import numpy as np

s1 = "the pen is mightier than the sword".split()
s2 = "the sword is mightier than the pen".split()

keys = list(set(s1 + s2))

counter_s1 = Counter(s1)
counter_s2 = Counter(s2)

bow_s1 = np.array([counter_s1[key] if counter_s1[key] else 0 for key in keys])
bow_s2 = np.array([counter_s2[key] if counter_s2[key] else 0 for key in keys])

print("bag-of-words of s1:")
print(bow_s1)
print("bag-of-words of s2:")
print(bow_s2)

bag-of-words of s1:
[1 1 1 1 1 2]
bag-of-words of s2:
[1 1 1 1 1 2]


In [2]:
sim_bow = bow_s1.dot(bow_s2) / (norm(bow_s1) * norm(bow_s2))
print("sim_cos(s1, s2) of bag-of-words:")
print(sim_bow)

sim_cos(s1, s2) of bag-of-words:
1.0


In [3]:
keys = list(set(list(zip(s1, s1[1:])) + list(zip(s2, s2[1:]))))

counter_s1 = Counter(list(zip(s1, s1[1:])))
counter_s2 = Counter(list(zip(s2, s2[1:])))

bobi_s1 = np.array([counter_s1[key] if counter_s1[key] else 0 for key in keys])
bobi_s2 = np.array([counter_s2[key] if counter_s2[key] else 0 for key in keys])

print("bag-of-bigrams of s1:")
print(bobi_s1)
print("bag-of-bigrams of s2:")
print(bobi_s2)

bag-of-bigrams of s1:
[1 1 1 1 1 0 1]
bag-of-bigrams of s2:
[1 1 1 1 0 1 1]


In [4]:
sim_bobi = bobi_s1.dot(bobi_s2) / (norm(bobi_s1) * norm(bobi_s2))
print("sim_cos(s1, s2) of bag-of-bigrams:")
print(sim_bobi)

sim_cos(s1, s2) of bag-of-bigrams:
0.833333333333


### 例題2.6

In [5]:
from functools import reduce
import re

s1 = re.sub(r'[\.,?]', "", "A cat sat on the mat.").lower().split(" ")
s2 = re.sub(r'[\.,?]', "", "Cats are sitting on the mat.").lower().split(" ")

def vectorize_with_bow(*args):
    keys = sorted(set(reduce(lambda x, y: x + y, args)))
    counters = [Counter(arg) for arg in args]
    return [np.array([c[key] if c[key] else 0 for key in keys]) for c in counters]

def sim_cos(lhs, rhs):
    return lhs.dot(rhs) / norm(lhs) / norm(rhs)
    
bow_s1, bow_s2 = vectorize_with_bow(s1, s2)
print("bow of s1:")
print(bow_s1)
print("bow of s2:")
print(bow_s2)
print("sim_cos(s1, s2):")
print(sim_cos(bow_s1, bow_s2))

bow of s1:
[1 0 1 0 1 1 1 0 1]
bow of s2:
[0 1 0 1 1 1 0 1 1]
sim_cos(s1, s2):
0.5


In [6]:
lemmatizated_s1 = "a cat sit on the mat".split(" ")
lemmatizated_s2 = "cat are sit on the mat.".split(" ")
bow_s1, bow_s2 = vectorize_with_bow(lemmatizated_s1, lemmatizated_s2)
print("bow of s1:")
print(bow_s1)
print("bow of s2:")
print(bow_s2)
print("sim_cos(s1, s2):")
print(sim_cos(bow_s1, bow_s2))

bow of s1:
[1 0 1 1 0 1 1 1]
bow of s2:
[0 1 1 0 1 1 1 1]
sim_cos(s1, s2):
0.666666666667


### 章末問題1

In [10]:
w = "tattarrattat"
print("character unigram: {%s}" % ", ".join(list(w)))
print("vector by unigram: %s" % vectorize_with_bow(w))
print("character bigram: {%s}" % ", ".join(["".join(x) for x in list(zip(list(w), list(w)[1:]))]))
print("vector by bigram: %s" % vectorize_with_bow(list(zip(list(w), list(w)[1:]))))
print("character trigram: {%s}" % ", ".join(["".join(x) for x in list(zip(list(w), list(w)[1:], list(w)[2:]))]))
print("vector by trigram: %s" % vectorize_with_bow(list(zip(list(w), list(w)[1:], list(w)[2:]))))

character unigram: {t, a, t, t, a, r, r, a, t, t, a, t}
vector by unigram: [array([4, 2, 6])]
character bigram: {ta, at, tt, ta, ar, rr, ra, at, tt, ta, at}
vector by bigram: [array([1, 3, 1, 1, 3, 2])]
character trigram: {tat, att, tta, tar, arr, rra, rat, att, tta, tat}
vector by trigram: [array([1, 2, 1, 1, 1, 2, 2])]


### 章末問題2

In [15]:
s = "A cat sat on the mat."
lemmatizated_s = 'a cat sit on the mat'.split(" ")
stopword = ['a', 'the', 'on', 'in', 'of']
lemmatizated_and_stopword_removed = [w for w in lemmatizated_s if w not in stopword]
vectorize_with_bow(lemmatizated_and_stopword_removed)

[array([1, 1, 1])]

### 章末問題3

In [37]:
from itertools import chain

s = re.sub(r'[\.,?]', '', "I had a supercalifragilisticexpialidocious time with the friends.").lower().split()
target = "supercalifragilisticexpialidocious"
target_idx = 3
keys = list(chain.from_iterable([((w, 2), (w, 1), (w, -1), (w, -2), )
                                 for w in s if w != target]))
[1 if s[target_idx - key[1]] is key[0] else 0 for key in keys]

[0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]