-
Notifications
You must be signed in to change notification settings - Fork 93
/
multi_multi_kernel_nb.py
executable file
·117 lines (91 loc) · 3.67 KB
/
multi_multi_kernel_nb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from __future__ import division
import numpy as np
from collections import Counter, defaultdict
class MMGKNB(object):
    """Multi-modal Gaussian-kernel Naive Bayes text classifier.

    Each class is represented by the bag of embedding vectors of the words
    seen in its training documents; the likelihood of a word under a class
    is a sum of isotropic Gaussian kernels (bandwidth ``sigma``) centred on
    those training vectors.

    Parameters
    ----------
    w2v : mapping of word -> 1-D numpy embedding vector
    alpha : smoothing constant, kept for interface parity with
        ``OptimisedMMGKNB`` (unused by this class)
    sigma : Gaussian kernel bandwidth
    """
    def __init__(self, w2v, alpha=1, sigma=1):
        self.alpha = alpha
        self.sigma = sigma
        self.w2v = w2v
        self.vocab = None
        self.priors = {}                               # class -> P(class)
        self.class_word_counts = defaultdict(Counter)  # class -> word frequencies
        self.class_totals = defaultdict(int)           # class -> total token count
        self.class2vecs = {}                           # class -> (n, d) vector bank
        self.cache = {}                                # (class, word) -> log-likelihood

    def vec_loglhood(self, class_, w):
        """Log kernel-density likelihood of word ``w`` under ``class_``.

        Out-of-vocabulary words contribute 0 (i.e. are ignored).
        Results are memoised per (class, word) pair.
        """
        if w not in self.w2v:
            return 0
        key = (class_, w)
        if key in self.cache:
            return self.cache[key]
        x = self.w2v[w]
        # Sum of Gaussian kernels centred on each training vector of the class,
        # evaluated over the whole (n, d) bank in one vectorised expression.
        prob = np.exp(-((self.class2vecs[class_] - x)**2 / (2 * self.sigma**2)).sum(axis=1)).sum()
        ret = np.log(prob)
        self.cache[key] = ret
        return ret

    def predict_one(self, x):
        """Return the most probable class for one document (iterable of words)."""
        scores = {}
        for class_, prior in self.priors.items():
            log_prob = np.log(prior)
            for w in x:
                log_prob += self.vec_loglhood(class_, w)
            scores[class_] = log_prob
        return max(scores.items(), key=lambda z: z[1])[0]

    def predict(self, X):
        """Predict a class label for each document in ``X``."""
        return [self.predict_one(x) for x in X]

    def fit(self, X, y):
        """Fit priors and per-class vector banks from documents ``X`` and labels ``y``.

        ``X`` is a sequence of token lists; ``y`` the matching class labels.
        """
        # Refitting must invalidate memoised log-likelihoods computed against
        # the previous training data (bug fix: cache was never cleared).
        self.cache = {}
        self.priors = dict((class_, count / len(y)) for class_, count in Counter(y).items())
        class2vecs = defaultdict(list)
        for x, class_ in zip(X, y):
            self.class_word_counts[class_].update(x)
            self.class_totals[class_] += len(x)
            vectors = [self.w2v[w] for w in x if w in self.w2v]
            class2vecs[class_].extend(vectors)
        self.class2vecs = class2vecs
        # Stack each class's vectors into an (n, d) array for vectorised kernels.
        for class_ in class2vecs:
            class2vecs[class_] = np.array(class2vecs[class_])
        self.vocab = set(t for x in X for t in x)
class OptimisedMMGKNB(object):
    """Naive Bayes combining Gaussian-kernel vector and smoothed word likelihoods.

    Documents are dicts with two keys:
      - ``'words'``:   list of word tokens, scored with add-``alpha``
                       multinomial smoothing;
      - ``'vectors'``: list of pre-extracted 1-D numpy embedding vectors,
                       scored with a Gaussian kernel density per class.

    Parameters
    ----------
    w2v : mapping of word -> vector, kept for interface parity with
        ``MMGKNB`` (documents carry pre-extracted vectors, so it is unused)
    alpha : Lidstone smoothing constant for word likelihoods
    sigma : Gaussian kernel bandwidth for vector likelihoods
    """
    def __init__(self, w2v, alpha=1, sigma=1):
        self.w2v = w2v
        self.alpha = alpha
        self.sigma = sigma
        self.vocab = None
        self.priors = {}                               # class -> P(class)
        self.class_word_counts = defaultdict(Counter)  # class -> word frequencies
        self.class_totals = defaultdict(int)           # class -> total token count
        self.class2vecs = {}                           # class -> (n, d) vector bank

    def vec_loglhood(self, class_, x):
        """Log kernel-density likelihood of embedding vector ``x`` under ``class_``."""
        # Vectorised sum of Gaussian kernels over the class's (n, d) vector bank.
        prob = np.exp(-((self.class2vecs[class_] - x)**2 / (2 * self.sigma**2)).sum(axis=1)).sum()
        return np.log(prob)

    def word_loglhood(self, class_, word):
        """Add-``alpha`` smoothed multinomial log-likelihood of ``word`` under ``class_``."""
        lhood = (self.class_word_counts[class_].get(word, 0) + self.alpha) /\
                (self.class_totals[class_] + len(self.vocab) * self.alpha)
        return np.log(lhood)

    def predict_one(self, x):
        """Return the most probable class for one document dict."""
        scores = {}
        for class_, prior in self.priors.items():
            log_prob = np.log(prior)
            for vec in x['vectors']:
                log_prob += self.vec_loglhood(class_, vec)
            for w in x['words']:
                log_prob += self.word_loglhood(class_, w)
            scores[class_] = log_prob
        # key lambda renamed so it no longer shadows the document argument ``x``
        return max(scores.items(), key=lambda item: item[1])[0]

    def predict(self, X):
        """Predict a class label for each document in ``X``."""
        return [self.predict_one(x) for x in X]

    def fit(self, X, y):
        """Fit priors, word counts and per-class vector banks.

        ``X`` is a sequence of document dicts (see class docstring); ``y``
        the matching class labels.
        """
        self.priors = dict((class_, count / len(y)) for class_, count in Counter(y).items())
        class2vecs = defaultdict(list)
        for x, class_ in zip(X, y):
            self.class_word_counts[class_].update(x['words'])
            # Bug fix: count the document's tokens — len(x) was the number of
            # dict keys (always 2), which corrupted the smoothing denominator
            # in word_loglhood.
            self.class_totals[class_] += len(x['words'])
            class2vecs[class_].extend(x['vectors'])
        self.class2vecs = class2vecs
        # Stack each class's vectors into an (n, d) array for vectorised kernels.
        for class_ in class2vecs:
            class2vecs[class_] = np.array(class2vecs[class_])
        self.vocab = set(t for x in X for t in x['words'])