In [8]:
import numpy as np

from collections import Counter
from math import exp, log

from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer

from os import listdir

In [9]:
class Bayes(BaseEstimator):
	"""Bernoulli naive Bayes classifier over token sets with per-class weights.

	Every sample is an iterable of hashable tokens; only the presence or
	absence of a token in a sample is used. Class labels must be integers in
	``range(k)`` because they index per-class tables.

	Parameters
	----------
	k : int
		Number of classes.
	lambdas : sequence of numbers
		Non-negative multiplicative weight applied to each class score;
		``lambdas[c]`` belongs to class ``c``.
	alpha : number
		Additive smoothing term. ``alpha=0`` disables smoothing; the resulting
		zero probabilities are handled as ``log(0) = -inf`` instead of raising.
	n : int
		Accepted and stored for interface compatibility; not used by the
		classifier. Samples are expected to be tokenised by the caller.

	Attributes set by ``fit``
	-------------------------
	x_count : set of all tokens seen during training (the vocabulary).
	c_count : Counter mapping class label -> number of training samples.
	p, pos, neg : per-class dicts mapping token -> P(token | class),
		log P(token | class) and log(1 - P(token | class)).
	"""

	def __init__(self, k=2, lambdas=(1, 1), alpha=1, n=1):
		self.k = k
		self.lambdas = lambdas
		self.alpha = alpha
		self.n = n

		self.x_count = set()
		self.c_count = Counter()
		self.p = []
		self.pos = []
		self.neg = []


	@staticmethod
	def _safe_log(value):
		"""Return log(value), or -inf when value is not positive."""
		return log(value) if value > 0 else float('-inf')


	def fit(self, x_train, y_train):
		"""Estimate per-class token probabilities from the training samples.

		x_train is an iterable of samples (each an iterable of tokens) and
		y_train the matching iterable of integer labels in ``range(k)``.
		Any state left by a previous call is discarded. Returns ``self``.
		"""
		# Start from a clean slate so that refitting does not accumulate counts.
		self.x_count = set()
		self.c_count = Counter()

		data = [Counter() for _ in range(self.k)]
		for xs, c in zip(x_train, y_train):
			for x in set(xs):
				self.x_count.add(x)
				data[c][x] += 1
			self.c_count[c] += 1

		self.p, self.pos, self.neg = [], [], []
		for c in range(self.k):
			# Two outcomes per token (present / absent), hence alpha * 2.
			denom = self.c_count[c] + self.alpha * 2
			probs = {x: ((data[c][x] + self.alpha) / denom if denom else 0.0) for x in self.x_count}
			self.p.append(probs)
			self.pos.append({x: self._safe_log(q) for x, q in probs.items()})
			self.neg.append({x: self._safe_log(1 - q) for x, q in probs.items()})
		return self


	def _log_score(self, xs, c):
		"""Return log(lambdas[c] * P(c) * P(xs | c)); -inf for a class without samples."""
		if self.c_count[c] == 0:
			return float('-inf')
		res = log(self.c_count[c] / sum(self.c_count.values())) + self._safe_log(self.lambdas[c])
		pos, neg = self.pos[c], self.neg[c]
		for x in pos:
			res += pos[x] if x in xs else neg[x]
		return res


	def calc_prob_of_xs_on_cond_of_c(self, xs, c):
		"""Return lambdas[c] * P(c) * P(xs | c) as a plain number.

		The value underflows to 0.0 for large vocabularies; predict_one works
		on the log score instead.
		"""
		return exp(self._log_score(xs, c))


	def predict_one(self, xs):
		"""Return the normalised weighted score of every class for one sample.

		The result is a list of length k. All entries are 0 when no class has
		a positive score.
		"""
		xs = set(xs)
		scores = [self._log_score(xs, c) for c in range(self.k)]
		top = max(scores, default=float('-inf'))
		if top == float('-inf'):
			return [0] * self.k
		# Subtracting the maximum before exp() keeps the largest term at 1.0,
		# so the normalisation cannot underflow to 0 / 0.
		weights = [exp(s - top) for s in scores]
		total = sum(weights)
		return [w / total for w in weights]


	def predict(self, xs_test):
		"""Return, for every sample, the index of the class with the highest score."""
		return [probs.index(max(probs)) for probs in map(self.predict_one, xs_test)]

In [10]:
def ngrams(text, n):
    """Return the space-joined windows of n consecutive items of text.

    Items are converted with str() before joining. The result is empty when
    text has fewer than n items.
    """
    last_start = len(text) - n
    grams = []
    for start in range(last_start + 1):
        window = text[start:start + n]
        grams.append(' '.join(str(item) for item in window))
    return grams

ngrams(['1', '2', '3', '4'], 2)

['1 2', '2 3', '3 4']

In [11]:
# x collects one token list per message, y the matching label (1 = spam, 0 = legit).
x, y = [], []

for part in range(1, 10 + 1):
    part_dir = 'part' + str(part)
    # listdir() returns entries in arbitrary order; sorting keeps the sample
    # order, and therefore the cross-validation folds, reproducible.
    for file_name in sorted(listdir(part_dir)):
        # A message is labelled spam when its file name contains 'spmsg'.
        y.append(1 if 'spmsg' in file_name else 0)
        with open(part_dir + '/' + file_name) as f:
            # First line: drop the leading token and keep the rest as the subject.
            subject = f.readline().split()[1:]
            # Second line is skipped; the third line holds the body tokens.
            f.readline()
            text = f.readline().split()
        x.append(subject + text)

In [12]:
def accuracy_wo_fp(true, predicted):
    """Accuracy that collapses to 0 as soon as a true 0 is predicted as anything else.

    Both sequences must have the same length. Returns the fraction of
    positions where the prediction equals the true label, or 0 if any sample
    whose true label is 0 was mispredicted.
    """
    total = len(true)
    assert total == len(predicted)
    hits = 0
    for label, guess in zip(true, predicted):
        if label == guess:
            hits += 1
        elif label == 0:
            # A true 0 predicted as something else vetoes the whole score.
            return 0
    return hits / total

# Wrap accuracy_wo_fp as a scorer object (passed as `scoring=` to GridSearchCV
# further down); greater_is_better=True marks higher values as better.
accuracy_wo_fp_score = make_scorer(accuracy_wo_fp, greater_is_better=True)

In [13]:
%%time

# Hyper-parameter grid for Bayes:
#   alpha   - smoothing term, from 1 down to 0 (no smoothing); each value listed once.
#   lambdas - (weight of class 0, weight of class 1); class 1 is progressively
#             down-weighted so that predicting it requires stronger evidence.
param_grid = {'alpha':   [1, 1e-1, 1e-2, 1e-3, 1e-4, 0]
             ,'lambdas': [(1, l) for l in [1e-3, 1e-4, 1e-5, 1e-6]]}

# clf = GridSearchCV(Bayes(), param_grid, cv=3, n_jobs=-1, scoring=accuracy_wo_fp_score)
# clf.fit(x, y)
# clf.best_score_, clf.best_params_

Wall time: 0 ns


In [7]:
%%time

# Bigram features: build the 2-grams from each message `m`, not from the whole corpus `x`.
x2 = [ngrams(m, 2) for m in x]

clf = GridSearchCV(Bayes(), param_grid, cv=3, n_jobs=-1, scoring=accuracy_wo_fp_score)
clf.fit(x2, y)
clf.best_score_, clf.best_params_

NameError: name 'param_grid' is not defined