In [2]:
import re
import math

In [3]:
def getwords(doc):
    """Split a document into its distinct lowercase words.

    Args:
        doc: the text to tokenize.

    Returns:
        A dict mapping each distinct word (lowercased, longer than 2 and
        shorter than 20 characters) to 1. Each word appears once as a key,
        however many times it occurs in ``doc``.
    """
    # Use ``\W+`` rather than ``\W*``: a pattern that can match the empty
    # string makes re.split (Python 3.7+) cut between every character, which
    # leaves no token long enough to pass the length filter below.
    splitter = re.compile(r'\W+')
    tokens = (piece.lower() for piece in splitter.split(doc) if 2 < len(piece) < 20)
    return {token: 1 for token in tokens}

In [4]:
class Classifier(object):
    """Base classifier that accumulates feature/category counts from training.

    The class itself does not define ``prob(item, cat)``; ``classify`` calls
    it, so a subclass has to provide that method before ``classify`` is used.
    """

    def __init__(self, getfeatures, filename=None):
        """Create an untrained classifier.

        Args:
            getfeatures: callable that maps an item to an iterable of
                features (for example a dict whose keys are the features).
            filename: accepted for interface compatibility; not used here.
        """
        # feature -> {category -> number of times the pair was trained}
        self.fc = {}
        # category -> number of items trained with that category
        self.cc = {}
        self.getfeatures = getfeatures

        # category -> threshold consulted by classify()
        self.thresholds = {}

    def incf(self, f, cat):
        """Increase the count of the feature/category pair by one."""
        per_category = self.fc.setdefault(f, {})
        per_category[cat] = per_category.get(cat, 0) + 1

    def incc(self, cat):
        """Increase the item count of a category by one."""
        self.cc[cat] = self.cc.get(cat, 0) + 1

    def fcount(self, f, cat):
        """Return how often feature ``f`` was trained under ``cat`` (float)."""
        if f in self.fc and cat in self.fc[f]:
            return float(self.fc[f][cat])
        return 0.0

    def catcount(self, cat):
        """Return the number of items trained under ``cat`` (float)."""
        if cat in self.cc:
            return float(self.cc[cat])
        return 0.0

    def totalcount(self):
        """Return the total number of trained items across all categories."""
        return sum(self.cc.values())

    def categories(self):
        """Return the known categories (the keys of the category counts)."""
        return self.cc.keys()

    def train(self, item, cat):
        """Record one training item under category ``cat``."""
        features = self.getfeatures(item)
        # Count every extracted feature once for this category.
        for f in features:
            self.incf(f, cat)
        # Count the item itself for this category.
        self.incc(cat)

    def fprob(self, f, cat):
        """Return fcount(f, cat) / catcount(cat), or 0.0 for an unseen category."""
        items_in_cat = self.catcount(cat)
        if items_in_cat == 0:
            return 0.0
        return self.fcount(f, cat) / items_in_cat

    def weightedprob(self, f, cat, prf, weight=1.0, ap=0.5):
        """Blend ``prf(f, cat)`` with an assumed probability.

        Args:
            f: the feature.
            cat: the category.
            prf: callable ``prf(f, cat)`` returning the basic probability.
            weight: weight given to the assumed probability.
            ap: the assumed probability used when there is little evidence.

        Returns:
            ``(weight * ap + totals * basicprob) / (weight + totals)`` where
            ``totals`` is the count of ``f`` summed over all categories.
        """
        basicprob = prf(f, cat)
        totals = sum([self.fcount(f, c) for c in self.categories()])
        bp = ((weight * ap) + (totals * basicprob)) / (weight + totals)
        return bp

    def setthreshold(self, cat, t):
        """Set the threshold of category ``cat`` to ``t``."""
        self.thresholds[cat] = t

    def getthreshold(self, cat):
        """Return the threshold of ``cat``; 1.0 when none has been set."""
        if cat not in self.thresholds:
            return 1.0
        return self.thresholds[cat]

    def classify(self, item, default=None):
        """Pick the most probable category for ``item``.

        Requires ``self.prob(item, cat)``, which this class does not define.

        Args:
            item: the item to classify.
            default: value returned in place of a category when no category
                can be chosen.

        Returns:
            A ``(category, probability)`` tuple. ``category`` is ``default``
            when no category scores above 0 (including the untrained case),
            or when some other category's probability multiplied by the
            winner's threshold exceeds the winner's probability.
        """
        probs = {}
        best = None
        best_prob = 0.0
        # Separate flag so that a category that is literally None still works.
        found = False
        for cat in self.categories():
            probs[cat] = self.prob(item, cat)
            if probs[cat] > best_prob:
                best_prob = probs[cat]
                best = cat
                found = True

        # Without this guard ``best`` would be unbound below whenever there
        # are no categories or every probability is 0.
        if not found:
            return default, best_prob

        threshold = self.getthreshold(best)
        for cat in probs:
            if cat == best:
                continue
            if probs[cat] * threshold > probs[best]:
                return default, best_prob

        return best, best_prob

In [5]:
# Base classifier that uses getwords as its feature extractor.
cl = Classifier(getwords)

In [6]:
# cl.train("the quick brown fox jumps over the lazy dog", "good")

In [7]:
# cl.train("make quick money in the online casino", "bad")

In [8]:
# cl.fcount("quick", "good")

In [9]:
# cl.fcount("quick", "bad")

In [10]:
def sampletrain(cl):
    """Train ``cl`` on a fixed toy corpus of five labelled sentences.

    Args:
        cl: any object exposing ``train(item, cat)``.
    """
    corpus = (
        ("Nobody owns the water.", "good"),
        ("the quick rabbit jumps fences", "good"),
        ("buy pharmaceuticals now", "bad"),
        ("make quick money at the online casino", "bad"),
        ("the quick brown fox jumps", "good"),
    )
    # Feed the samples in their listed order.
    for text, label in corpus:
        cl.train(text, label)

In [11]:
# Train cl on the five sample sentences.
sampletrain(cl)

In [12]:
# Count of "quick" under "good" divided by the number of "good" items.
cl.fprob("quick", "good")

0.6666666666666666

In [13]:
# Show the feature/category counts (fc) and the per-category item counts (cc).
cl.fc, cl.cc

({'brown': {'good': 1},
  'buy': {'bad': 1},
  'casino': {'bad': 1},
  'fences': {'good': 1},
  'fox': {'good': 1},
  'jumps': {'good': 2},
  'make': {'bad': 1},
  'money': {'bad': 1},
  'nobody': {'good': 1},
  'now': {'bad': 1},
  'online': {'bad': 1},
  'owns': {'good': 1},
  'pharmaceuticals': {'bad': 1},
  'quick': {'bad': 1, 'good': 2},
  'rabbit': {'good': 1},
  'the': {'bad': 1, 'good': 3},
  'water': {'good': 1}},
 {'bad': 2, 'good': 3})

In [14]:
# "money" was only trained under "bad", so its raw probability under "good" is 0.
cl.fprob("money", "good"), cl.fprob("money", "bad")

(0.0, 0.5)

In [15]:
# Blend the raw fprob estimate with the default assumed probability (ap=0.5).
cl.weightedprob("money", "good", cl.fprob)

0.25

In [16]:
# Train on the same samples again; every call to train() adds to the existing counts.
sampletrain(cl)

In [17]:
# Same query after the extra training: the larger counts outweigh the assumed probability more.
cl.weightedprob("money", "good", cl.fprob)

0.16666666666666666

In [18]:
############################################################

In [19]:
class Naivebayes(Classifier):
    """Classifier that scores a category by multiplying per-feature probabilities."""

    def docprob(self, item, cat):
        """Return the product of the weighted probabilities of all features.

        Each feature extracted from ``item`` contributes
        ``weightedprob(f, cat, fprob)``; an item with no features yields 1.
        """
        features = self.getfeatures(item)
        p = 1
        for f in features:
            p *= self.weightedprob(f, cat, self.fprob)
        return p

    def prob(self, item, cat):
        """Return docprob(item, cat) times the category's share of trained items.

        Returns 0.0 when nothing has been trained yet, instead of dividing by
        a zero total; this mirrors how fprob treats a category with no items.
        """
        total = self.totalcount()
        if total == 0:
            return 0.0
        catprob = self.catcount(cat) / total
        docprob = self.docprob(item, cat)
        return docprob * catprob

In [20]:
# Naive Bayes classifier that uses getwords as its feature extractor.
nb = Naivebayes(getwords)

In [21]:
# Train nb on the five sample sentences.
sampletrain(nb)

In [22]:
# Score for "good": product of the weighted word probabilities times the share of "good" items.
nb.prob("quick rabbit", "good")

0.15624999999999997

In [23]:
# Same text scored against "bad", for comparison with the "good" score.
nb.prob("quick rabbit", "bad")

0.05

In [24]:
# Pick the highest-scoring category; "unknown" is returned if it fails its threshold test.
nb.classify("quick rabbit", default="unknown")

('good', 0.15624999999999997)

In [25]:
# No thresholds have been set yet, so every category uses the default threshold of 1.0.
nb.classify("quick money", default="unknown")

('bad', 0.1)

In [26]:
# From now on "bad" is only returned if its score is at least 3x every other category's.
nb.setthreshold("bad", 3.0)

In [27]:
# The winner here is "good", whose threshold is still the default 1.0.
nb.classify("quick rabbit", default="unknown")

('good', 0.15624999999999997)

In [28]:
# "bad" still scores highest but no longer clears its 3.0 threshold, so the default is returned.
nb.classify("quick money", default="unknown")

('unknown', 0.1)

In [30]:
###########################################################

In [31]:
# 1. 写淘宝联盟数据抓取爬虫 抓 分类 + 商品名
# 2. 准备jieba分词处理商品名的切分
# 3. web展现并人工筛选一次降低错误率
# 4. 导入数据的标题输出分类，判断准确率
# 5. 数据的web可视化功能