In [23]:
import numpy as np

def textParser(text):
    """Split ``text`` into lowercase alphabetic tokens.

    Every character that is not an ASCII letter acts as a separator; empty
    fragments produced by adjacent separators are discarded.

    Args:
        text: the raw string to tokenise.

    Returns:
        A list of lowercase words in the order they appear in ``text``.
    """
    import re
    fragments = re.split(r'[^a-zA-Z]|\d', text)
    return [fragment.lower() for fragment in fragments if fragment]

def loadSMSData(fileName):
    """Load a labelled corpus stored as one ``label<TAB>text`` record per line.

    The label ``'lie'`` is encoded as 0 and ``'true'`` as 1.  Lines that are
    blank, have no text field, or carry any other label are skipped, so the
    two returned lists always have equal length and stay index-aligned.

    Args:
        fileName: path of the corpus file.

    Returns:
        A tuple ``(smsWords, classCategory)``: ``smsWords`` is a list of
        token lists (one per accepted line, produced by ``textParser``) and
        ``classCategory`` is the matching list of 0/1 labels.
    """
    labelCodes = {'lie': 0, 'true': 1}
    classCategory = []
    smsWords = []
    # The context manager closes the file even if parsing fails midway.
    with open(fileName) as f:
        for line in f:
            linedatas = line.strip().split('\t')
            # Appending words without a label (or the reverse) would shift
            # every later document against its category, so skip bad rows.
            if len(linedatas) < 2 or linedatas[0] not in labelCodes:
                continue
            classCategory.append(labelCodes[linedatas[0]])
            smsWords.append(textParser(linedatas[1]))
    return smsWords, classCategory


def createVocabularyList(smsWords):
    """Collect every distinct word that occurs in the tokenised messages.

    Args:
        smsWords: iterable of word lists, one list per message.

    Returns:
        A list holding each distinct word exactly once.  The order is the
        iteration order of the underlying set, i.e. not sorted.
    """
    distinctWords = set()
    for tokens in smsWords:
        distinctWords.update(tokens)
    return list(distinctWords)

def getVocabularyList(fileName):
    """Read a vocabulary stored as tab-separated words on the first line.

    Only the first line of the file is read.  Surrounding whitespace
    (including a trailing tab) is stripped before splitting.

    Args:
        fileName: path of the vocabulary file.

    Returns:
        The list of words on the first line, in file order.
    """
    # ``with`` guarantees the handle is released even if reading raises.
    with open(fileName) as fr:
        return fr.readline().strip().split('\t')

def setOfWordsToVecTor(vocabularyList, smsWords):
    """Count how often each vocabulary word occurs in one message.

    Args:
        vocabularyList: list of known words; position ``i`` of the result
            corresponds to ``vocabularyList[i]``.
        smsWords: the tokens of a single message.

    Returns:
        A list of ints, as long as ``vocabularyList``, holding the number of
        occurrences of each vocabulary word in ``smsWords``.  Tokens that are
        not in the vocabulary are ignored.
    """
    # Build the word -> position lookup once so each token costs O(1) rather
    # than a linear scan.  setdefault keeps the first position of a repeated
    # word, which is what list.index() would return.
    wordPosition = {}
    for position, word in enumerate(vocabularyList):
        wordPosition.setdefault(word, position)

    vocabMarked = [0] * len(vocabularyList)
    for smsWord in smsWords:
        position = wordPosition.get(smsWord)
        if position is not None:
            vocabMarked[position] += 1
    return vocabMarked

def setOfWordsListToVecTor(vocabularyList, smsWordsList):
    """Vectorise a batch of tokenised messages against one vocabulary.

    Args:
        vocabularyList: list of known words.
        smsWordsList: sequence of token lists, one per message.

    Returns:
        A list with one count vector (see ``setOfWordsToVecTor``) per
        message, in the same order as ``smsWordsList``.
    """
    return [setOfWordsToVecTor(vocabularyList, tokens) for tokens in smsWordsList]

def trainingNaiveBayes(trainMarkedWords, trainCategory):
    """Fit a naive Bayes model on word-count vectors with add-one smoothing.

    Labels follow the encoding used by ``loadSMSData``: 1 marks a 'true'
    document; any other value is counted as 'lie' (0).

    Args:
        trainMarkedWords: 2-D array-like, one word-count vector per document.
        trainCategory: sequence with one label per document.

    Returns:
        A tuple ``(pWordsTrue, pWordslie, pLie)`` where ``pWordsTrue`` and
        ``pWordslie`` are arrays of per-word log-probabilities for the 'true'
        and 'lie' classes, and ``pLie`` is the fraction of training documents
        that are not labelled 1.

    Raises:
        ValueError: if there are no documents or the number of labels does
            not match the number of documents.
    """
    docs = np.asarray(trainMarkedWords, dtype=float)
    labels = np.asarray(trainCategory)
    numTrainDoc = len(docs)
    if numTrainDoc == 0:
        raise ValueError('trainMarkedWords must contain at least one document')
    if len(labels) != numTrainDoc:
        raise ValueError('trainCategory must hold exactly one label per document')

    isTrue = labels == 1
    isLie = ~isTrue
    pLie = float(np.sum(isLie)) / float(numTrainDoc)

    # Add-one smoothing: every word count starts at 1 and each class total at
    # 2.0, so a word unseen in a class never yields log(0).
    wordsInTrueNum = 1.0 + docs[isTrue].sum(axis=0)
    wordsInlieNum = 1.0 + docs[isLie].sum(axis=0)
    TrueWordsNum = 2.0 + docs[isTrue].sum()
    lieWordsNum = 2.0 + docs[isLie].sum()

    pWordsTrue = np.log(wordsInTrueNum / TrueWordsNum)
    pWordslie = np.log(wordsInlieNum / lieWordsNum)
    return pWordsTrue, pWordslie, pLie

def getTrainedModelInfo(modelDir='C:/Users/hp 850/Desktop'):
    """Load a previously saved model from ``modelDir``.

    Four files are read from the directory: ``vocabularyList.txt``,
    ``pWordsTrue.txt``, ``pWordslie.txt`` and ``pLie.txt``.

    Args:
        modelDir: directory holding the model files.  Defaults to the
            location that was previously hard-coded.

    Returns:
        A tuple ``(vocabularyList, pWordsTrue, pWordslie, pLie)``: the
        vocabulary as a list of words, the two arrays loaded from the
        tab-delimited text files, and the float stored on the first line of
        ``pLie.txt``.
    """
    vocabularyList = getVocabularyList(modelDir + '/vocabularyList.txt')
    pWordsTrue = np.loadtxt(modelDir + '/pWordsTrue.txt', delimiter='\t')
    pWordslie = np.loadtxt(modelDir + '/pWordslie.txt', delimiter='\t')
    # ``with`` closes the file even when the float conversion fails.
    with open(modelDir + '/pLie.txt') as fr:
        pLie = float(fr.readline().strip())
    return vocabularyList, pWordsTrue, pWordslie, pLie

def classify(vocabularyList, pWordsTrue, pWordslie, pLie, testWords):
    """Classify one tokenised message as 'true' (1) or 'lie' (0).

    Args:
        vocabularyList: list of known words, aligned with the two
            log-probability arrays.
        pWordsTrue: per-word log-probabilities for the 'true' class.
        pWordslie: per-word log-probabilities for the 'lie' class.
        pLie: prior probability of the 'lie' class; the 'true' prior is
            taken as ``1 - pLie``.
        testWords: the tokens of the message to classify.

    Returns:
        1 when the 'true' log-score is strictly greater than the 'lie'
        log-score, otherwise 0.
    """
    testWordsCount = setOfWordsToVecTor(vocabularyList, testWords)
    testWordsMarkedArray = np.array(testWordsCount)
    # Log-space scores: sum of count * log P(word | class) plus the log prior.
    pTrueScore = np.sum(testWordsMarkedArray * pWordsTrue) + np.log(1 - pLie)
    pLieScore = np.sum(testWordsMarkedArray * pWordslie) + np.log(pLie)
    if pTrueScore > pLieScore:
        return 1
    return 0

training

In [3]:
!pip  install  bayes

Collecting bayes
  Downloading bayes-0.1.1.tar.gz (3.7 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: bayes
  Building wheel for bayes (setup.py): started
  Building wheel for bayes (setup.py): finished with status 'done'
  Created wheel for bayes: filename=bayes-0.1.1-py3-none-any.whl size=5532 sha256=8c86f08c1062c945fa1a90fb123479772b5c37a9c6b2be218b85e59d5dd0eb61
  Stored in directory: c:\users\hp 850\appdata\local\pip\cache\wheels\8b\e1\33\bd265d7768a8f78d2ca909f6a70c9bfb334623942e3709cf3b
Successfully built bayes
Installing collected packages: bayes
Successfully installed bayes-0.1.1


In [None]:
import numpy as np
import SimpleNavieBayes.NavieBayes as naiveBayes

# Train on the labelled corpus and write the four model files
# (pLie.txt, vocabularyList.txt, pWordslie.txt, pWordsTrue.txt) next to it.
dataDir = 'C:/Users/hp 850/Desktop'
filename = dataDir + '/traning.txt'
smsWords, classLables = naiveBayes.loadSMSData(filename)
vocabularyList = naiveBayes.createVocabularyList(smsWords)
print ("Generate corpus!")
trainMarkedWords = naiveBayes.setOfWordsListToVecTor(vocabularyList, smsWords)
print ("Data marking is complete!")
trainMarkedWords = np.array(trainMarkedWords)
print ("The data is converted into a matrix!")
# trainingNaiveBayes returns (pWordsTrue, pWordslie, pLie) in that order.
pWordsTrue, pWordslie, pLie = naiveBayes.trainingNaiveBayes(trainMarkedWords, classLables)
print ('pLie:', pLie)
with open(dataDir + '/pLie.txt', 'w') as fpLie:
    fpLie.write(str(pLie))
# The vocabulary goes on a single line, each word followed by a tab.
with open(dataDir + '/vocabularyList.txt', 'w') as fw:
    for word in vocabularyList:
        fw.write(word + '\t')
# Each array goes to its own file; vocabularyList.txt must not be overwritten.
np.savetxt(dataDir + '/pWordslie.txt', pWordslie, delimiter='\t')
np.savetxt(dataDir + '/pWordsTrue.txt', pWordsTrue, delimiter='\t')

testing

In [None]:
import SimpleNavieBayes.NavieBayes as naiveBayes
import random
import numpy as np
def simpleTest(filename='C:/Users/hp 850/Desktop/testing.txt'):
    """Classify the first message of a test file with the saved model.

    Loads the persisted model through ``naiveBayes.getTrainedModelInfo()``,
    reads ``filename`` and prints the predicted class of its first message.

    Args:
        filename: path of the labelled test file.  Defaults to the location
            that was previously hard-coded.

    Raises:
        ValueError: if the file yields no messages.
    """
    # Unpack in the order getTrainedModelInfo returns its values:
    # (vocabularyList, pWordsTrue, pWordslie, pLie).
    vocabularyList, pWordsTrue, pWordslie, pLie = \
        naiveBayes.getTrainedModelInfo()
    smsWords, classLables = naiveBayes.loadSMSData(filename)
    if not smsWords:
        raise ValueError('no messages could be loaded from ' + filename)
    smsType = naiveBayes.classify(vocabularyList, pWordsTrue, pWordslie, pLie, smsWords[0])
    print (smsType)

def testClassifyErrorRate(testCount=1000, filename='C:/Users/hp 850/Desktop/traning.txt'):
    """Estimate the classifier's error rate with a random hold-out split.

    ``testCount`` messages are drawn at random (without replacement) as the
    test set; the model is trained on the remaining messages and each
    held-out message is classified.  Every prediction and the final error
    count and rate are printed.

    Args:
        testCount: number of messages to hold out.  It is reduced when
            necessary so that at least one message remains for training.
        filename: path of the labelled corpus.

    Raises:
        ValueError: if fewer than two messages are available or
            ``testCount`` is smaller than 1.
    """
    smsWords, classLables = naiveBayes.loadSMSData(filename)
    numDocs = len(smsWords)
    if numDocs < 2:
        raise ValueError('need at least two labelled messages to hold some out')
    if testCount < 1:
        raise ValueError('testCount must be at least 1')
    # Never hold out the whole corpus: training needs at least one document.
    testCount = min(testCount, numDocs - 1)

    # random.sample picks distinct indices and can never go out of range.
    heldOut = set(random.sample(range(numDocs), testCount))
    testWords = [smsWords[i] for i in range(numDocs) if i in heldOut]
    testWordsType = [classLables[i] for i in range(numDocs) if i in heldOut]
    trainWords = [smsWords[i] for i in range(numDocs) if i not in heldOut]
    trainLables = [classLables[i] for i in range(numDocs) if i not in heldOut]

    vocabularyList = naiveBayes.createVocabularyList(trainWords)
    print ("Generate corpus!")
    trainMarkedWords = naiveBayes.setOfWordsListToVecTor(vocabularyList, trainWords)
    print ("Data marking is complete!")
    trainMarkedWords = np.array(trainMarkedWords)
    print ("The data is converted into a matrix!")
    pWordsTrue, pWordslie, pLie = naiveBayes.trainingNaiveBayes(trainMarkedWords, trainLables)

    errorCount = 0.0
    for i in range(testCount):
        smsType = naiveBayes.classify(vocabularyList, pWordsTrue, pWordslie, pLie, testWords[i])
        print ('Forecast category:', smsType, 'Actual category:', testWordsType[i])
        if smsType != testWordsType[i]:
            errorCount += 1
    print ('Number of errors:', errorCount, 'Error rate:', errorCount / testCount)

# Run the hold-out evaluation when this code is executed as the main module.
if __name__ == '__main__':
    testClassifyErrorRate()