In [None]:
import json
import math
import re
from nltk.tokenize import word_tokenize


In [None]:

class TermObject:
    """A single term with its occurrence count and derived weighting values.

    An instance starts either from a bare term (every weight is zero) or
    from a stored record, i.e. a mapping whose lowercase keys hold the
    previously computed values.
    """

    # (attribute name, record key) pairs a stored record must contain;
    # a missing key raises KeyError while loading.
    _REQUIRED_FIELDS = (
        ('term', 'term'),
        ('count', 'count'),
        ('countByContents', 'countbycontents'),
        ('tf', 'tf'),
        ('tfLog', 'tflog'),
        ('idf', 'idf'),
        ('pIdf', 'pidf'),
        ('tfidf', 'tfidf'),
        ('tfidfWithLog', 'tfidfwithlog'),
        ('RSJweight', 'rsjweight'),
        ('nCosineWithTF', 'ncosinewithtf'),
        ('nCosineWithTFLog', 'ncosinewithtflog'),
        ('nMaximumTF', 'nmaximumtf'),
        ('smart', 'smart'),
        ('inquery', 'inquery'),
        ('pCosine', 'pcosine'),
        ('pByte', 'pbyte'),
        ('pUnique', 'punique'),
    )

    # (attribute name, record key) pairs that keep their zero default when
    # the record does not contain the key.
    _OPTIONAL_FIELDS = (
        ('aidf', 'aidf'),
        ('BM25', 'bm25'),
        ('nBM25', 'nbm25'),
        ('qBM25', 'qbm25'),
        ('TREC8', 'trec8'),
        ('WBR99', 'wbr99'),
    )

    def create(self, term=None, count=None, json=None):
        """Build a new TermObject from a term, a term with a count, or a record.

        Python keeps only the last of several same-named methods, so the
        three call shapes are handled by one function:

        * ``create(owner, term)``        -- count defaults to 1
        * ``create(owner, term, count)``
        * ``create(owner, record)`` or ``create(owner, json=record)``

        Args:
            self: Ignored. Kept so existing calls of the form
                ``TermObject.create(owner, ...)`` continue to work.
            term: The term text, or a stored record passed positionally.
            count: Occurrence count; ``None`` means 1.
            json: A stored record (mapping with lowercase keys).

        Returns:
            A newly constructed TermObject.
        """
        # A lone non-string positional value is a stored record, not a term.
        positionalRecord = (
            json is None
            and count is None
            and term is not None
            and not isinstance(term, str)
        )
        if positionalRecord:
            json, term = term, None
        return TermObject(term=term, count=count, json=json)

    def __init__(self, *args, **kwargs):
        """Initialise from keyword arguments.

        Keyword Args:
            term: The term text. Required unless ``json`` is given.
            count: Occurrence count; missing or ``None`` means 1.
            json: Stored record to load every value from. When given,
                ``term`` and ``count`` are taken from the record instead.

        Positional arguments are accepted and ignored.

        Raises:
            KeyError: If neither ``json`` nor ``term`` is supplied, or the
                record lacks one of the required keys.
        """
        self._zeroWeights()

        # Read the record from the keyword argument; the bare name ``json``
        # in this scope would be the imported module.
        record = kwargs.get('json')
        if record is None:
            count = kwargs.get('count')
            self.term = kwargs['term']
            self.count = 1 if count is None else count
            return

        for attribute, key in self._REQUIRED_FIELDS:
            setattr(self, attribute, record[key])
        for attribute, key in self._OPTIONAL_FIELDS:
            if key in record:
                setattr(self, attribute, record[key])

    def _zeroWeights(self):
        """Set every frequency and weight attribute to its zero default."""
        self.tf = 0
        self.idf = 0.0
        self.tfidfWithLog = 0.0
        self.pIdf = 0.0
        self.tfidf = 0.0
        self.tfLog = 0.0
        self.countByContents = 0
        self.RSJweight = 0.0
        self.nMaximumTF = 0.0
        self.smart = 0.0
        self.inquery = 0.0
        self.nCosineWithTF = 0.0
        self.nCosineWithTFLog = 0.0
        self.pCosine = 0.0
        self.pByte = 0.0
        self.pUnique = 0.0
        self.aidf = 0.0
        self.BM25 = 0.0
        self.nBM25 = 0.0
        self.qBM25 = 0.0
        self.TREC8 = 0.0
        self.WBR99 = 0.0

    def toString(self):
        """Return ``'[<term> - <tf>]'``; ``tf`` is numeric, so it is formatted."""
        return '[{} - {}]'.format(self.term, self.tf)

In [None]:
class TermManager:
    """Aggregates term counts over a list of contents and derives weights.

    The manager owns a list of term objects (one per distinct term) and the
    list of content strings they were counted in. ``calculate`` fills in
    the per-term weights and the collection-wide totals and averages.
    """

    def initializeByRow(self, row):
        """Overwrite the collection-wide statistics with values from ``row``.

        Args:
            row: Mapping with lowercase keys. ``termobjectsize`` and
                ``contentssize`` are optional and default to 0; every other
                key read below must be present.
        """
        self.totalTerms = row['totalterms']
        self.avgTerms = row['avgterms']
        self.totalBytes = row['totalbytes']
        self.avgBytes = row['avgbytes']
        self.totalTF = row['totaltf']
        self.maxTF = row['maxtf']
        self.avgTF = row['avgtf']
        self.totalCosine = row['totalcosine']
        self.avgCosine = row['avgcosine']

        self.termObjectSize = 0
        self.contentsSize = 0

        if 'termobjectsize' in row:
            self.termObjectSize = row['termobjectsize']
        if 'contentssize' in row:
            self.contentsSize = row['contentssize']

    def create(self, date,  type,  contentsList):
        """Return a new TermManager for ``contentsList`` with no terms yet."""
        return TermManager(date, type, contentsList, [])

    def __init__(self, date, type, contentsList, termObjectList):
        """Store the inputs and measure the contents.

        Args:
            date: Stored as-is on the instance.
            type: Stored as-is on the instance.
            contentsList: List of text strings; their UTF-8 encoded lengths
                are summed into ``totalBytes``.
            termObjectList: List of term objects; it is kept by reference
                and extended in place by ``addTermObject``.
        """
        self.date = date
        self.type = type
        self.contentsList = contentsList
        self.termObjectList = termObjectList

        self.totalTerms = 0
        self.avgTerms = 0.0
        self.totalTF = 0
        self.maxTF = 0
        self.avgTF = 0.0
        self.totalCosine = 0
        # Defined up front so they can be read before calculate() has run.
        self.avgCosine = 0.0
        self.avgBytes = 0.0
        self.termObjectSize = 0

        self.totalBytes = sum(
            len(content.encode('utf8')) for content in contentsList
        )
        self.contentsSize = len(contentsList)

    def addTermObject(self, json=None, termList=None):
        """Add one stored term record, or count every term of ``termList``.

        Args:
            json: Stored record for a single term; used only when
                ``termList`` is ``None``.
            termList: Sequence of term occurrences. Each occurrence of an
                already known term increments that term's count; each new
                term is appended with the count ``TermObject.create``
                assigns.
        """
        if termList is None:
            self.termObjectList.append(TermObject.create(self, json))
            return

        self.totalTerms += len(termList)

        # Index rebuilt per call (the list is shared with the caller) so each
        # occurrence costs one dict lookup; setdefault keeps the first object
        # for a term, which is the one a linear scan would find.
        known = {}
        for existing in self.termObjectList:
            known.setdefault(existing.term, existing)

        for term in termList:
            existing = known.get(term)
            if existing is not None:
                existing.count += 1
                continue
            created = TermObject.create(self=self, term=term)
            self.termObjectList.append(created)
            known[term] = created

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator``, or 0.0 for a zero denominator."""
        if denominator == 0:
            return 0.0
        return numerator / denominator

    def calculate(self):
        """Compute per-term weights and the collection totals and averages.

        The running totals are reset first, so repeated calls give the same
        result. With no contents the per-term passes are skipped and every
        average is 0.0 instead of raising ZeroDivisionError.
        """
        self.termObjectSize = len(self.termObjectList)

        # Start from zero so a second call, or a call after
        # initializeByRow, does not add to earlier totals.
        self.totalTF = 0
        self.maxTF = 0
        self.totalCosine = 0

        hasContents = self.contentsSize > 0

        if hasContents:
            for obj in self.termObjectList:
                self._computeTermStatistics(obj)

        self.avgTerms = self._ratio(self.totalTerms, self.contentsSize)
        self.avgTF = self._ratio(self.totalTF, self.termObjectSize)
        self.avgCosine = self._ratio(self.totalCosine, self.termObjectSize)
        self.avgBytes = self._ratio(self.totalBytes, self.contentsSize)

        if hasContents:
            for obj in self.termObjectList:
                self._computeRankingWeights(obj)

    def _computeTermStatistics(self, obj):
        """First pass: frequency values for ``obj`` plus the running totals."""
        tf = obj.count / self.contentsSize
        obj.tf = tf

        self.totalTF += tf
        if self.maxTF < tf:
            self.maxTF = tf

        # math.log raises ValueError for non-positive input.
        obj.tfLog = (1 + math.log(tf)) if tf > 0 else 0.0

        # NOTE(review): ``term in content`` is a substring test on the raw
        # content string, so a short term also matches inside longer words
        # -- confirm this is the intended document frequency.
        term = obj.term
        docFreq = sum(1 for content in self.contentsList if term in content)
        obj.countByContents = docFreq

        # Skip terms that have no usable frequency (a zero document frequency
        # would divide by zero) or that occur in more than half the contents.
        if tf <= 0 or docFreq == 0 or docFreq * 2 > self.contentsSize:
            return

        inverseShare = self.contentsSize / docFreq
        obj.idf = math.log(inverseShare)
        obj.aidf = math.log(inverseShare + 1.0)

        obj.tfidf = obj.tf * obj.idf
        obj.tfidfWithLog = obj.tfLog * obj.idf

        obj.RSJweight = math.log((self.contentsSize - docFreq + 0.5) / (docFreq + 0.5))

        obj.pIdf = math.log((self.contentsSize - docFreq + 1.0) / docFreq)

        obj.inquery = math.log((self.contentsSize + 0.5) / docFreq) / math.log(self.contentsSize + 1.0)

        obj.nCosineWithTF = 1.0 / math.sqrt(obj.tfidf * obj.tfidf)

        cosine = 1.0 / math.sqrt(obj.tfidfWithLog * obj.tfidfWithLog)
        obj.nCosineWithTFLog = cosine
        self.totalCosine += cosine

    def _computeRankingWeights(self, obj):
        """Second pass: weights that depend on the collection averages."""
        cosineSlope = 0.7
        byteSlope = 0.3
        uniqueSlope = 0.25

        # The cut-off is ``>=`` here, stricter than the first pass: at
        # exactly half the contents RSJweight is log(1) == 0 and the
        # math.log(t8) below would fail. Terms the first pass could not
        # weight are skipped too, since their nCosineWithTFLog is still 0.
        if (obj.tf <= 0
                or obj.countByContents == 0
                or obj.countByContents * 2 >= self.contentsSize):
            obj.WBR99 = 0.0
            return

        # NOTE(review): the textbook maximum-tf normalisation is
        # 0.5 + 0.5 * tf / maxTF; this adds 0.5 to tf instead of
        # multiplying -- confirm before changing, results depend on it.
        obj.nMaximumTF = 0.5 + (0.5 + obj.tf) / self.maxTF

        smart = obj.tfLog / (1.0 + math.log(self.avgTF))
        obj.smart = smart

        obj.pCosine = smart / ((1 - cosineSlope) + cosineSlope * self.avgCosine / obj.nCosineWithTFLog)

        obj.pByte = smart / ((1 - byteSlope) * self.avgBytes + byteSlope * self.totalBytes)

        obj.pUnique = smart / (((1 - uniqueSlope) * self.avgTerms) + (uniqueSlope * self.termObjectSize))

        # NOTE(review): as parenthesised, ``+ obj.tf`` is added to the
        # quotient rather than to the denominator, and totalBytes appears
        # inside the length term -- kept verbatim; confirm against the
        # intended BM25 form.
        obj.BM25 = (2.2 * obj.tf / ((((0.25) + 0.75 * self.totalBytes / self.avgBytes) + self.totalBytes) * 1.2) + obj.tf)

        obj.nBM25 = 1.0 / ((1.2 * (0.25) + 0.75 *  self.totalBytes / self.avgBytes) + obj.tf)

        obj.qBM25 = (1000 + 1) * obj.tf / (1000 + obj.tf)

        # Short aliases used by the two combined formulas below.
        t1 = obj.tf
        t2 = obj.tfLog
        t3 = obj.nMaximumTF
        t4 = obj.smart
        t5 = obj.BM25
        t6 = obj.idf
        t7 = obj.aidf
        t8 = obj.RSJweight
        t10 = obj.pIdf
        t11 = obj.inquery
        t12 = obj.nCosineWithTF
        t16 = obj.pByte
        t18 = obj.nBM25
        t19 = obj.qBM25

        tREC8 = (((math.log(t8)) * (t5 + t7))
            * ((((t19 + t5) * (t7 + t6)) + ((t6 + t2) * (t16 * t18))) + (t7 / t19)))

        wBR99 = (((99.09 + t11) + (((t7 * t10) * (t5 * (((t7 * t10) + (t8 + t10)) * (t12 * t1))))
            + ((t7 * t10) * (t5 * (((t2 * t4) + (t8 + t10)) * (t12 * t1))))))
            + ((t12 * t1) + ((t7 * t10) * (t5 * (((t8 / t3) + (t8 + t10)) * (t12 * t1))))))

        obj.TREC8 = tREC8
        obj.WBR99 = wBR99