In [1]:
import json
from sklearn.externals import joblib
import os
import re
import numpy as np

In [10]:
# --- Input locations (relative to the notebook's working directory) ---
docsUrlsPath = '../data/urls.docs.txt'
textdataPath = '../data/textdata'
templateJson = '{:d}.json'
queriesDocsPath = '../data/queries.docs.txt'
queriesPath = '../data/queries.numerate.txt'

# Page numbers that have an entry in textdataPath: the first run of digits found
# in each entry name. Entries without any digit (e.g. hidden/system files) are
# skipped instead of crashing on a failed regex match.
pat = re.compile(r'\d+')
numberMatches = (pat.search(entryName) for entryName in os.listdir(textdataPath))
procNumbs = {int(found.group(0)) for found in numberMatches if found is not None}

# Query number (kept as a string) -> query text, one tab-separated pair per line.
queriesDict = {}
with open(queriesPath) as inputFile:
    for line in inputFile:
        line = line.strip()
        if not line:
            # Tolerate blank lines rather than failing on tuple unpacking.
            continue
        # Split on the first tab only so the query text itself is never cut.
        number, query = line.split('\t', 1)
        queriesDict[number] = query

# Parsed JSON; it is indexed by query number in the scoring cell.
# Presumably maps each query number to a list of document numbers -- TODO confirm.
with open(queriesDocsPath) as inputFile:
    queriesDocsDict = json.load(inputFile)

# Document number (kept as a string) -> (url, path), one tab-separated triple per line.
docsUrlsDict = {}
with open(docsUrlsPath) as inputFile:
    for line in inputFile:
        line = line.strip()
        if not line:
            continue
        number, url, path = line.split('\t')
        docsUrlsDict[number] = (url, path)

In [3]:
class TfIdfBank:
    """Look up TF-IDF weights by (word, page number).

    The bank wraps a matrix indexed as ``matrix[row, column]`` and two JSON
    files read at construction time:

    * ``tfIdfNumberPagesPath`` -- a list of page numbers; the position of a
      page number in that list is used as its row index.
    * ``tfIdfFeaturesPath`` -- a list of feature words; the position of a word
      in that list is used as its column index.

    NOTE(review): the caller must pass a matrix whose rows are in the same
    order as the page-number file -- nothing here can verify that.
    """
    tfIdfNumberPagesPath = '../data/tfIdfNumberPages'
    tfIdfFeaturesPath = '../data/tfidfFeatutes'
    # Minimum length of the page-number -> row lookup table. The table grows
    # automatically when the index file contains a larger page number.
    minIndexSize = 27000

    def __init__(self, tfIdfMatrix):
        """Build the page and feature lookups for ``tfIdfMatrix``.

        Args:
            tfIdfMatrix: 2-D object supporting ``tfIdfMatrix[row, column]``.

        Raises:
            OSError: if either index file cannot be opened.
        """
        with open(TfIdfBank.tfIdfNumberPagesPath) as pagesFile:
            numberPages = np.asarray(json.load(pagesFile), dtype=int)
        with open(TfIdfBank.tfIdfFeaturesPath) as featuresFile:
            features = json.load(featuresFile)

        # Size the table so that every listed page number fits.
        indexSize = TfIdfBank.minIndexSize
        if numberPages.size:
            indexSize = max(indexSize, int(numberPages.max()) + 1)

        # -1 marks "page number has no row"; builtin int replaces the removed np.int alias.
        self._indexPages = np.full(indexSize, -1, dtype=int)
        self._indexPages[numberPages] = np.arange(len(numberPages))
        self._tfIdfMatrix = tfIdfMatrix
        self._indexFeatures = {feature: column for column, feature in enumerate(features)}

    def tfidf(self, word, numberPage):
        """Return the matrix entry for ``word`` on page ``numberPage``.

        Args:
            word: the term to look up; it is stripped and lower-cased first.
            numberPage: integer page number.

        Returns:
            ``None`` when the page has no row in the bank (including page
            numbers outside the lookup table), ``0`` when the word is not a
            known feature, otherwise ``tfIdfMatrix[row, column]``.
        """
        # Explicit range check: a negative number would otherwise wrap around
        # and a too-large one would raise IndexError.
        if not 0 <= numberPage < len(self._indexPages):
            return None
        indexPage = self._indexPages[numberPage]
        if indexPage == -1:
            return None
        indexFeature = self._indexFeatures.get(word.strip().lower())
        if indexFeature is None:
            return 0
        return self._tfIdfMatrix[indexPage, indexFeature]

In [12]:
# Collect three metadata fields from every processed page's JSON file.
# The three lists are parallel: position k in each belongs to the k-th page
# number yielded by iterating procNumbs.
titles = []
keywords = []
descriptions = []
for numb in procNumbs:
    filename = templateJson.format(numb)
    path = os.path.join(textdataPath, filename)
    # Close each file right away: this loop opens one file per page.
    with open(path) as pageFile:
        pageDict = json.load(pageFile)
    titles.append(pageDict["title"])
    keywords.append(pageDict["keywords"])
    descriptions.append(pageDict["description"])

In [14]:
# Cache the collected metadata lists and the page numbers they belong to.
# procNumbs is a set, which json cannot serialise, so it is written as a list;
# list(procNumbs) yields the same order as a `for numb in procNumbs` loop as
# long as the set has not been modified in between.
subdataOutputs = [
    ('../data/subdata/titles', titles),
    ('../data/subdata/keywords', keywords),
    ('../data/subdata/description', descriptions),
    ('../data/subdata/procNumbs', list(procNumbs)),
]
for subdataPath, payload in subdataOutputs:
    # `with` guarantees the file is flushed and closed even if dumping fails.
    with open(subdataPath, 'w') as outputFile:
        json.dump(payload, outputFile)

In [4]:
# Reload the cached metadata lists, closing each file as soon as it is parsed.
with open('../data/subdata/titles') as inputFile:
    titles = json.load(inputFile)
with open('../data/subdata/keywords') as inputFile:
    keywords = json.load(inputFile)
with open('../data/subdata/description') as inputFile:
    descriptions = json.load(inputFile)

In [12]:
# Location of the serialized vectorizer on disk.
tfIdfVectorizerPath = '../data/models/tfIdfVectorizer.pkl'
# Deserialize the vectorizer; its .transform() is later applied to lists of texts.
# Presumably a fitted scikit-learn TF-IDF vectorizer -- TODO confirm.
# NOTE(review): joblib.load unpickles the file, so only load files you trust.
# NOTE(review): `sklearn.externals.joblib` (imported at the top) is gone from
# recent scikit-learn releases -- confirm the pinned version.
tfIdfVect = joblib.load(tfIdfVectorizerPath)

In [13]:
# One result dict per metadata field, in the order: titles, keywords, descriptions.
# Each dict maps a query number to a list aligned with queriesDocsDict[query number]:
# the summed TF-IDF of the query's words for that document, or None when the
# bank has no row for the document.
# NOTE(review): TfIdfBank resolves page numbers through the order stored in its
# page-number file, while the matrix rows follow the order of the text lists
# given to transform() -- presumably both orders agree; confirm.
fieldTexts = (titles, keywords, descriptions)
tfidfDicts = [{}, {}, {}]
for fieldScores, fieldData in zip(tfidfDicts, fieldTexts):
    bank = TfIdfBank(tfIdfVect.transform(fieldData))
    for numb, query in queriesDict.items():
        numbsDocs = queriesDocsDict[numb]
        words = query.strip().split()
        tfidfs = []
        for numbDoc in numbsDocs:
            if bank._indexPages[numbDoc] != -1:
                # Page is indexed: add up the weight of every query word.
                tfidfs.append(sum(bank.tfidf(word, numbDoc) for word in words))
            else:
                tfidfs.append(None)
        fieldScores[numb] = tfidfs

In [15]:
# Output files for the per-field TF-IDF features.
titlePath = '../data/features/tfidfTitle'
keywordsPath = '../data/features/tfidfKeywords'
descPath = '../data/features/tfidfDesc'

# tfidfDicts holds one dict per field, in the order titles, keywords, descriptions.
# `with` guarantees each file is flushed and closed even if dumping fails.
for featurePath, fieldScores in zip((titlePath, keywordsPath, descPath), tfidfDicts):
    with open(featurePath, 'w') as outputFile:
        json.dump(fieldScores, outputFile)