In [1]:
import json
from sklearn.externals import joblib
import os
import re
import numpy as np
from random import randint
from pandas import DataFrame

In [2]:
# Root of the data directory; every other location below is derived from it.
dataPath = '../data'
tfIdfPagesPath = dataPath + '/features/tfidfPage'

# Tab-separated file: one "number<TAB>url<TAB>path" record per line.
docsUrlsPath = dataPath + '/urls.docs.txt'
# Directory whose entry names carry numeric ids.
textdataPath = dataPath + '/textdata'
# File-name template that is formatted with an integer.
templateJson = '{:d}.json'
samplePath = dataPath + '/sample.csv'

# JSON file that is indexed by query number after loading.
queriesDocsPath = dataPath + '/queries.docs.txt'
# Tab-separated file: one "number<TAB>query" record per line.
queriesPath = dataPath + '/queries.numerate.txt'

In [3]:
# Collect the first run of digits found in every entry name of the text-data
# directory. Entries without any digit (e.g. hidden/system files returned by
# os.listdir) are skipped instead of raising AttributeError on a failed match.
pat = re.compile(r'\d+')
procNumbs = set()
for entryName in os.listdir(textdataPath):
    match = pat.search(entryName)
    if match is not None:
        procNumbs.add(int(match.group(0)))

# Query number (kept as a string) -> query text, one tab-separated pair per line.
queriesDict = {}
with open(queriesPath) as inputFile:
    for line in inputFile:
        number, query = line.strip().split('\t')
        queriesDict[number] = query

# Loaded inside a context manager so the handle is closed deterministically
# (the bare json.load(open(...)) form left that to the garbage collector).
with open(queriesDocsPath) as inputFile:
    queriesDocsDict = json.load(inputFile)

# Document number (kept as a string) -> (url, path), one tab-separated triple per line.
docsUrlsDict = {}
with open(docsUrlsPath) as inputFile:
    for line in inputFile:
        number, url, path = line.strip().split('\t')
        docsUrlsDict[number] = (url, path)

# Ранжирование

In [4]:
class Rank:
    """Accumulates a ranked document list per query and exports all of them as CSV."""

    def __init__(self):
        # query number -> documents in ranked order
        self._ranks = {}

    def setRanks(self, numbQuery, listPages):
        """Store ``listPages`` as the ranking for ``numbQuery``, replacing any previous one."""
        self._ranks[numbQuery] = listPages

    def save(self, fileOutput):
        """Write every stored ranking to ``fileOutput`` as CSV.

        The output has the columns ``QueryId`` and ``DocumentId`` and no index
        column. Queries appear in ascending order of ``int(query number)``;
        within a query the documents keep the order given to ``setRanks``.
        """
        rows = []
        for queryId in sorted(self._ranks, key=int):
            rows.extend([queryId, docId] for docId in self._ranks[queryId])
        frame = DataFrame.from_records(rows, columns=['QueryId', 'DocumentId'])
        frame.to_csv(fileOutput, index=False)

**Ранжирование по TF-IDF**

In [6]:
# JSON feature files. Each one is indexed by query number below and yields a
# sequence of scores that is paired positionally with queriesDocsDict[numb].
tfIdfFullText = '../data/features/tfidfFullText'
titlePath = '../data/features/tfidfTitle'
keywordsPath = '../data/features/tfidfKeywords'
descPath = '../data/features/tfidfDesc'

# Context managers close every input handle as soon as it has been parsed.
with open(tfIdfFullText) as inputFile:
    tfidfPages = json.load(inputFile)
with open(titlePath) as inputFile:
    titleTfidf = json.load(inputFile)
with open(keywordsPath) as inputFile:
    keywordsTfidf = json.load(inputFile)
with open(descPath) as inputFile:
    descTfidf = json.load(inputFile)

rank = Rank()
for numb in queriesDict:
    numbsDocs = queriesDocsDict[numb]
    perDocScores = zip(tfidfPages[numb], titleTfidf[numb], keywordsTfidf[numb], descTfidf[numb])
    # A document whose full-text score is None gets the sentinel -1; otherwise
    # its score is the plain sum of the four field scores.
    # NOTE(review): only the full-text score is checked for None — presumably
    # the other three are never None when it is set; confirm against the
    # feature extraction.
    tfidfs = [sum(x) if x[0] is not None else -1 for x in perDocScores]
    pairs = sorted(zip(numbsDocs, tfidfs), key=lambda x: x[1], reverse=True)
    badPairs = [pair for pair in pairs if pair[1] == -1]
    goodPairs = [pair for pair in pairs if pair[1] != -1]
    # Documents without a score are scattered at random positions among the
    # scored ones.
    # NOTE(review): no random seed is set in the visible cells, so this output
    # differs from run to run.
    for badPair in badPairs:
        goodPairs.insert(randint(0, len(goodPairs)), badPair)
    rank.setRanks(numb, [pair[0] for pair in goodPairs])

# newline='' is what pandas asks for when to_csv receives a text-mode handle;
# the with-block guarantees the file is flushed and closed.
with open('../results/3.csv', 'w', newline='') as outputFile:
    rank.save(outputFile)

**Ранжирование по TF-IDF, взвешенному по областям**

In [11]:
# HTML tags of interest.
tags = ["p", "div", "br", "span", "img", "h1", "h2", "h3", "h4", "h5", "h6", "b", "strong", "i"]

# Uniform weighting: every body tag, every heading tag and the three
# meta fields get the same weight of 1.
weights = dict.fromkeys(
    [
        "p", "div", "br", "span", "img", "b", "strong", "i",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "title", "description", "keywords",
    ],
    1,
)

In [67]:
# Per-field multipliers applied to the field scores when they are summed.
# Keys without a prefix and keys with the "x" prefix are weighted separately.
weights = {
    "text": 1,
    "title": 0.4,
    "description": 0.4,
    "keywords": 0.4,

    "xtext": 2,
    "xtitle": 0.4,
    "xdescription": 0.4,
    "xkeywords": 0.4,
}

In [59]:
# JSON feature file; the loaded object is indexed by query number and then by
# document number (as a string) in the cells that follow.
tfidfPath = '../data/features/xxxtdidf'
# Context manager so the handle is closed right after parsing.
with open(tfidfPath) as inputFile:
    tfidfs = json.load(inputFile)

In [60]:
# JSON feature files holding, per query number, one score per document in the
# same order as queriesDocsDict[numb].
tfIdfFullText = '../data/features/tfidfFullText'
titlePath = '../data/features/tfidfTitle'
keywordsPath = '../data/features/tfidfKeywords'
descPath = '../data/features/tfidfDesc'

# Context managers close every input handle as soon as it has been parsed.
with open(tfIdfFullText) as inputFile:
    tfidfPages = json.load(inputFile)
with open(titlePath) as inputFile:
    titleTfidf = json.load(inputFile)
with open(keywordsPath) as inputFile:
    keywordsTfidf = json.load(inputFile)
with open(descPath) as inputFile:
    descTfidf = json.load(inputFile)

# Attach the four positional scores to each document's feature dict under the
# "x"-prefixed keys. tfidfs is updated in place; re-running the cell simply
# rewrites the same keys.
for numb in queriesDict:
    numbsDocs = queriesDocsDict[numb]
    for i, numbDoc in enumerate(numbsDocs):
        docFeatures = tfidfs[numb][str(numbDoc)]
        # Documents without a feature dict stay None and are handled at ranking time.
        if docFeatures is None:
            continue
        docFeatures["xtext"] = tfidfPages[numb][i]
        docFeatures['xtitle'] = titleTfidf[numb][i]
        docFeatures['xkeywords'] = keywordsTfidf[numb][i]
        docFeatures['xdescription'] = descTfidf[numb][i]

In [68]:
# Rank every query's documents by the weighted sum of their field scores.
rank = Rank()
for numbQuery, docs in tfidfs.items():
    scores = []
    for numbDoc, tfidfDict in docs.items():
        if tfidfDict is None:
            # No features for this document: mark it with the sentinel -1.
            docScore = -1
        else:
            docScore = sum(score * weights[tag] for tag, score in tfidfDict.items())
        scores.append((numbDoc, docScore))

    # Highest score first; every document carrying the sentinel score is moved
    # behind all the others while keeping its relative order.
    ranked, unscored = [], []
    for numbDoc, docScore in sorted(scores, key=lambda pair: pair[1], reverse=True):
        if docScore == -1:
            unscored.append(numbDoc)
        else:
            ranked.append(numbDoc)
    rank.setRanks(numbQuery, ranked + unscored)


In [69]:
# Write the ranking inside a with-block so the file is flushed and closed;
# newline='' is what pandas asks for when to_csv receives a text-mode handle.
# NOTE(review): the '.scv' extension looks like a typo for '.csv' — kept
# unchanged here so the output location does not move; confirm and rename.
with open('../results/22.scv', 'w', newline='') as outputFile:
    rank.save(outputFile)