In [144]:
import os
import functools
import xml.sax
import re
from spacy.lang.en import English
from Stemmer import Stemmer
from collections import defaultdict
nlp = English()

In [304]:
inv_index = defaultdict(lambda: defaultdict(lambda : defaultdict(int)))
posting_list = defaultdict(lambda:[])
tfdict = {}

In [289]:
def tokenize(string):
    tokens = re.split(r'[^A-Za-z0-9]+',string)
    return tokens
def remove_stop(tokens):
    arr = []
    for t in tokens:
        t = str(t)
        if t =='\n':
            continue
        if len(t) > 8:
            try:
                int(t,16)
                continue
            except:
                pass
        if len(t) != 0 and nlp.vocab[t].is_stop == False:
            arr.append(t.lower())
    return arr
def stem_words(tokens):
    arr = []
    ps  = Stemmer('porter')
    for token in tokens:
        arr.append(ps.stemWord(token))
    return arr

In [307]:
def parse_title(data,cur_id):
    tf = 0
    tokens = tokenize(data)
    tokens = remove_stop(tokens)
    tokens = stem_words(tokens)
    for tok in tokens:
        inv_index[tok][cur_id]['t'] += 1
        tf += 1
    return tf

def parse_coll(data,cur_id):
    try:   
        tf = 0
        categories = re.findall("\[\[Category:(.*?)\]\]", data)
        for cat in categories:
            tokens = tokenize(cat)
            tokens = remove_stop(tokens)
            tokens = stem_words(tokens)
            for tok in tokens:
                inv_index[tok][cur_id]['c'] += 1
                tf += 1
    except:
        tokens = []
    return tf   
    
def parse_refs(data,cur_id):   
    try:
        tokens = []
        tf = 0
        link_list = re.findall("== *[Rr]eferences *==(.*?\n)+?\n",data)
        for link in link_list:
            if 'Category' in link or 'reflist' in link or 'Reflist' in link or 'refend' in link:
                continue
            tokens = tokenize(link)
            tokens = remove_stop(tokens)
            tokens = stem_words(tokens)
            for tok in tokens:
                inv_index[tok][cur_id]['r'] += 1
                tf += 1
    except:
        pass
    return tf
        
def parse_links(data,cur_id):
    try:
        a = data.split("==External links==",1)
        tokens = []
        tf = 0
        link_list = re.findall("\*{{(.*?)}}",a[1])
        for link in link_list:
            tokens = tokenize(link)
            tokens = remove_stop(tokens)
            tokens = stem_words(tokens)
            for tok in tokens:
                inv_index[tok][cur_id]['l'] += 1
                tf += 1
    except:
        pass
    return tf

def parse_info(data,cur_id):
    try:
        tf = 0
        info = data.split('{{Infobox')
        for toki in info:
            link_list = toki.split('\n')
            for link in link_list:
                if "=" in link:
                    tokens = link.split("=")[1]
                    tokens = tokenize(tokens)
                    tokens = remove_stop(tokens)
                    tokens = stem_words(tokens)
                    for tok in tokens:
                        inv_index[tok][cur_id]['i'] += 1
                        tf += 1
    except:
        pass
    return tf

def parse_text(data,cur_id):
    tokens = []
    tf = 0
    try:
        tokens = tokenize(data)
        tokens = remove_stop(tokens)
        tokens = stem_words(tokens)
    except:
        tokens = []
    for tok in tokens:
        inv_index[tok][cur_id]['b'] += 1
        tf += 1
    return tf

In [313]:
def create_index():
    words = sorted(inv_index.keys())
    for word in words:
        v1 = inv_index[word]
        for doc,v2 in v1.items():
            val = doc
            for typ in sorted(v2): 
                val += typ+str(v2[typ])
            posting_list[word].append(val)

def write_to_file(filename):
    with open(filename,'w+') as fil:
        for key in posting_list.keys():
            fil.write(key+"|")
            for entry in  posting_list[key]:
                fil.write(entry+"|")
            fil.write("\n")

def write_tf(filename):
    with open(filename,'w+') as fil:
        for key in tfdict.keys():
            fil.write(str(key)+'|'+str(tfdict[key][0])+'|'+str(tfdict[key][1])+'\n')

In [350]:
class WikiHandler(xml.sax.ContentHandler):
    def __init__(self):
        inv_index.clear()
        posting_list.clear()
        self.data = ""
        self.title = ""
        self.cur_id = 0
        self.limit = 1000
        self.count = 0
        self.tf = 0
        
    def startElement(self, tag, attributes):
        self.data = ""
        
    def endElement(self, tag):
        if tag == 'page':
            if (self.cur_id+1) % self.limit == 0:
                create_index()
                write_to_file("index/"+str(self.count)+".txt")
                posting_list.clear()
                inv_index.clear()
                self.count += 1
            tfdict[self.cur_id] = [self.title,str(self.tf)]
            self.tf = 0
            self.title = ""
            self.cur_id += 1
        
        elif tag == 'text':
            self.tf += parse_text(self.data,str(self.cur_id))
            self.tf += parse_coll(self.data,str(self.cur_id))
            self.tf += parse_links(self.data,str(self.cur_id))
            self.tf += parse_refs(self.data,str(self.cur_id))
            self.tf += parse_info(self.data,str(self.cur_id))
            self.data = ""

        elif tag == 'title':
            self.title = self.data
            self.tf += parse_title(self.data,str(self.cur_id))
            self.data = ""

    def characters(self, content):
        self.data += content

In [352]:
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 0)

Handler = WikiHandler()
parser.setContentHandler(Handler)
parser.parse('enwiki-latest-pages-articles26.xml-p42567204p42663461')
create_index()
write_to_file("index/"+str(Handler.count)+".txt")
write_tf("index/tf.txt")
merge_files('index/')
unmerge_files('index/','0.txt')

In [347]:
def comp(a):
    return int(a.split('.')[0])

def unmerge_files(folder,filename):
    thresh = 100000
    os.rename(folder+filename,folder+'temp.txt')
    cur = 0
    file1 = open(folder+'temp.txt','r')
    file2 = open(folder+'sindex.txt','w+')
    line = file1.readline()
    while len(line):
        count = 1
        file2.write(str(cur)+'.txt|'+line.split('|')[0]+'\n')
        with open(folder+str(cur)+'.txt','w+') as fil:
            while count <= thresh and len(line):
                fil.write(line)
                count += 1
                line = file1.readline()
        cur += 1
    file1.close()
    file2.close()
    os.remove(folder+'temp.txt')

In [348]:
def merge_files(folder):
    while(1):
        files = os.listdir(folder)
        files.remove('tf.txt')
        files = sorted(files,key=comp)
        if len(files) == 1:
            break
        for i in range(1,len(files),2):
            merge_func(folder+files[i-1],folder+files[i])


In [321]:
def merge_func(file1,file2):
    
    fil1 = open(file1,'r')
    fil2 = open(file2,'r')
    final = open('index/temp.txt','w+')
    line1 = fil1.readline().strip('\n')
    line2 = fil2.readline().strip('\n')
    
    while len(line1) and len(line2):
        w1 = line1.split("|")[0]
        w2 = line2.split("|")[0]
        if w1 < w2:
            final.write(line1+'\n')
            line1 = fil1.readline().strip('\n')
        elif w2 < w1:
            final.write(line2+'\n')
            line2 = fil2.readline().strip('\n')
        else:
            temp = "|".join(line2.split("|")[1:-1])
            final.write(line1+temp+"|"+'\n')
            line1 = fil1.readline().strip('\n')
            line2 = fil2.readline().strip('\n')
    
    while len(line1):
        final.write(line1+'\n')
        line1 = fil1.readline().strip('\n')
    
    while len(line2):
        final.write(line2+'\n')
        line2  = fil2.readline().strip('\n')
        
    fil1.close()
    fil2.close()
    final.close()
    os.remove(file1)
    os.remove(file2)
    os.rename('index/temp.txt',file1)
    
merge_func('index/0.txt','index/1.txt')

In [35]:
"|".join("world|0b4|0b3c1t1|".split("|")[1:-1])

'0b4|0b3c1t1'

In [63]:
for i in range(1,9,2):
            print(i)

1
3
5
7


In [183]:
temp = ['18.txt','2.txt']

In [184]:
temp.sort(key = functools.cmp_to_key(comp) )

In [185]:
temp

['18.txt', '2.txt']