## IR System For Columnist Data

Ozgur DOGAN
170709026

In [1]:
import io, os
import re as re
import zipfile as zipfile
import math
columnistData = [] 
with zipfile.ZipFile('30Columnists.zip') as z:
    for zipinfo in z.infolist():
        mytextzip = ''
        if zipinfo.filename.endswith('.txt') and re.search('raw_texts', zipinfo.filename):
            with z.open(zipinfo) as f:
                textfile = io.TextIOWrapper(f, encoding='cp1254', newline='')
                for line in textfile:
                    if len(line.strip()): mytextzip += ' ' + line.strip()   
                columnistData.append(mytextzip)

In [2]:
indexing = dict()  # A dictionary for INVERTED INDEXING
for index,i in enumerate(columnistData):
    i = i.lower()
    splitted = i.split()
    for j in set(splitted):
        if j not in indexing.keys(): 
            indexing[j] = str(index)   # if we didn't see the word before add it and index of document to dict
        else:
            indexing[j] += ","+str(index)  # else add just the index of document

In [3]:
def calculateTfIdf(indexDict,query,postingList):
    try:
        docs = indexDict[query].split(',') 
    except:
        return None       # If letter is not exist in posts return None
    idfScore = len(postingList) / len(docs) + 1  # Added +1 to avoid divison by 0 error.  
    tfIdfScoreTotal = dict()
    for i in docs:
        counter = 0   # Number of the time the word occurs in the text
        splitted = postingList[int(i)].split()
        for j in splitted:
            if j == query :
                counter += 1
        tfscore = counter / len(splitted)
        tfIdfScoreTotal[i] = tfscore * idfScore
    return tfIdfScoreTotal

In [4]:
def calculateBM25(indexDict,query,postingList,fieldAvg):
    try:
        docs = indexDict[query].split(',') 
    except:
        return None   
    idfScore = len(postingList) / len(docs) + 1
    b = 0.75
    k1 = 1.6
    bm25scores = dict()
    for i in docs:
        counter = 0
        splitted = postingList[int(i)].split()
        docLen = len(splitted)
        fieldLenAvg = docLen/fieldAvg
        for j in splitted:
            if j == query :
                counter += 1
        tfscore = counter / docLen
        bm25scores[i] = idfScore * ((tfscore*(k1+1))/(tfscore + (k1*(1-b+(b*fieldLenAvg)))))
    return bm25scores

In [5]:
def calculateDFI(indexDict,query,postingList,fieldAvg):
    try:
        docs = indexDict[query].split(',') 
    except:
        return None
    dfiScores = dict()
    for i in docs:
        counter = 0
        splitted = postingList[int(i)].split()
        docLen = len(splitted)
        for j in splitted:
            if j == query :
                counter += 1
        tfscore = counter / len(splitted)
        eij = (tfscore*docLen)/fieldAvg
        dfiScores[i] = math.log2(((tfscore - eij)/math.sqrt(eij))+1)
    return dfiScores

In [6]:
def searchIndex(indexDict,query,postingList):
    query = query.lower()
    splitted_query = query.split()
    lastResultTfIdf = dict()
    lastResultBM25= dict()
    lastResultDFI = dict()
    counts = [len(x) for x in postingList]
    fieldAvg = sum(counts) / len(counts)
    for i in splitted_query:
        try:
            wordsScoreTfIdf = calculateTfIdf(indexDict,i,postingList)
            wordsScoreBM25 = calculateBM25(indexDict,i,postingList,fieldAvg)
            wordsScoreDFI = calculateDFI(indexDict,i,postingList,fieldAvg)       
            for j in wordsScoreTfIdf.keys():
                if j in lastResultTfIdf.keys():
                    lastResultTfIdf[j] += wordsScoreTfIdf[j]
                else :
                    lastResultTfIdf[j] = wordsScoreTfIdf[j]
                    
            for j in wordsScoreBM25.keys():
                if j in lastResultBM25.keys():
                    lastResultBM25[j] += wordsScoreBM25[j]
                else :
                    lastResultBM25[j] = wordsScoreBM25[j]
                    
            for j in wordsScoreDFI.keys():
                if j in lastResultDFI.keys():
                    lastResultDFI[j] += wordsScoreDFI[j]
                else :
                    lastResultDFI[j] = wordsScoreDFI[j]
        except:
            continue          # If any score is None(That means the word is not exist in posts) just continue
    lastResultTfIdf = sorted(lastResultTfIdf.items(), key = lambda kv:(kv[1], kv[0]),reverse = True)
    lastResultBM25 = sorted(lastResultBM25.items(), key = lambda kv:(kv[1], kv[0]),reverse = True)
    lastResultDFI = sorted(lastResultDFI.items(), key = lambda kv:(kv[1], kv[0]),reverse = True)
    return lastResultTfIdf[:10],lastResultBM25[:10],lastResultDFI[:10]  # Return best 3 matches

In [7]:
def find_map_score(keys):
    summ=0
    a=0
    for i in range(len(columnistData)):
        if i in keys:
            a+=1
            summ+=a/(i+1)
        else:
            summ+=a/(i+1)

    return summ/len(columnistData)

In [8]:
def getKeys(results):
    return [int(x[0]) for x in results]

In [16]:
def getBestWay(mapTf,mapBM25,mapDFI):
    print(f"TF/IDF Map Score: {mapTf}\nBM25 Map Score: {mapBM25}\nDFI Map Score: {mapDFI}")
    if (mapTf > mapBM25) & (mapTf > mapDFI):
        print("Best way is TF/IDF")
    elif (mapBM25 > mapTf) & (mapBM25 > mapDFI):
        print("Best way is BM25")
    else: 
        print("Best way is DFI")

In [9]:
queries = ("I knew you were trouble and I walked in","Imagine all the people living life in peace","If I fail, if I succeed At least I’ll live as I believe",
          "Poems, Prayers, and Promises","When life gets us down, this reminds us of the simplest pleasures","Nothing can stop me, I'm all the way up",
          "Hello, it's me","It ain’t no fun if the homies can’t have none.","I’m gonna live like tomorrow doesn’t exist.","This is out house. This is out rules")

In [10]:
len(queries)

10

In [12]:
totalMapTfidf = 0
totalMapBM25 = 0
totalMapDFI = 0
for i in queries:
    resultTfIdf,resultBM25,resultDFI  = searchIndex(indexing,i,columnistData)
    totalMapTfidf += find_map_score(getKeys(resultTfIdf))
    totalMapBM25 += find_map_score(getKeys(resultBM25))
    totalMapDFI += find_map_score(getKeys(resultDFI))   


In [17]:
getBestWay(totalMapTfidf,totalMapBM25,totalMapDFI)

TF/IDF Map Score: 0.05560013406205516
BM25 Map Score: 0.04625062569482008
DFI Map Score: 0.031667735023545335
Best way is TF/IDF


In [18]:
# Some Example Queries
query1 = queries[1]   # Imagine all the people living life in peace
resultTfIdf_query1,resultBM25_query1,resultDFI_query1  = searchIndex(indexing,query1,columnistData)

In [33]:
print(f"Results for '{query1}'")
print("\tTF/IDF\t\t\t\t\tBM25\t\t\t\t\t\tDFI")
for i in range(len(resultTfIdf)):
    print(f"{resultTfIdf_query1[i]}\t|\t{resultBM25_query1[i]}\t|\t{resultDFI_query1[i]}")

Results for 'Imagine all the people living life in peace'
	TF/IDF					BM25						DFI
('1007', 0.19704433497536947)	|	('1007', 0.922554089582927)	|	('1439', 1.258632317513575)
('668', 0.19337016574585636)	|	('668', 0.8407753020243999)	|	('1007', 1.2475799630860318)
('1339', 0.19327731092436976)	|	('1339', 0.7978724800280871)	|	('471', 1.231098995863422)
('1143', 0.18278427205100956)	|	('1407', 0.7924102972571379)	|	('457', 1.1655051475121136)
('318', 0.1793103448275862)	|	('1301', 0.7718154126607761)	|	('1436', 1.1593761001808718)
('333', 0.1761904761904762)	|	('997', 0.7615951126150894)	|	('997', 1.1525080710925903)
('1343', 0.17536534446764093)	|	('1310', 0.760120180372431)	|	('1068', 1.1104904508024336)
('628', 0.17513513513513512)	|	('1068', 0.7596285751712818)	|	('464', 1.1082932498083626)
('600', 0.1737331954498449)	|	('489', 0.7502249751159125)	|	('1071', 1.1029044595345425)
('634', 0.17372881355932204)	|	('471', 0.7415926951796092)	|	('1062', 1.0999854337589454)


In [35]:
query2 = queries[5]   # Nothing can stop me, I'm all the way up
resultTfIdf_query2,resultBM25_query2,resultDFI_query2  = searchIndex(indexing,query2,columnistData)

In [37]:
print(f"Results for '{query2}'")
print("\tTF/IDF\t\t\t\t\tBM25\t\t\t\t\t\tDFI")
for i in range(len(resultTfIdf)):
    print(f"{resultTfIdf_query2[i]}\t|\t{resultBM25_query2[i]}\t|\t{resultDFI_query2[i]}")

Results for 'Nothing can stop me, I'm all the way up'
	TF/IDF					BM25						DFI
('1007', 0.19704433497536947)	|	('1007', 0.922554089582927)	|	('1439', 1.258632317513575)
('668', 0.19337016574585636)	|	('668', 0.8407753020243999)	|	('1007', 1.2475799630860318)
('1339', 0.19327731092436976)	|	('1339', 0.7978724800280871)	|	('471', 1.231098995863422)
('1143', 0.18278427205100956)	|	('1407', 0.7924102972571379)	|	('457', 1.1655051475121136)
('318', 0.1793103448275862)	|	('1301', 0.7718154126607761)	|	('1436', 1.1593761001808718)
('333', 0.1761904761904762)	|	('997', 0.7615951126150894)	|	('997', 1.1525080710925903)
('1343', 0.17536534446764093)	|	('1310', 0.760120180372431)	|	('1068', 1.1104904508024336)
('628', 0.17513513513513512)	|	('1068', 0.7596285751712818)	|	('464', 1.1082932498083626)
('600', 0.1737331954498449)	|	('489', 0.7502249751159125)	|	('1071', 1.1029044595345425)
('634', 0.17372881355932204)	|	('471', 0.7415926951796092)	|	('1062', 1.0999854337589454)


In [44]:
query3 = queries[0]   # I knew you were trouble and I walked in
resultTfIdf_query3,resultBM25_query3,resultDFI_query3  = searchIndex(indexing,query3,columnistData)

In [46]:
print(f"Results for '{query3}'")
print("\tTF/IDF\t\t\t\t\tBM25\t\t\t\t\t\tDFI")
for i in range(len(resultBM25_query3)):
    print(f"{resultTfIdf_query3[i]}\t|\t{resultBM25_query3[i]}\t|\t{resultDFI_query3[i]}")

Results for 'I knew you were trouble and I walked in'
	TF/IDF					BM25						DFI
('773', 0.178118917260358)	|	('1435', 0.8985746411399416)	|	('1435', 1.1489138952360034)
('777', 0.17589727863784746)	|	('773', 0.8294532534004507)	|	('1078', 1.015733463259284)
('219', 0.17428354213658284)	|	('777', 0.7751178263528592)	|	('1098', 0.989227957709527)
('700', 0.16956643357506024)	|	('219', 0.7659277449735132)	|	('1094', 0.9535448087044023)
('1435', 0.16304675956332207)	|	('700', 0.7343541676928076)	|	('1053', 0.9498144221818301)
('1133', 0.16093595328415472)	|	('1133', 0.6724457859315998)	|	('1051', 0.9274051362893126)
('48', 0.1389698993796905)	|	('1098', 0.5723973548765179)	|	('1062', 0.923761533445893)
('243', 0.13455471162651977)	|	('48', 0.5598260338767538)	|	('1069', 0.914186877819127)
('229', 0.1336163124514179)	|	('229', 0.526933849366081)	|	('1081', 0.9047710383027058)
('232', 0.12705124525781353)	|	('1078', 0.5088010614655191)	|	('1082', 0.8982058818067182)
