In [1]:
import operator
import os
from glob import glob

import pandas as pd
from nltk.corpus import stopwords
from scipy import spatial
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

## Part I: IDF

Stop words and symbols

In [40]:
# Tokens to discard during preprocessing: the C reserved words first ...
keyword = [
    "auto", "break", "case", "char", "const", "continue", "default", "do",
    "double", "else", "enum", "extern", "float", "for", "goto", "if",
    "int", "long", "register", "return", "short", "signed", "sizeof", "static",
    "struct", "switch", "typedef", "union", "unsigned", "void", "volatile", "while",
]
# ... then the ten single digits (extending with a string appends one
# character at a time, in the order "1".."9", "0") ...
keyword.extend("1234567890")
# ... and finally NLTK's English stop-word list.
keyword.extend(stopwords.words('english'))

# Punctuation and operator characters, one list entry per character.
sym = list("+@~*-/#{}();><=:!|?'\".,")

File Preprocessing

In [41]:
# Load every document and clean its text.
#
# Produces two parallel lists:
#   filename[i] - bare file name of the i-th document (no directory part)
#   data[i]     - its cleaned text: lowercased, symbols replaced by spaces,
#                 stop words / C keywords / one-character tokens / tokens
#                 starting with a digit removed, joined with single spaces.
#
# The pattern is assembled with os.path.join instead of a hand-written
# ".\IS706\..." literal: that literal relied on invalid escape sequences
# (\I, \d, \*) and only worked with Windows separators.
# glob() returns matches in a filesystem-dependent order, so the list is
# sorted to keep document indices reproducible between runs and machines.
files = sorted(glob(os.path.join(".", "IS706", "documents", "*.txt")))

# `keyword` is a list; a set makes the per-token membership test O(1).
_stopword_set = set(keyword)

data = []
filename = []
for file in files:
    # basename works whatever the directory prefix or separator is.
    filename.append(os.path.basename(file))
    with open(file) as doc:
        # Lowercase
        content = doc.read().lower().strip()
    # Replace every symbol character with a space
    for symbol in sym:
        content = content.replace(symbol, " ")
    # Split on spaces after turning newlines, literal "\n" sequences and tabs into spaces
    tokens = content.replace("\n", " ").replace("\\n", " ").replace("\t", " ").split(" ")
    # Drop empty tokens, tokens starting with a digit, stop words and one-character tokens
    content = " ".join([w for w in tokens if w != "" and w[0] not in "0123456789" and w not in _stopword_set and len(w) > 1])

    data.append(content)

# Preview only: dumping the whole corpus floods the notebook output.
data[:3]

['init crypto authenc module init crypto template crypto authenc tmpl',
 'snd rawmidi ioctl status compat snd rawmidi file rfile snd rawmidi status32 user src err snd rawmidi status status rfile output null einval get user status stream src stream efault status stream sndrv rawmidi stream output err snd rawmidi output status rfile output status sndrv rawmidi stream input err snd rawmidi input status rfile input status einval err err put user status tstamp tv sec src tstamp tv sec put user status tstamp tv nsec src tstamp tv nsec put user status avail src avail put user status xruns src xruns efault',
 'snd ctl make virtual master create virtual master control name name string control element create tlv optional tlv array db information creates virtual matster control given name string returns created control element null errors enomem creating vmaster element add slave controls via snd ctl add slave snd ctl add slave uncached optional argument tlv used specify tlv information db scale 

Compute IDF and TF-IDF

In [42]:
# Compute the TF-IDF matrix of the documents.
# min_df=2 keeps only terms that occur in at least two documents.
tvec = TfidfVectorizer(min_df=2)
tvec_weights = tvec.fit_transform(data)

# Vocabulary, in the column order of the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. The fallback keeps the cell
# working on installations that predate the new method.
if hasattr(tvec, "get_feature_names_out"):
    b2 = list(tvec.get_feature_names_out())
else:
    b2 = tvec.get_feature_names()

# TF-IDF matrix as a dense array (one row per document, one column per term)
tfidf = tvec_weights.toarray()

# IDF dictionary {term: idf}. Built from the same vocabulary list, so its
# insertion order matches the column order of `tfidf`.
idfLst = dict(zip(b2, tvec.idf_))

## Part II: TF-IDF

Query Preprocessing

In [43]:
# Load every query file and clean its text into `queries` (one string per file):
# lowercased, symbols replaced by spaces, stop words / C keywords and
# one-character tokens removed, joined with single spaces.
#
# The pattern is assembled with os.path.join instead of a hand-written
# ".\IS706\..." literal: that literal relied on invalid escape sequences
# (\I, \q, \*) and only worked with Windows separators.
# glob() returns matches in a filesystem-dependent order, so the list is
# sorted to keep the query numbering reproducible between runs and machines.
files = sorted(glob(os.path.join(".", "IS706", "queries", "*.txt")))

# `keyword` is a list; a set makes the per-token membership test O(1).
_stopword_set = set(keyword)

queries = []
for file in files:
    with open(file) as doc:
        content = doc.read().lower().strip()
    # Replace every symbol character with a space
    for symbol in sym:
        content = content.replace(symbol, " ")
    # Split on spaces after turning newlines, literal "\n" sequences and tabs into spaces
    tokens = content.replace("\n", " ").replace("\\n", " ").replace("\t", " ").split(" ")
    # NOTE(review): unlike the document cell, tokens that start with a digit
    # are kept here - confirm whether that difference is intended.
    content = " ".join([i for i in tokens if i != "" and i not in _stopword_set and len(i) > 1])
    queries.append(content)
queries

['task delay acct enable per task delay accounting experimental collect information time spent task waiting system resources like cpu synchronous block completion swapping pages statistics help setting task priorities relative tasks cpu io rss limits etc say unsure',
 'blk cgroup block io controller generic block io controller cgroup interface common cgroup interface used various io controlling policies currently cfq io scheduler uses recognize task groups control disk bandwidth allocation proportional time slice allocation task groups also used bio throttling logic block layer implement upper limit io rates device option enables generic block io controller infrastructure one needs also enable actual io controlling logic policy enabling proportional weight division disk bandwidth cfq seti config_cfq_group_iosched enabling throttling policy set config_blk_throttle see documentation cgroups blkio controller txt information',
 'blk dev initrd initial ram filesystem ram disk initramfs init

Compute term frequency

In [44]:
# Compute the raw term counts of every query.
cQ = CountVectorizer(min_df=1)
tf_Query = cQ.fit_transform(queries)

# Query vocabulary, in the column order of the count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. The fallback keeps the cell
# working on installations that predate the new method.
if hasattr(cQ, "get_feature_names_out"):
    feature_Query = list(cQ.get_feature_names_out())
else:
    feature_Query = cQ.get_feature_names()
tf_Query = tf_Query.toarray()

# Convert counts to relative frequencies (each row divided by its own total).
newTF = []
for doc in tf_Query:
    # Total computed once per query instead of once per element.
    total = doc.sum()
    if total == 0:
        # A query with no counted term would otherwise divide 0 by 0 (NaN).
        newTF.append([0.0] * len(doc))
    else:
        newTF.append([ele / total for ele in doc])
tf_Query = newTF

Find similar documents 

In [45]:
def _top_terms(terms, weights, limit=10):
    """Return up to `limit` terms that have a positive weight, heaviest first.

    `terms` and `weights` are parallel sequences. Terms with equal weight keep
    the order they have in `terms` (the sort is stable). Zero-weight terms are
    left out: they do not occur in the text being described, and listing them
    would pad the result with unrelated vocabulary in alphabetical order.
    """
    ranked = sorted(
        ((term, weight) for term, weight in zip(terms, weights) if weight > 0),
        key=operator.itemgetter(1),
        reverse=True,
    )
    return [term for term, _weight in ranked[:limit]]


# For each query: print its heaviest terms, then the 10 most similar documents
# as "file name ; cosine similarity ; heaviest terms of that document".
for q, doc in enumerate(tf_Query, start=1):

    print("Query ",q,":")

    # Term-frequency dictionary of this query {term: tf}
    Query_tf = dict(zip(feature_Query, doc))
    print(", ".join(_top_terms(feature_Query, doc)))

    # Query vector in the document TF-IDF space: idf * tf for terms the query
    # contains, 0 otherwise. Iterating idfLst relies on its insertion order
    # being the column order of `tfidf`.
    vec = [idf * Query_tf[term] if term in Query_tf else 0 for term, idf in idfLst.items()]

    # Cosine similarity against every document; keep strictly positive scores.
    ranker = {}
    # Cosine is undefined when either vector is all zeros (zero norm), so
    # those cases are skipped instead of being evaluated; they could never
    # yield a positive similarity.
    if any(vec):
        for index, doc_vec in enumerate(tfidf):
            if not doc_vec.any():
                continue
            sim = 1 - spatial.distance.cosine(doc_vec, vec)
            if sim > 0:
                ranker[index] = sim

    # Descending sort by similarity
    sorted_ranker = sorted(ranker.items(), key=operator.itemgetter(1), reverse=True)

    # Result: the heaviest terms are computed only for the documents actually
    # printed, not for every document with a positive score.
    for index, sim in sorted_ranker[:10]:
        print(filename[index], ";", sim, ";", ", ".join(_top_terms(b2, tfidf[index])))
    print()

Query  1 :
task, cpu, delay, accounting, acct, block, collect, completion, enable, etc
document1138.txt ; 0.2671735982121881 ; delay, timer, azf3328, countdown, snd, chip, flags, value, lock, minimum
document1498.txt ; 0.15537632182444716 ; ttm, tt, user, space, pages, populate, task, start, valid, address
document343.txt ; 0.13720533381246902 ; cpu, u64, tt, be128, gf128mul, bbe, table, 120, able, ablkcipher
document2325.txt ; 0.12988264078067457 ; cpumask, cpu, pcrypt, padata, cb, rcu, mask, weight, parallel, bh
document2388.txt ; 0.12335718447124511 ; cpu, tfm, ctx, cipher, crypto, aead, instance, pcrypt, ictx, cpumask
document363.txt ; 0.10454727464817082 ; be64, cpu, u64, bbe, tt, gf128mul, x8, be128, table, 120
document355.txt ; 0.10437417297137674 ; be64, cpu, u64, lle, tt, gf128mul, x8, be128, table, 120
document335.txt ; 0.10283396944397705 ; be64, cpu, u64, bbe, tt, be128, gf128mul, table, 120, able
document1474.txt ; 0.10098368492473764 ; wait, bo, object, cpu, ttm, buffer, 