In [1]:
import os
import math
import nltk
import time
import re
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

### Cosine Similarity Function

In [2]:
# Cosine Similarity
def cosine_similarity(x, y):
    x_sqrt = np.sqrt(np.dot(x, x))
    y_sqrt = np.sqrt(np.dot(y, y))
    if y_sqrt != 0:     
        return (np.dot(x,y.T) / (x_sqrt * y_sqrt))
    elif y_sqrt == 0:
        return 0

In [3]:
# check cosine_similarity
print (cosine_similarity(np.array([1,2,3]), np.array([1,2,3])))
print (cosine_similarity(np.array([3,4,5]), np.array([1,3,4])))

1.0
0.970725343394151


### Make Inverted Index & Document Count

In [4]:
def clean_str(texts):
    # input : string that needs to clean all number and signs

    texts = re.sub('cnn',' cnn ', texts)
    texts = re.sub('\'', ' ', texts)
    texts = ''.join(c for c in texts if c not in string.punctuation)
    texts = ''.join([c for c in texts if not c.isdigit()])
    
    return texts

In [5]:
### doc2vocab ###
# Doc 0 : (vocab 1 : num 1), (vocab 2 : num 2), (vocab 3 : num 3)
# Doc 1 : ... 

### vocab2doc ###
# word : [text_num1, text_num2 ... ]

doc2vocab  = dict()
vocab2doc  = dict()

for i in range(0,60):
    doc2vocab[i] = dict()

    with open('./data/%d.txt' % i, 'r', encoding="utf-8") as doc:
        read_string = doc.read()                       # get sentence as read function
        read_string = read_string.lower()              # sentence lower
        read_string = clean_str(read_string)           # clean all punctuation and number
        
        tokens = nltk.word_tokenize(read_string)       # get tokens of sentence
        stop = set(stopwords.words('english'))
        tokens = [j for j in tokens if j not in stop] # get rid of stopwords at token
        
        ### get shape of {doc : {word1 : word1_num, word2 : word2_num, .... }}
        for words in tokens:
            # make document and vocab pair dictionary
            if words in doc2vocab[i]:
                doc2vocab[i][words] += 1
                
            else:
                doc2vocab[i][words] = 1
            
            # make inverted index, {word : [doc1, doc3, ... ]}
            text_str = str(i) + '.txt'
            if words in vocab2doc:
                if text_str not in vocab2doc[words]:
                    vocab2doc[words].append(text_str)
                    
            else:
                vocab2doc[words] = list()
                vocab2doc[words].append(text_str)

### (1) Inverted Index 실행 결과 : president, obama 각각에 대한 list posting 결과

In [6]:
# Inverted Index Posting Lists Result
print ('president : ', vocab2doc['president'])
print ('obama : ', vocab2doc['obama'])

president :  ['6.txt', '7.txt', '9.txt', '14.txt', '17.txt', '30.txt', '36.txt', '40.txt', '41.txt', '45.txt', '46.txt', '47.txt', '48.txt', '49.txt', '50.txt', '51.txt', '52.txt', '53.txt', '54.txt', '55.txt', '56.txt', '57.txt', '58.txt', '59.txt']
obama :  ['6.txt', '36.txt', '40.txt', '41.txt', '43.txt', '44.txt', '46.txt', '47.txt', '48.txt', '49.txt', '50.txt', '53.txt', '54.txt', '57.txt', '58.txt']


# Compute Term Frequency and Inverted Document Frequency

In [7]:
term_pd = pd.DataFrame.from_dict(doc2vocab, orient='index')
term_pd = term_pd.fillna(0)
term_pd.head()

Unnamed: 0,cnn,remember,miley,twerking,kanye,micsnatching,know,music,awards,get,...,centers,muchneeded,contact,homebound,ancillary,inperson,delivery,decreased,rate,falls
0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,7.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Get query and Document TF-IDF weight

In [8]:
# get query from the query.txt
query = list()

f = open('./query.txt', 'r')
query = f.readlines()

In [9]:
def doc_tf_idf(dataframe, query):
    
    # query tf-idf
    _, width = dataframe.shape
    final = list()
    
    # document tf-idf 
    new_tf = dataframe
    doc_term_value = dataframe[dataframe > 0].count().values # get array of number that document has that term
    doc_frequency = np.log(60 / (doc_term_value + 1))
    
    start = time.time()
    for i in range(len(dataframe)):
        results = np.zeros(width)
        one_row = dataframe.loc[i]
        row_value = one_row.values
        row_index = one_row.index
        
        for j,term in enumerate(row_index):
            if row_value[j] > 0:
                #term_frequency = 1 + np.log(row_value[j])
                term_frequency = np.log(row_value[j] + 1)
                new_tf.iloc[i,j] = term_frequency * doc_frequency[j]
                    
            elif row_value[j] == 0:
                term_frequency = 0
                new_tf.iloc[i,j] = 0
                
            if term in query:
                new_column = dataframe[[term]]
                new_col_value = new_column[new_column > 0].count().values
                results[j] = term_frequency * (np.log(60 / (new_col_value[0]+1)))
        
        final.append((i, cosine_similarity(new_tf.loc[i].values, results)))
    
        if i % 10 == 0:
            print ('step : %d, time : %f' % (i, time.time()-start))
            
    return new_tf, final

query_token = nltk.word_tokenize(query[0])
term_doc_matrix, query_tf_idf = doc_tf_idf(term_pd, query_token)

step : 0, time : 2.366121
step : 10, time : 25.050186
step : 20, time : 46.818120
step : 30, time : 69.090080
step : 40, time : 91.737705
step : 50, time : 116.275198


### (2) TF-IDF 실행결과 : president, obama를 포함한 term-document incidence matrix

In [10]:
print (term_doc_matrix[['president']])
print (term_doc_matrix[['obama']])

    president
0    0.000000
1    0.000000
2    0.000000
3    0.000000
4    0.000000
5    0.000000
6    0.606829
7    0.606829
8    0.000000
9    0.606829
10   0.000000
11   0.000000
12   0.000000
13   0.000000
14   0.606829
15   0.000000
16   0.000000
17   0.961801
18   0.000000
19   0.000000
20   0.000000
21   0.000000
22   0.000000
23   0.000000
24   0.000000
25   0.000000
26   0.000000
27   0.000000
28   0.000000
29   0.000000
30   0.606829
31   0.000000
32   0.000000
33   0.000000
34   0.000000
35   0.000000
36   0.606829
37   0.000000
38   0.000000
39   0.000000
40   0.961801
41   1.703584
42   0.000000
43   0.000000
44   0.000000
45   0.606829
46   0.961801
47   1.409013
48   0.961801
49   1.923601
50   0.961801
51   1.820486
52   1.213657
53   0.606829
54   1.820486
55   0.606829
56   0.961801
57   1.409013
58   1.923601
59   0.961801
       obama
0   0.000000
1   0.000000
2   0.000000
3   0.000000
4   0.000000
5   0.000000
6   2.127284
7   0.000000
8   0.000000
9   0.000000
10 

### (3) Cosine-Similarity 실행결과 : president obama에 대한 Top 5 Document (Cosine-Similarity 결과값 포함)

In [11]:
# (Document number, Cosine-Similarity between query and document)
score = sorted(query_tf_idf, key = lambda x : x[1], reverse=True)
score

[(54, 0.13070279880949837),
 (46, 0.09454470773792908),
 (58, 0.09291409102865486),
 (48, 0.07211185450587057),
 (50, 0.06748251115357565),
 (47, 0.06504438157882764),
 (55, 0.06370009984030921),
 (49, 0.06357678505803718),
 (6, 0.05875878994779375),
 (40, 0.058738137632291534),
 (43, 0.054313083339851576),
 (44, 0.05125329107048038),
 (41, 0.04133076142720179),
 (53, 0.04129233597173457),
 (56, 0.04037795307184632),
 (36, 0.039176545214350475),
 (45, 0.03710784434355158),
 (57, 0.03643747968883324),
 (51, 0.03573915479294042),
 (17, 0.03537844802603371),
 (14, 0.029618885627681935),
 (59, 0.026683852303785734),
 (7, 0.024671309354004366),
 (52, 0.023993171076371327),
 (30, 0.019608834011474948),
 (9, 0.013682777499492668),
 (0, 0),
 (1, 0),
 (2, 0),
 (3, 0),
 (4, 0),
 (5, 0),
 (8, 0),
 (10, 0),
 (11, 0),
 (12, 0),
 (13, 0),
 (15, 0),
 (16, 0),
 (18, 0),
 (19, 0),
 (20, 0),
 (21, 0),
 (22, 0),
 (23, 0),
 (24, 0),
 (25, 0),
 (26, 0),
 (27, 0),
 (28, 0),
 (29, 0),
 (31, 0),
 (32, 0),
 (3