## Import libraries

In [None]:
import itertools
import os
import pickle
import string
import urllib.parse
import urllib.request

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from IPython.display import display, clear_output
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

## Generate corpus for documents

In [None]:
# Scrape documents from digilib.its.ac.id.
#
# Listing pages are requested with offsets 0, 20, 40, ... below docs_num.
# For every listing page the detail links are collected, then each detail
# page is parsed for its title and abstract.  The result is `paper`, a list
# of [url, title, abstract] entries.

docs_num = 400
digilib_url = 'http://digilib.its.ac.id/publisher/51100/'
paper = []
for i in range(0, docs_num, 20):
    # Close every HTTP response as soon as it has been parsed.
    with urllib.request.urlopen(digilib_url + str(i)) as page:
        soup = BeautifulSoup(page, 'html.parser')
    docs = soup.find_all('span', attrs={'class': 'style5'})
    link = []
    for x in docs:
        anchor = x.find('a')
        # Spans without an anchor (or an anchor without href) carry no link.
        href = anchor.get('href') if anchor is not None else None
        if href:
            link.append(href)
    for x in link[:20]:
        clear_output(wait=True)
        with urllib.request.urlopen(x) as page:
            soup = BeautifulSoup(page, 'html.parser')
        try:
            title = soup.find('h2', attrs={'class': 'isi'}).find('i').getText()
            abstract = soup.find('span', attrs={'class': 'teks'}).find('p').getText()
        except AttributeError:
            # A find() returned None: the page lacks a title or an abstract,
            # so the paper is skipped.
            continue
        paper.append([x, title, abstract])
        display(x)
print("Number of papers with abstract found: " +str(len(paper))+ " papers.")

In [None]:
# Save the scraped papers to 'corpus/paper.xlsx': one [url, title, abstract]
# row per paper, written without header row or index column.

print("Saving data to corpus/paper.xlsx..")
# Create the output folder first so the export cannot fail on a missing directory.
os.makedirs('corpus', exist_ok=True)
df = pd.DataFrame(paper)
df.to_excel('corpus/paper.xlsx', header=False, index=False)
print("Success.")

In [None]:
# Save the scraped papers to 'pickle/paper.pkl' so later runs can skip scraping.

print("Saving data to pickle/paper.pkl..")
# Create the output folder first; open(..., 'wb') does not create directories.
os.makedirs('pickle', exist_ok=True)
with open('pickle/paper.pkl', 'wb') as f:
    pickle.dump(paper, f)
print("Success.")

## Preprocess the documents

In [None]:
# Preprocessing: lowercase every abstract, strip punctuation, remove stop
# words and stem each remaining token.
#
# Outputs:
#   processed_paper -- one space-joined string of stems per paper
#   words           -- the distinct stems of each paper, concatenated
#                      (a stem can therefore still repeat across papers)

factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()
stemmer = StemmerFactory().create_stemmer()
# The translation table does not depend on the paper, so build it once
# instead of once per loop iteration.
remove_punctuation_map = str.maketrans('', '', string.punctuation)
words = []
processed_paper = []
for x in tqdm(paper, desc='paper', unit='paper'):
    text = x[2].lower().translate(remove_punctuation_map)
    text = stopword.remove(text)
    tokens = [stemmer.stem(token) for token in text.split()]
    processed_paper.append(' '.join(tokens))
    # Only the distinct stems of this paper are recorded.
    words.extend(set(tokens))

In [None]:
# Save the preprocessed abstracts to 'corpus/processed_paper.xlsx',
# one abstract per row, without header row or index column.

print("Saving data to corpus/processed_paper.xlsx..")
# Create the output folder first so the export cannot fail on a missing directory.
os.makedirs('corpus', exist_ok=True)
df = pd.DataFrame(processed_paper)
df.to_excel('corpus/processed_paper.xlsx', header=False, index=False)
print("Success.")

In [None]:
# Save the preprocessed abstracts to 'pickle/processed_paper.pkl'.

print("Saving data to pickle/processed_paper.pkl..")
# Create the output folder first; open(..., 'wb') does not create directories.
os.makedirs('pickle', exist_ok=True)
with open('pickle/processed_paper.pkl', 'wb') as f:
    pickle.dump(processed_paper, f)
print("Success.")

In [None]:
# Save the collected stems to 'corpus/words.xlsx', one word per row,
# without header row or index column.

print("Saving data to corpus/words.xlsx..")
# Create the output folder first so the export cannot fail on a missing directory.
os.makedirs('corpus', exist_ok=True)
df = pd.DataFrame(words)
df.to_excel('corpus/words.xlsx', header=False, index=False)
print("Success.")

In [None]:
# Save the collected stems to 'pickle/words.pkl'.

print("Saving data to pickle/words.pkl..")
# Create the output folder first; open(..., 'wb') does not create directories.
os.makedirs('pickle', exist_ok=True)
with open('pickle/words.pkl', 'wb') as f:
    pickle.dump(words, f)
print("Success.")

## Generate thesaurus

In [None]:
# Scrape synonyms from sinonimkata.com.
#
# Builds `thesaurus`, mapping every distinct corpus word to a list that
# starts with the word itself, followed by the synonyms found on the site.
# Words without a synonym table map to a one-element list.

thesaurus = {}
words = list(set(words))
for x in tqdm(words, desc='word', unit='word'):
    name = x
    data = { "q": name }
    encoded_data = urllib.parse.urlencode(data).encode("utf-8")
    # Passing a data payload makes urlopen issue a POST; the response is
    # closed as soon as it has been parsed.
    with urllib.request.urlopen("http://www.sinonimkata.com/search.php", encoded_data) as content:
        soup = BeautifulSoup(content, 'html.parser')
    synonym_cell = soup.find('td', attrs={'width': '90%'})
    if synonym_cell is None:
        # No synonym table on the result page: keep only the word itself.
        thesaurus[x] = [name]
    else:
        synonym = [anchor.getText() for anchor in synonym_cell.find_all('a')]
        thesaurus[x] = [x] + synonym

In [None]:
# Save the thesaurus to 'corpus/thesaurus.xlsx', one column per word
# (the word itself first, its synonyms below), without header or index.

print("Saving data to corpus/thesaurus.xlsx..")
# Create the output folder first so the export cannot fail on a missing directory.
os.makedirs('corpus', exist_ok=True)
# The synonym lists have different lengths, which pd.DataFrame(dict) rejects
# with a ValueError.  Building one row per word pads the shorter lists, and
# transposing restores the one-column-per-word layout.
df = pd.DataFrame.from_dict(thesaurus, orient='index').T
df.to_excel('corpus/thesaurus.xlsx', header=False, index=False)
print("Success.")

In [None]:
# Save the thesaurus to 'pickle/thesaurus.pkl'.

print("Saving data to pickle/thesaurus.pkl..")
# Create the output folder first; open(..., 'wb') does not create directories.
os.makedirs('pickle', exist_ok=True)
with open('pickle/thesaurus.pkl', 'wb') as f:
    pickle.dump(thesaurus, f)
print("Success.")

## Testing

### Test 1. Query: 'pengembangan aplikasi'

In [None]:
# Insert the query here.  This raw string is preprocessed by the cells below
# and is also embedded in the names of the exported result files.

init_query = 'pengembangan aplikasi'

#### Without query expansion:

In [None]:
# Create the TF-IDF vectorizer and normalise the query: lowercase,
# strip punctuation, remove stop words, then stem every remaining token.

vectorizer = TfidfVectorizer(use_idf=True)
remove_punctuation_map = {ord(char): None for char in string.punctuation}
query = init_query.lower().translate(remove_punctuation_map)
query = stopword.remove(query)
query = [stemmer.stem(token) for token in query.split()]

In [None]:
# process the query

max_result = []
x = [' '.join(query)]
paper_tfidf = vectorizer.fit_transform(x + processed_paper)
q = paper_tfidf[0]
result = cosine_similarity(paper_tfidf, q)
idx = np.argsort(-result,axis=0).flatten()    
final = [[num, y[0], x] for num, y in enumerate(result) if y[0] > 0.0]
max_result += final
max_result = sorted(max_result, key=lambda x: x[1], reverse=True)
set_result = set()
new_result = []
for item in max_result:
    if item[0] not in set_result:
        set_result.add(item[0])
        new_result.append(item)
    else:
        pass

In [None]:
# Show the top 5 results.
# The first entry of new_result is skipped (presumably the query row itself,
# which is row 0 of the TF-IDF matrix); matrix row k maps to paper[k - 1].

for entry in new_result[1:6]:
    row_number, used_query = entry[0], entry[2]
    print('Result', row_number)
    print('QUERY', used_query)
    matched = paper[row_number - 1]
    print(matched[1])
    print(matched[2][:200] + '...')
    print()

In [None]:
# Export every paper as [title, abstract, flag], where flag is 'ok_ok' for
# papers retrieved by the unexpanded query and '' otherwise.

# A set gives O(1) membership tests; matrix row k maps to paper[k - 1].
idx_res = {x[0] - 1 for x in new_result[1:]}
file = [
    [x[1], x[2], 'ok_ok' if i in idx_res else '']
    for i, x in enumerate(paper)
]
# Create the output folder first so the export cannot fail on a missing directory.
os.makedirs('hasil', exist_ok=True)
df = pd.DataFrame(file)
df.to_excel('hasil/hasil ori ' +init_query+ '.xlsx', header=False, index=False)

#### With query expansion:

In [None]:
# Create a fresh TF-IDF vectorizer and normalise the query: lowercase,
# strip punctuation, remove stop words, then stem every remaining token.
# The trailing bare expression displays the stemmed tokens in the notebook.

vectorizer = TfidfVectorizer(use_idf=True)
remove_punctuation_map = {ord(char): None for char in string.punctuation}
query = init_query.lower().translate(remove_punctuation_map)
query = stopword.remove(query)
query = [stemmer.stem(token) for token in query.split()]
query

In [None]:
# Expand the query: collect the synonym list of every query token and build
# one query variant per combination of synonyms.
#
# `qs` is a list of single-element lists, each holding one stemmed,
# space-joined query variant.

list_synonym = []
for x in query:
    # Look the token up in the thesaurus itself: an O(1) test that cannot
    # raise KeyError and also reuses entries cached by the branch below.
    if x in thesaurus:
        list_synonym.append(thesaurus[x])
        continue
    # Token unknown to the corpus thesaurus: fetch its synonyms online.
    name = x
    data = { "q": name }
    encoded_data = urllib.parse.urlencode(data).encode("utf-8")
    with urllib.request.urlopen("http://www.sinonimkata.com/search.php", encoded_data) as content:
        soup = BeautifulSoup(content, 'html.parser')
    synonym_cell = soup.find('td', attrs={'width': '90%'})
    if synonym_cell is None:
        # No synonym table on the result page: use the token on its own.
        list_synonym.append([x])
    else:
        synonym = [anchor.getText() for anchor in synonym_cell.find_all('a')]
        thesaurus[x] = [x] + synonym
        list_synonym.append(thesaurus[x])
qs = []
for combination in itertools.product(*list_synonym):
    # Synonyms come back unstemmed, so stem them like the corpus was.
    stemmed = [stemmer.stem(y) for y in combination]
    qs.append([' '.join(stemmed)])
for x in qs:
    print(x, end=' ')

In [None]:
# Rank every document against each expanded query variant and merge the
# rankings.
#
# Each variant is vectorised together with the corpus, so row 0 of every
# TF-IDF matrix is the variant and row k (k >= 1) is processed_paper[k - 1].
# `new_result` holds [row, similarity, query] entries with a positive
# similarity, best first; per row only the best-scoring variant is kept.
# The trailing bare expression displays the number of entries.

max_result = []
for x in qs:
    paper_tfidf = vectorizer.fit_transform(x + processed_paper)
    q = paper_tfidf[0]
    result = cosine_similarity(paper_tfidf, q)
    max_result.extend(
        [num, y[0], x] for num, y in enumerate(result) if y[0] > 0.0
    )
max_result.sort(key=lambda item: item[1], reverse=True)
set_result = set()
new_result = []
for item in max_result:
    # The list is sorted best first, so the first hit per row is its best.
    if item[0] not in set_result:
        set_result.add(item[0])
        new_result.append(item)
len(new_result)

In [None]:
# Show the top 5 results of the expanded query.
# The first entry of new_result is skipped (presumably a query row, which is
# row 0 of each TF-IDF matrix); matrix row k maps to paper[k - 1], so the
# row number is already the 1-based paper number.

for x in new_result[1:6]:
    # Print x[0] (not x[0]+1) so the label matches the paper that is shown
    # and agrees with the listing of the unexpanded query.
    print('Result', x[0])
    print('QUERY', x[2])
    print(paper[x[0]-1][1])
    print(paper[x[0]-1][2][:200] + '...')
    print()

In [None]:
# Export every paper as [title, abstract, flag], where flag is 'ok_ok' for
# papers retrieved by the expanded query and '' otherwise.

# A set gives O(1) membership tests; matrix row k maps to paper[k - 1].
idx_res = {x[0] - 1 for x in new_result[1:]}
file = [
    [x[1], x[2], 'ok_ok' if i in idx_res else '']
    for i, x in enumerate(paper)
]
# Create the output folder first so the export cannot fail on a missing directory.
os.makedirs('hasil', exist_ok=True)
df = pd.DataFrame(file)
df.to_excel('hasil/hasil expansion ' +init_query+ '.xlsx', header=False, index=False)