## Import libraries

In [1]:
import itertools
import pickle
import string
import urllib.parse
import urllib.request

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

## Import corpus for documents

In [2]:
# Load the paper corpus. Later cells index each entry as paper[i][1] (title)
# and paper[i][2] (text). NOTE: pickle.load can run arbitrary code, so only
# load pickle files you trust.
with open('pickle/paper.pkl', 'rb') as paper_file:
    paper = pickle.load(paper_file)
paper_count = len(paper)
print("Number of papers with abstract found: " + str(paper_count) + " papers.")

Number of papers with abstract found: 328 papers.


## Import preprocessed paper

In [3]:
# Load the preprocessed documents; later cells concatenate this list with the
# query and feed it to the TF-IDF vectorizer. Only load pickle files you trust.
with open('pickle/processed_paper.pkl', 'rb') as processed_file:
    processed_paper = pickle.load(processed_file)

## Import words and thesaurus

In [4]:
# Load the collection of known terms; the expansion cells test membership with
# `in` before reading the thesaurus. Only load pickle files you trust.
with open('pickle/words.pkl', 'rb') as words_file:
    words = pickle.load(words_file)

In [5]:
# Load the thesaurus, a mapping indexed by term whose values are lists of
# synonyms (the expansion cells read and extend it). Only load pickle files
# you trust.
with open('pickle/thesaurus.pkl', 'rb') as thesaurus_file:
    thesaurus = pickle.load(thesaurus_file)

## Testing

In [8]:
# Text-normalisation helpers shared by every test below.
# `stemmer.stem(word)` reduces a word to its stem.
stemmer = StemmerFactory().create_stemmer()

# `stopword.remove(text)` strips stop words from a string.
factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()

### Test 1. Query: 'pengembangan aplikasi'

In [9]:
# insert query here
# Raw, unprocessed search query for this test; it is also used verbatim in the
# names of the result files written by the "save results" cells.

init_query = "pengembangan aplikasi"

#### Without query expansion:

In [10]:
# build tf_idf

# Fresh vectorizer for this run; it is fitted when the query is processed.
vectorizer = TfidfVectorizer(use_idf=True)

# Normalise the raw query: lowercase, strip ASCII punctuation, drop stop words.
remove_punctuation_map = str.maketrans('', '', string.punctuation)
query = init_query.lower().translate(remove_punctuation_map)
query = stopword.remove(query)

# Split on whitespace and stem every token.
query = [stemmer.stem(token) for token in query.split()]
print("Query used: " +' '.join(query))

Query used: kembang aplikasi


In [11]:
# process the query

# Fit TF-IDF on [query] + documents: row 0 of the matrix is the query and
# row n (n >= 1) is processed_paper[n - 1], i.e. paper[n - 1].
x = [' '.join(query)]
paper_tfidf = vectorizer.fit_transform(x + processed_paper)
q = paper_tfidf[0]
result = cosine_similarity(paper_tfidf, q)

# Keep every row with a positive similarity to the query. Row 0 (the query
# itself) is always kept, even when the query vectorises to all zeros (e.g. it
# contained only stop words), because the code below and the following cells
# skip element 0 of new_result as "the query row".
final = [[num, y[0], x] for num, y in enumerate(result) if num == 0 or y[0] > 0.0]
max_result = sorted(final, key=lambda row: row[1], reverse=True)

# Deduplicate by row number, keeping the best-scoring entry (first after sort).
set_result = set()
new_result = []
for item in max_result:
    if item[0] not in set_result:
        set_result.add(item[0])
        new_result.append(item)

# "- 1" and the [1:6] slice both discount the query row at new_result[0].
print("Number of documents returned: " +str(len(new_result)-1)+ ".")
print("Top 5 [document, scores, query]:")
for x in new_result[1:6]:
    print(x)

Number of documents returned: 212.
Top 5 [document, scores, query]:
[151, 0.3491994284664284, ['kembang aplikasi']]
[152, 0.3491994284664284, ['kembang aplikasi']]
[1, 0.23342788528981984, ['kembang aplikasi']]
[2, 0.23342788528981984, ['kembang aplikasi']]
[310, 0.22871003186602673, ['kembang aplikasi']]


In [12]:
# show top 5 results

# new_result[0] is skipped (the other cells treat it as the query row).
# A row number n refers to paper[n - 1]; field 1 is printed as the title and
# the first 200 characters of field 2 as a preview.
for doc_id, _score, used_query in new_result[1:6]:
    print('Result', doc_id)
    print('QUERY', used_query)
    record = paper[doc_id - 1]
    print(record[1])
    print(record[2][:200] + '...')
    print()

Result 151
QUERY ['kembang aplikasi']
DEVELOPMENT OF A TOOL TO FACILITATE COMMUNICATION AMONG SOFTWARE DEVELOPERS
Dalam sebuah proyek pengembangan perangkat lunak yang
melibatkan banyak pengembang komunikasi di antara pengembang
sangat penting. Komunikasi dilakukan agar para pengembang dapat
membantu pekerjaan...

Result 152
QUERY ['kembang aplikasi']
DEVELOPMENT OF A TOOL TO FACILITATE COMMUNICATION AMONG SOFTWARE DEVELOPERS
Dalam sebuah proyek pengembangan perangkat lunak yang
melibatkan banyak pengembang komunikasi di antara pengembang
sangat penting. Komunikasi dilakukan agar para pengembang dapat
membantu pekerjaan...

Result 1
QUERY ['kembang aplikasi']
RANCANG BANGUN APLIKASI KOMUNIKASI AUDIO PADA JARINGAN NIRKABEL LOKAL BERBASIS ANDROID MENGGUNAKAN ALGORITMA JOINT CODING RATE CONTROL
Perkembangan aplikasi Android dewasa ini sangat pesat aplikasi dikembangkan untuk menyelesaikan permasalahan diberbagai bidang. Saat ini ada beberapa kategori aplikasi Android yang dikembangkan sep

In [13]:
# save results to 'result/'

# One spreadsheet row per hit (new_result[0] is skipped): document number,
# the query that matched, the title (field 1) and field 2 of the paper record.
file = [
    [
        'Document: ' + str(hit[0]),
        'Query: ' + hit[2][0],
        'Title: ' + paper[hit[0] - 1][1],
        paper[hit[0] - 1][2],
    ]
    for hit in new_result[1:]
]

print("Saving result to result/" +init_query+ "_original.xlsx..")
out_path = 'result/' +init_query+ '_original.xlsx'
df = pd.DataFrame(file)
df.to_excel(out_path, header=False, index=False)
print("Success.")

Saving result to result/pengembangan aplikasi_original.xlsx..
Success.


#### With query expansion:

In [14]:
# build tf-idf

# Fresh vectorizer for this run; it is refitted for every expanded query.
vectorizer = TfidfVectorizer(use_idf=True)

# Normalise the raw query: lowercase, strip ASCII punctuation, drop stop words.
remove_punctuation_map = str.maketrans('', '', string.punctuation)
query = init_query.lower().translate(remove_punctuation_map)
query = stopword.remove(query)

# Split on whitespace and stem every token.
query = [stemmer.stem(token) for token in query.split()]

In [15]:
# generate query expansion

# Build one list of candidate terms per stemmed query term.
product_query = []  # never read in the visible cells; kept so the namespace is unchanged
list_synonym = []
for x in query:
    if x in words:
        # Known term: use its thesaurus entry.
        list_synonym.append(thesaurus[x])
    else:
        # Unknown term: look it up on sinonimkata.com and cache the answer in
        # the in-memory thesaurus. Any lookup failure falls back to the term
        # itself so the notebook keeps running.
        encoded_data = urllib.parse.urlencode({"q": x}).encode("utf-8")
        try:
            # The timeout prevents a hung request from blocking the cell
            # forever; the context manager closes the HTTP response.
            with urllib.request.urlopen("http://www.sinonimkata.com/search.php",
                                        encoded_data, timeout=10) as content:
                soup = BeautifulSoup(content, 'html.parser')
            synonym = soup.find('td', attrs={'width': '90%'}).find_all('a')
            synonym = [link.getText() for link in synonym]
            thesaurus[x] = [x] + synonym
            list_synonym.append(thesaurus[x])
        except (OSError, AttributeError):
            # OSError: network/HTTP error or timeout.
            # AttributeError: the page has no result cell (find() returned None).
            list_synonym.append([x])

# Cartesian product of the synonym lists gives one expanded query per
# combination. Stemming can map different synonyms to the same stem, so exact
# duplicates are dropped (first occurrence wins); a duplicate would only repeat
# an identical TF-IDF fit. Each entry is a one-element list because the
# processing cell concatenates it with processed_paper.
qs = []
seen_queries = set()
for x in itertools.product(*list_synonym):
    expanded = ' '.join(stemmer.stem(y) for y in x)
    if expanded not in seen_queries:
        seen_queries.add(expanded)
        qs.append([expanded])
print("Queries used:")
for x in qs:
    print("-", x[0])

Queries used:
- kembang aplikasi
- kembang operasi
- kembang laksana
- kembang terap
- kembang guna
- kembang praktik
- kembang rekayasa
- kembang lamar
- kembang minta
- kembang mohon
- bunga aplikasi
- bunga operasi
- bunga laksana
- bunga terap
- bunga guna
- bunga praktik
- bunga rekayasa
- bunga lamar
- bunga minta
- bunga mohon
- kesuma aplikasi
- kesuma operasi
- kesuma laksana
- kesuma terap
- kesuma guna
- kesuma praktik
- kesuma rekayasa
- kesuma lamar
- kesuma minta
- kesuma mohon
- kusuma aplikasi
- kusuma operasi
- kusuma laksana
- kusuma terap
- kusuma guna
- kusuma praktik
- kusuma rekayasa
- kusuma lamar
- kusuma minta
- kusuma mohon
- puspa aplikasi
- puspa operasi
- puspa laksana
- puspa terap
- puspa guna
- puspa praktik
- puspa rekayasa
- puspa lamar
- puspa minta
- puspa mohon
- puspita aplikasi
- puspita operasi
- puspita laksana
- puspita terap
- puspita guna
- puspita praktik
- puspita rekayasa
- puspita lamar
- puspita minta
- puspita mohon
- sari aplikasi
- sa

In [16]:
# process the query

# Score the documents against every expanded query. For each query, TF-IDF is
# fitted on [query] + documents: row 0 is the query and row n (n >= 1) is
# processed_paper[n - 1], i.e. paper[n - 1].
max_result = []
for x in qs:
    paper_tfidf = vectorizer.fit_transform(x + processed_paper)
    q = paper_tfidf[0]
    result = cosine_similarity(paper_tfidf, q)
    # Keep rows with a positive similarity. Row 0 (the query itself) is always
    # kept, even when the query vectorises to all zeros, because the code below
    # and the following cells skip element 0 of new_result as "the query row".
    final = [[num, y[0], x] for num, y in enumerate(result) if num == 0 or y[0] > 0.0]
    max_result.extend(final)
max_result = sorted(max_result, key=lambda row: row[1], reverse=True)

# Deduplicate by row number: after the sort, the first entry seen for a row is
# its best score over all expanded queries.
set_result = set()
new_result = []
for item in max_result:
    if item[0] not in set_result:
        set_result.add(item[0])
        new_result.append(item)

# "- 1" and the [1:6] slice both discount the query row at new_result[0].
print("Number of documents returned: " +str(len(new_result)-1)+ ".")
print("Top 5 [document, scores, query]:")
for x in new_result[1:6]:
    print(x)

Number of documents returned: 316.
Top 5 [document, scores, query]:
[151, 0.39806499910346327, ['kembang guna']]
[152, 0.39806499910346327, ['kembang guna']]
[310, 0.2640909710096908, ['kembang guna']]
[311, 0.2640909710096908, ['kembang guna']]
[171, 0.26281909713892165, ['kembang rekayasa']]


In [17]:
# show top 5 results

# new_result[0] is skipped (the other cells treat it as the query row).
# A row number n refers to paper[n - 1]; field 1 is printed as the title and
# the first 200 characters of field 2 as a preview.
for doc_id, _score, used_query in new_result[1:6]:
    print('Result', doc_id)
    print('QUERY', used_query)
    record = paper[doc_id - 1]
    print(record[1])
    print(record[2][:200] + '...')
    print()

Result 151
QUERY ['kembang guna']
DEVELOPMENT OF A TOOL TO FACILITATE COMMUNICATION AMONG SOFTWARE DEVELOPERS
Dalam sebuah proyek pengembangan perangkat lunak yang
melibatkan banyak pengembang komunikasi di antara pengembang
sangat penting. Komunikasi dilakukan agar para pengembang dapat
membantu pekerjaan...

Result 152
QUERY ['kembang guna']
DEVELOPMENT OF A TOOL TO FACILITATE COMMUNICATION AMONG SOFTWARE DEVELOPERS
Dalam sebuah proyek pengembangan perangkat lunak yang
melibatkan banyak pengembang komunikasi di antara pengembang
sangat penting. Komunikasi dilakukan agar para pengembang dapat
membantu pekerjaan...

Result 310
QUERY ['kembang guna']
CONTROLLING ROBOT BASED ON IP INTERNET
PROTOCOL THROUGH THE WIRELESS NETWORK
WITH ANDROID MOBILE DEVICE
Pengembangan teknologi informasi pada saat ini telah
mengalami kemajuan yang sangat pesat khususnya dalam
penelitian mengenai sistem jaringan komputer. Salah satu contoh
hasil pengembangan teknolog...

Result 311
QUERY ['kembang guna']
CO

In [18]:
# save results to 'result/'

# One spreadsheet row per hit (new_result[0] is skipped): document number,
# the expanded query that matched, the title (field 1) and field 2 of the
# paper record.
file = [
    [
        'Document: ' + str(hit[0]),
        'Query: ' + hit[2][0],
        'Title: ' + paper[hit[0] - 1][1],
        paper[hit[0] - 1][2],
    ]
    for hit in new_result[1:]
]

print("Saving result to result/" +init_query+ "_expansion.xlsx..")
out_path = 'result/' +init_query+ '_expansion.xlsx'
df = pd.DataFrame(file)
df.to_excel(out_path, header=False, index=False)
print("Success.")

Saving result to result/pengembangan aplikasi_expansion.xlsx..
Success.


### Test 2. Query: 'pengolahan dokumen'

In [19]:
# insert query here
# Raw, unprocessed search query for this test; it is also used verbatim in the
# names of the result files written by the "save results" cells.

init_query = "pengolahan dokumen"

#### Without query expansion:

In [27]:
# build tf_idf

# Fresh vectorizer for this run; it is fitted when the query is processed.
vectorizer = TfidfVectorizer(use_idf=True)

# Normalise the raw query: lowercase, strip ASCII punctuation, drop stop words.
remove_punctuation_map = str.maketrans('', '', string.punctuation)
query = init_query.lower().translate(remove_punctuation_map)
query = stopword.remove(query)

# Split on whitespace and stem every token.
query = [stemmer.stem(token) for token in query.split()]
print("Query used: " +' '.join(query))

Query used: olah dokumen


In [28]:
# process the query

# Fit TF-IDF on [query] + documents: row 0 of the matrix is the query and
# row n (n >= 1) is processed_paper[n - 1], i.e. paper[n - 1].
x = [' '.join(query)]
paper_tfidf = vectorizer.fit_transform(x + processed_paper)
q = paper_tfidf[0]
result = cosine_similarity(paper_tfidf, q)

# Keep every row with a positive similarity to the query. Row 0 (the query
# itself) is always kept, even when the query vectorises to all zeros (e.g. it
# contained only stop words), because the code below and the following cells
# skip element 0 of new_result as "the query row".
final = [[num, y[0], x] for num, y in enumerate(result) if num == 0 or y[0] > 0.0]
max_result = sorted(final, key=lambda row: row[1], reverse=True)

# Deduplicate by row number, keeping the best-scoring entry (first after sort).
set_result = set()
new_result = []
for item in max_result:
    if item[0] not in set_result:
        set_result.add(item[0])
        new_result.append(item)

# "- 1" and the [1:6] slice both discount the query row at new_result[0].
print("Number of documents returned: " +str(len(new_result)-1)+ ".")
print("Top 5 [document, scores, query]:")
for x in new_result[1:6]:
    print(x)

Number of documents returned: 57.
Top 5 [document, scores, query]:
[143, 0.633286245058755, ['olah dokumen']]
[144, 0.633286245058755, ['olah dokumen']]
[111, 0.506933668858255, ['olah dokumen']]
[112, 0.506933668858255, ['olah dokumen']]
[14, 0.5045176176751359, ['olah dokumen']]


In [29]:
# show top 5 results

# new_result[0] is skipped (the other cells treat it as the query row).
# A row number n refers to paper[n - 1]; field 1 is printed as the title and
# the first 200 characters of field 2 as a preview.
for doc_id, _score, used_query in new_result[1:6]:
    print('Result', doc_id)
    print('QUERY', used_query)
    record = paper[doc_id - 1]
    print(record[1])
    print(record[2][:200] + '...')
    print()

Result 143
QUERY ['olah dokumen']
DESIGN OF DOCUMENT GROUPING MODULE IN ADMINISTRATION INFORMATION SYSTEM MANAGEMENTRANCANG
Saat ini sudah banyak sekali dokumen-dokumen yang
dipublikasikan di internet maupun media lainnya. Dokumen
merupakan salah satu produk yang dihasilkan dalam sebuah
tatanan organisasi mulai dari per...

Result 144
QUERY ['olah dokumen']
DESIGN OF DOCUMENT GROUPING MODULE IN ADMINISTRATION INFORMATION SYSTEM MANAGEMENTRANCANG
Saat ini sudah banyak sekali dokumen-dokumen yang
dipublikasikan di internet maupun media lainnya. Dokumen
merupakan salah satu produk yang dihasilkan dalam sebuah
tatanan organisasi mulai dari per...

Result 111
QUERY ['olah dokumen']
DESIGN PF DOCUMENT TRACKING SYSTEM UTILIZING FUSE AND SAMBA FILE-SYSTEM CASE STUDY IN PT. ANEKA TUNA INDONESIA
Saat ini dokumen digital merupakan aspek penting dalam dunia bisnis. Dokumen digital memuat data-data penting
perusahaan baik berupa data produksi sampai data keuangan. Dokumen digital dalam perusaha...


In [30]:
# save results to 'result/'

# One spreadsheet row per hit (new_result[0] is skipped): document number,
# the query that matched, the title (field 1) and field 2 of the paper record.
file = [
    [
        'Document: ' + str(hit[0]),
        'Query: ' + hit[2][0],
        'Title: ' + paper[hit[0] - 1][1],
        paper[hit[0] - 1][2],
    ]
    for hit in new_result[1:]
]

print("Saving result to result/" +init_query+ "_original.xlsx..")
out_path = 'result/' +init_query+ '_original.xlsx'
df = pd.DataFrame(file)
df.to_excel(out_path, header=False, index=False)
print("Success.")

Saving result to result/pengolahan dokumen_original.xlsx..
Success.


#### With query expansion:

In [31]:
# build tf-idf

# Fresh vectorizer for this run; it is refitted for every expanded query.
vectorizer = TfidfVectorizer(use_idf=True)

# Normalise the raw query: lowercase, strip ASCII punctuation, drop stop words.
remove_punctuation_map = str.maketrans('', '', string.punctuation)
query = init_query.lower().translate(remove_punctuation_map)
query = stopword.remove(query)

# Split on whitespace and stem every token.
query = [stemmer.stem(token) for token in query.split()]

In [32]:
# generate query expansion

# Build one list of candidate terms per stemmed query term.
product_query = []  # never read in the visible cells; kept so the namespace is unchanged
list_synonym = []
for x in query:
    if x in words:
        # Known term: use its thesaurus entry.
        list_synonym.append(thesaurus[x])
    else:
        # Unknown term: look it up on sinonimkata.com and cache the answer in
        # the in-memory thesaurus. Any lookup failure falls back to the term
        # itself so the notebook keeps running.
        encoded_data = urllib.parse.urlencode({"q": x}).encode("utf-8")
        try:
            # The timeout prevents a hung request from blocking the cell
            # forever; the context manager closes the HTTP response.
            with urllib.request.urlopen("http://www.sinonimkata.com/search.php",
                                        encoded_data, timeout=10) as content:
                soup = BeautifulSoup(content, 'html.parser')
            synonym = soup.find('td', attrs={'width': '90%'}).find_all('a')
            synonym = [link.getText() for link in synonym]
            thesaurus[x] = [x] + synonym
            list_synonym.append(thesaurus[x])
        except (OSError, AttributeError):
            # OSError: network/HTTP error or timeout.
            # AttributeError: the page has no result cell (find() returned None).
            list_synonym.append([x])

# Cartesian product of the synonym lists gives one expanded query per
# combination. Stemming can map different synonyms to the same stem, so exact
# duplicates are dropped (first occurrence wins); a duplicate would only repeat
# an identical TF-IDF fit. Each entry is a one-element list because the
# processing cell concatenates it with processed_paper.
qs = []
seen_queries = set()
for x in itertools.product(*list_synonym):
    expanded = ' '.join(stemmer.stem(y) for y in x)
    if expanded not in seen_queries:
        seen_queries.add(expanded)
        qs.append([expanded])
print("Queries used:")
for x in qs:
    print("-", x[0])

Queries used:
- olah dokumen
- olah akta
- olah arsip
- olah inskripsi
- olah kopi
- olah manuskrip
- olah tinggal
- olah piagam
- olah sahifah
- olah salin
- olah sertifikat
- olah surat
- olah tembus
- olah tindas
- akal dokumen
- akal akta
- akal arsip
- akal inskripsi
- akal kopi
- akal manuskrip
- akal tinggal
- akal piagam
- akal sahifah
- akal salin
- akal sertifikat
- akal surat
- akal tembus
- akal tindas
- canda dokumen
- canda akta
- canda arsip
- canda inskripsi
- canda kopi
- canda manuskrip
- canda tinggal
- canda piagam
- canda sahifah
- canda salin
- canda sertifikat
- canda surat
- canda tembus
- canda tindas
- cara dokumen
- cara akta
- cara arsip
- cara inskripsi
- cara kopi
- cara manuskrip
- cara tinggal
- cara piagam
- cara sahifah
- cara salin
- cara sertifikat
- cara surat
- cara tembus
- cara tindas
- daya dokumen
- daya akta
- daya arsip
- daya inskripsi
- daya kopi
- daya manuskrip
- daya tinggal
- daya piagam
- daya sahifah
- daya salin
- daya sertifikat
- d

In [33]:
# process the query

# Score the documents against every expanded query. For each query, TF-IDF is
# fitted on [query] + documents: row 0 is the query and row n (n >= 1) is
# processed_paper[n - 1], i.e. paper[n - 1].
max_result = []
for x in qs:
    paper_tfidf = vectorizer.fit_transform(x + processed_paper)
    q = paper_tfidf[0]
    result = cosine_similarity(paper_tfidf, q)
    # Keep rows with a positive similarity. Row 0 (the query itself) is always
    # kept, even when the query vectorises to all zeros, because the code below
    # and the following cells skip element 0 of new_result as "the query row".
    final = [[num, y[0], x] for num, y in enumerate(result) if num == 0 or y[0] > 0.0]
    max_result.extend(final)
max_result = sorted(max_result, key=lambda row: row[1], reverse=True)

# Deduplicate by row number: after the sort, the first entry seen for a row is
# its best score over all expanded queries.
set_result = set()
new_result = []
for item in max_result:
    if item[0] not in set_result:
        set_result.add(item[0])
        new_result.append(item)

# "- 1" and the [1:6] slice both discount the query row at new_result[0].
print("Number of documents returned: " +str(len(new_result)-1)+ ".")
print("Top 5 [document, scores, query]:")
for x in new_result[1:6]:
    print(x)

Number of documents returned: 278.
Top 5 [document, scores, query]:
[143, 0.7204574549151622, ['laku dokumen']]
[144, 0.7204574549151622, ['laku dokumen']]
[111, 0.6042806959241686, ['laku dokumen']]
[112, 0.6042806959241686, ['laku dokumen']]
[14, 0.6014006683199479, ['laku dokumen']]


In [34]:
# show top 5 results

# new_result[0] is skipped (the other cells treat it as the query row).
# A row number n refers to paper[n - 1]; field 1 is printed as the title and
# the first 200 characters of field 2 as a preview.
for doc_id, _score, used_query in new_result[1:6]:
    print('Result', doc_id)
    print('QUERY', used_query)
    record = paper[doc_id - 1]
    print(record[1])
    print(record[2][:200] + '...')
    print()

Result 143
QUERY ['laku dokumen']
DESIGN OF DOCUMENT GROUPING MODULE IN ADMINISTRATION INFORMATION SYSTEM MANAGEMENTRANCANG
Saat ini sudah banyak sekali dokumen-dokumen yang
dipublikasikan di internet maupun media lainnya. Dokumen
merupakan salah satu produk yang dihasilkan dalam sebuah
tatanan organisasi mulai dari per...

Result 144
QUERY ['laku dokumen']
DESIGN OF DOCUMENT GROUPING MODULE IN ADMINISTRATION INFORMATION SYSTEM MANAGEMENTRANCANG
Saat ini sudah banyak sekali dokumen-dokumen yang
dipublikasikan di internet maupun media lainnya. Dokumen
merupakan salah satu produk yang dihasilkan dalam sebuah
tatanan organisasi mulai dari per...

Result 111
QUERY ['laku dokumen']
DESIGN PF DOCUMENT TRACKING SYSTEM UTILIZING FUSE AND SAMBA FILE-SYSTEM CASE STUDY IN PT. ANEKA TUNA INDONESIA
Saat ini dokumen digital merupakan aspek penting dalam dunia bisnis. Dokumen digital memuat data-data penting
perusahaan baik berupa data produksi sampai data keuangan. Dokumen digital dalam perusaha...


In [35]:
# save results to 'result/'

# One spreadsheet row per hit (new_result[0] is skipped): document number,
# the expanded query that matched, the title (field 1) and field 2 of the
# paper record.
file = [
    [
        'Document: ' + str(hit[0]),
        'Query: ' + hit[2][0],
        'Title: ' + paper[hit[0] - 1][1],
        paper[hit[0] - 1][2],
    ]
    for hit in new_result[1:]
]

print("Saving result to result/" +init_query+ "_expansion.xlsx..")
out_path = 'result/' +init_query+ '_expansion.xlsx'
df = pd.DataFrame(file)
df.to_excel(out_path, header=False, index=False)
print("Success.")

Saving result to result/pengolahan dokumen_expansion.xlsx..
Success.


### Test 3. Query: 'deteksi kendaraan'

In [38]:
# insert query here
# Raw, unprocessed search query for this test; it is also used verbatim in the
# names of the result files written by the "save results" cells.

init_query = "deteksi kendaraan"

#### Without query expansion:

In [39]:
# build tf_idf

# Fresh vectorizer for this run; it is fitted when the query is processed.
vectorizer = TfidfVectorizer(use_idf=True)

# Normalise the raw query: lowercase, strip ASCII punctuation, drop stop words.
remove_punctuation_map = str.maketrans('', '', string.punctuation)
query = init_query.lower().translate(remove_punctuation_map)
query = stopword.remove(query)

# Split on whitespace and stem every token.
query = [stemmer.stem(token) for token in query.split()]
print("Query used: " +' '.join(query))

Query used: deteksi kendara


In [40]:
# process the query

# Fit TF-IDF on [query] + documents: row 0 of the matrix is the query and
# row n (n >= 1) is processed_paper[n - 1], i.e. paper[n - 1].
x = [' '.join(query)]
paper_tfidf = vectorizer.fit_transform(x + processed_paper)
q = paper_tfidf[0]
result = cosine_similarity(paper_tfidf, q)

# Keep every row with a positive similarity to the query. Row 0 (the query
# itself) is always kept, even when the query vectorises to all zeros (e.g. it
# contained only stop words), because the code below and the following cells
# skip element 0 of new_result as "the query row".
final = [[num, y[0], x] for num, y in enumerate(result) if num == 0 or y[0] > 0.0]
max_result = sorted(final, key=lambda row: row[1], reverse=True)

# Deduplicate by row number, keeping the best-scoring entry (first after sort).
set_result = set()
new_result = []
for item in max_result:
    if item[0] not in set_result:
        set_result.add(item[0])
        new_result.append(item)

# "- 1" and the [1:6] slice both discount the query row at new_result[0].
print("Number of documents returned: " +str(len(new_result)-1)+ ".")
print("Top 5 [document, scores, query]:")
for x in new_result[1:6]:
    print(x)

Number of documents returned: 83.
Top 5 [document, scores, query]:
[213, 0.34032856938290673, ['deteksi kendara']]
[214, 0.34032856938290673, ['deteksi kendara']]
[76, 0.2948584016076843, ['deteksi kendara']]
[77, 0.2948584016076843, ['deteksi kendara']]
[179, 0.26116575798253344, ['deteksi kendara']]


In [41]:
# show top 5 results

# new_result[0] is skipped (the other cells treat it as the query row).
# A row number n refers to paper[n - 1]; field 1 is printed as the title and
# the first 200 characters of field 2 as a preview.
for doc_id, _score, used_query in new_result[1:6]:
    print('Result', doc_id)
    print('QUERY', used_query)
    record = paper[doc_id - 1]
    print(record[1])
    print(record[2][:200] + '...')
    print()

Result 213
QUERY ['deteksi kendara']
PROTOTYPE OF EMISSION TEST DEVICE FOR VEHICLE USING ARDUINO MICROCONTROLLER
Belakangan ini Dinas Perhubungan kembali menggalakkan pengujian kendaraan bermotor kepada masyarakat agar masyarakat paham dengan kondisi mesin kendaraannya maupun dampak emisinya terhadap lingkungan ...

Result 214
QUERY ['deteksi kendara']
PROTOTYPE OF EMISSION TEST DEVICE FOR VEHICLE USING ARDUINO MICROCONTROLLER
Belakangan ini Dinas Perhubungan kembali menggalakkan pengujian kendaraan bermotor kepada masyarakat agar masyarakat paham dengan kondisi mesin kendaraannya maupun dampak emisinya terhadap lingkungan ...

Result 76
QUERY ['deteksi kendara']
FORECASTING NUMBER OF VEHICLE IN ROAD USING MULTILAYER PERCEPTRON NEURAL NETWORK WITH AND WITHOUT LINEAR REGRESSION
Jumlah kendaraan yang berada kota-kota besar Indonesia semakin lama semakin bertambah tanpa diimbangi oleh
pertambahan jumlah jalan yang sebanding. Setelah beberapa tahun masalah ini tidak ditangani ...

Result 7

In [42]:
# save results to 'result/'

# One spreadsheet row per hit (new_result[0] is skipped): document number,
# the query that matched, the title (field 1) and field 2 of the paper record.
file = [
    [
        'Document: ' + str(hit[0]),
        'Query: ' + hit[2][0],
        'Title: ' + paper[hit[0] - 1][1],
        paper[hit[0] - 1][2],
    ]
    for hit in new_result[1:]
]

print("Saving result to result/" +init_query+ "_original.xlsx..")
out_path = 'result/' +init_query+ '_original.xlsx'
df = pd.DataFrame(file)
df.to_excel(out_path, header=False, index=False)
print("Success.")

Saving result to result/deteksi kendaraan_original.xlsx..
Success.


#### With query expansion:

In [43]:
# build tf-idf

# Fresh vectorizer for this run; it is refitted for every expanded query.
vectorizer = TfidfVectorizer(use_idf=True)

# Normalise the raw query: lowercase, strip ASCII punctuation, drop stop words.
remove_punctuation_map = str.maketrans('', '', string.punctuation)
query = init_query.lower().translate(remove_punctuation_map)
query = stopword.remove(query)

# Split on whitespace and stem every token.
query = [stemmer.stem(token) for token in query.split()]

In [44]:
# generate query expansion

# Build one list of candidate terms per stemmed query term.
product_query = []  # never read in the visible cells; kept so the namespace is unchanged
list_synonym = []
for x in query:
    if x in words:
        # Known term: use its thesaurus entry.
        list_synonym.append(thesaurus[x])
    else:
        # Unknown term: look it up on sinonimkata.com and cache the answer in
        # the in-memory thesaurus. Any lookup failure falls back to the term
        # itself so the notebook keeps running.
        encoded_data = urllib.parse.urlencode({"q": x}).encode("utf-8")
        try:
            # The timeout prevents a hung request from blocking the cell
            # forever; the context manager closes the HTTP response.
            with urllib.request.urlopen("http://www.sinonimkata.com/search.php",
                                        encoded_data, timeout=10) as content:
                soup = BeautifulSoup(content, 'html.parser')
            synonym = soup.find('td', attrs={'width': '90%'}).find_all('a')
            synonym = [link.getText() for link in synonym]
            thesaurus[x] = [x] + synonym
            list_synonym.append(thesaurus[x])
        except (OSError, AttributeError):
            # OSError: network/HTTP error or timeout.
            # AttributeError: the page has no result cell (find() returned None).
            list_synonym.append([x])

# Cartesian product of the synonym lists gives one expanded query per
# combination. Stemming can map different synonyms to the same stem, so exact
# duplicates are dropped (first occurrence wins); a duplicate would only repeat
# an identical TF-IDF fit. Each entry is a one-element list because the
# processing cell concatenates it with processed_paper.
qs = []
seen_queries = set()
for x in itertools.product(*list_synonym):
    expanded = ' '.join(stemmer.stem(y) for y in x)
    if expanded not in seen_queries:
        seen_queries.add(expanded)
        qs.append([expanded])
print("Queries used:")
for x in qs:
    print("-", x[0])

Queries used:
- deteksi kendara
- deteksi bawa
- deteksi naik
- deteksi kemudi
- deteksi kendali
- deteksi gandar
- deteksi jalan
- deteksi tumpang
- deteksi tunggang
- deteksi setir
- temu kendara
- temu bawa
- temu naik
- temu kemudi
- temu kendali
- temu gandar
- temu jalan
- temu tumpang
- temu tunggang
- temu setir
- indra kendara
- indra bawa
- indra naik
- indra kemudi
- indra kendali
- indra gandar
- indra jalan
- indra tumpang
- indra tunggang
- indra setir


In [45]:
# process the query

# Score the documents against every expanded query. For each query, TF-IDF is
# fitted on [query] + documents: row 0 is the query and row n (n >= 1) is
# processed_paper[n - 1], i.e. paper[n - 1].
max_result = []
for x in qs:
    paper_tfidf = vectorizer.fit_transform(x + processed_paper)
    q = paper_tfidf[0]
    result = cosine_similarity(paper_tfidf, q)
    # Keep rows with a positive similarity. Row 0 (the query itself) is always
    # kept, even when the query vectorises to all zeros, because the code below
    # and the following cells skip element 0 of new_result as "the query row".
    final = [[num, y[0], x] for num, y in enumerate(result) if num == 0 or y[0] > 0.0]
    max_result.extend(final)
max_result = sorted(max_result, key=lambda row: row[1], reverse=True)

# Deduplicate by row number: after the sort, the first entry seen for a row is
# its best score over all expanded queries.
set_result = set()
new_result = []
for item in max_result:
    if item[0] not in set_result:
        set_result.add(item[0])
        new_result.append(item)

# "- 1" and the [1:6] slice both discount the query row at new_result[0].
print("Number of documents returned: " +str(len(new_result)-1)+ ".")
print("Top 5 [document, scores, query]:")
for x in new_result[1:6]:
    print(x)

Number of documents returned: 185.
Top 5 [document, scores, query]:
[213, 0.3413036424835403, ['temu kendara']]
[214, 0.3413036424835403, ['temu kendara']]
[310, 0.3087636172346658, ['deteksi kendali']]
[311, 0.3087636172346658, ['deteksi kendali']]
[28, 0.29679900475608745, ['deteksi naik']]


In [46]:
# show top 5 results

# new_result[0] is skipped (the other cells treat it as the query row).
# A row number n refers to paper[n - 1]; field 1 is printed as the title and
# the first 200 characters of field 2 as a preview.
for doc_id, _score, used_query in new_result[1:6]:
    print('Result', doc_id)
    print('QUERY', used_query)
    record = paper[doc_id - 1]
    print(record[1])
    print(record[2][:200] + '...')
    print()

Result 213
QUERY ['temu kendara']
PROTOTYPE OF EMISSION TEST DEVICE FOR VEHICLE USING ARDUINO MICROCONTROLLER
Belakangan ini Dinas Perhubungan kembali menggalakkan pengujian kendaraan bermotor kepada masyarakat agar masyarakat paham dengan kondisi mesin kendaraannya maupun dampak emisinya terhadap lingkungan ...

Result 214
QUERY ['temu kendara']
PROTOTYPE OF EMISSION TEST DEVICE FOR VEHICLE USING ARDUINO MICROCONTROLLER
Belakangan ini Dinas Perhubungan kembali menggalakkan pengujian kendaraan bermotor kepada masyarakat agar masyarakat paham dengan kondisi mesin kendaraannya maupun dampak emisinya terhadap lingkungan ...

Result 310
QUERY ['deteksi kendali']
CONTROLLING ROBOT BASED ON IP INTERNET
PROTOCOL THROUGH THE WIRELESS NETWORK
WITH ANDROID MOBILE DEVICE
Pengembangan teknologi informasi pada saat ini telah
mengalami kemajuan yang sangat pesat khususnya dalam
penelitian mengenai sistem jaringan komputer. Salah satu contoh
hasil pengembangan teknolog...

Result 311
QUERY ['deteksi 

In [47]:
# save results to 'result/'

# One spreadsheet row per hit (new_result[0] is skipped): document number,
# the expanded query that matched, the title (field 1) and field 2 of the
# paper record.
file = [
    [
        'Document: ' + str(hit[0]),
        'Query: ' + hit[2][0],
        'Title: ' + paper[hit[0] - 1][1],
        paper[hit[0] - 1][2],
    ]
    for hit in new_result[1:]
]

print("Saving result to result/" +init_query+ "_expansion.xlsx..")
out_path = 'result/' +init_query+ '_expansion.xlsx'
df = pd.DataFrame(file)
df.to_excel(out_path, header=False, index=False)
print("Success.")

Saving result to result/deteksi kendaraan_expansion.xlsx..
Success.
