In [1]:
from bs4 import BeautifulSoup
import urllib.request
import string
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity 
import numpy as np
import pickle
import itertools
import pandas as pd

In [2]:
# scrap dari digilib

In [3]:
docs_num = 400

In [4]:
digilib_url = 'http://digilib.its.ac.id/publisher/51100/'

In [5]:
paper = []

In [6]:
for i in range(0,docs_num,20):
    page = urllib.request.urlopen(digilib_url+str(i))
    soup = BeautifulSoup(page, 'html.parser')
    docs = soup.find_all('span', attrs={'class': 'style5'})
    link = []
    for x in docs:
        try:
            link.append(x.find('a').get('href'))
        except:
            pass
    for x in tqdm(link[:20], desc='paper', unit='paper'):
        page = urllib.request.urlopen(x)
        soup = BeautifulSoup(page, 'html.parser')
        try:
            title = soup.find('h2', attrs={'class': 'isi'}).find('i').getText()
            abstract = soup.find('span', attrs={'class': 'teks'}).find('p').getText()
            paper.append([x, title, abstract])
        except:
            pass

paper: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:04<00:00,  4.39paper/s]
paper: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:05<00:00,  3.50paper/s]
paper: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:04<00:00,  4.07paper/s]
paper: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:07<00:00,  2.79paper/s]
paper: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:04<00:00,  4.70paper/s]
paper: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:02<00:00,  9.44p

In [7]:
len(paper)

328

In [8]:
with open('paper_ori.pkl', 'wb') as f:
    pickle.dump(paper, f)

In [9]:
# preprocessing

In [10]:
factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()
stemmer = StemmerFactory().create_stemmer()

In [11]:
words = []
processed_paper = []

In [12]:
for x in tqdm(paper, desc='paper', unit='paper'):
    text = x[2]
    text = text.lower()
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
    text = text.translate(remove_punctuation_map)
    text = stopword.remove(text)
    text = text.split()
    text = [stemmer.stem(x) for x in text]
    processed_paper.append(' '.join(text))
    text = list(set(text))
    words += text

paper: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 328/328 [03:28<00:00,  1.57paper/s]


In [13]:
len(words)

28648

In [14]:
with open('processed_paper.pkl', 'wb') as f:
    pickle.dump(processed_paper, f)

In [15]:
# generate thesaurus

In [16]:
thesaurus = {}

In [17]:
words = list(set(words))

In [18]:
for x in tqdm(words, desc='word', unit='word'):
    name = x
    data = { "q": name }
    encoded_data = urllib.parse.urlencode(data).encode("utf-8")
    content = urllib.request.urlopen("http://www.sinonimkata.com/search.php", encoded_data)
    soup = BeautifulSoup(content, 'html.parser')
    try:
        synonym = soup.find('td', attrs={'width': '90%'}).find_all('a')
        synonym = [x.getText() for x in synonym]
        thesaurus[x] = [x] + synonym
    except:
        thesaurus[x] = [name]

word: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3035/3035 [44:06<00:00,  1.15word/s]


In [19]:
with open('thesaurus.pkl', 'wb') as f:
    pickle.dump(thesaurus, f)

In [20]:
# input query

In [21]:
init_query = 'pengembangan aplikasi'

In [22]:
# tf_idf

In [23]:
vectorizer = TfidfVectorizer(use_idf=True)
query = init_query
query = query.lower()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
query = query.translate(remove_punctuation_map)
query = stopword.remove(query)
query = query.split()
query = [stemmer.stem(x) for x in query]
query

['kembang', 'aplikasi']

In [24]:
max_result = []
x = [' '.join(query)]
paper_tfidf = vectorizer.fit_transform(x + processed_paper)
q = paper_tfidf[0]
result = cosine_similarity(paper_tfidf, q)
idx = np.argsort(-result,axis=0).flatten()    
final = [[num, y[0], x] for num, y in enumerate(result) if y[0] > 0.0]
max_result += final
max_result = sorted(max_result, key=lambda x: x[1], reverse=True)
set_result = set()
new_result = []
for item in max_result:
    if item[0] not in set_result:
        set_result.add(item[0])
        new_result.append(item)
    else:
        pass
new_result

[[0, 1.0, ['kembang aplikasi']],
 [151, 0.3491994284664284, ['kembang aplikasi']],
 [152, 0.3491994284664284, ['kembang aplikasi']],
 [1, 0.23342788528981984, ['kembang aplikasi']],
 [2, 0.23342788528981984, ['kembang aplikasi']],
 [310, 0.22871003186602673, ['kembang aplikasi']],
 [311, 0.22871003186602673, ['kembang aplikasi']],
 [105, 0.2284184757189848, ['kembang aplikasi']],
 [106, 0.2284184757189848, ['kembang aplikasi']],
 [107, 0.2138000317050487, ['kembang aplikasi']],
 [108, 0.2138000317050487, ['kembang aplikasi']],
 [327, 0.21062707873244524, ['kembang aplikasi']],
 [328, 0.21062707873244524, ['kembang aplikasi']],
 [94, 0.20206441208785525, ['kembang aplikasi']],
 [248, 0.17832399390180803, ['kembang aplikasi']],
 [249, 0.17832399390180803, ['kembang aplikasi']],
 [163, 0.17290006551391549, ['kembang aplikasi']],
 [164, 0.17290006551391549, ['kembang aplikasi']],
 [238, 0.16919977141793796, ['kembang aplikasi']],
 [239, 0.16919977141793796, ['kembang aplikasi']],
 [49, 0.1

In [25]:
for x in new_result[1:5]: 
    print('Result', x[0]) 
    print('QUERY', x[2]) 
    print(paper[x[0]-1][1]) 
    print(paper[x[0]-1][2][:200] + '...')
    print()

Result 151
QUERY ['kembang aplikasi']
DEVELOPMENT OF A TOOL TO FACILITATE COMMUNICATION AMONG SOFTWARE DEVELOPERS
Dalam sebuah proyek pengembangan perangkat lunak yang
melibatkan banyak pengembang komunikasi di antara pengembang
sangat penting. Komunikasi dilakukan agar para pengembang dapat
membantu pekerjaan...

Result 152
QUERY ['kembang aplikasi']
DEVELOPMENT OF A TOOL TO FACILITATE COMMUNICATION AMONG SOFTWARE DEVELOPERS
Dalam sebuah proyek pengembangan perangkat lunak yang
melibatkan banyak pengembang komunikasi di antara pengembang
sangat penting. Komunikasi dilakukan agar para pengembang dapat
membantu pekerjaan...

Result 1
QUERY ['kembang aplikasi']
RANCANG BANGUN APLIKASI KOMUNIKASI AUDIO PADA JARINGAN NIRKABEL LOKAL BERBASIS ANDROID MENGGUNAKAN ALGORITMA JOINT CODING RATE CONTROL
Perkembangan aplikasi Android dewasa ini sangat pesat aplikasi dikembangkan untuk menyelesaikan permasalahan diberbagai bidang. Saat ini ada beberapa kategori aplikasi Android yang dikembangkan sep

In [26]:
idx_res = [x[0]-1 for x in new_result[1:]]
file = []
for i,x in enumerate(paper):
    if i in idx_res:
        file.append([x[1],x[2],'ok_ok'])
    else:
        file.append([x[1],x[2],''])
df = pd.DataFrame(file)
df.to_excel('hasil ori ' +init_query+ '.xlsx', header=False, index=False)

In [27]:
# query_expansion

In [28]:
vectorizer = TfidfVectorizer(use_idf=True)
query = init_query
query = query.lower()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
query = query.translate(remove_punctuation_map)
query = stopword.remove(query)
query = query.split()
query = [stemmer.stem(x) for x in query]
query

['kembang', 'aplikasi']

In [29]:
product_query = []
list_synonym = []
for x in query:
    if x in words:
        list_synonym.append(thesaurus[x])
    else:
        name = x
        data = { "q": name }
        encoded_data = urllib.parse.urlencode(data).encode("utf-8")
        content = urllib.request.urlopen("http://www.sinonimkata.com/search.php", encoded_data)
        soup = BeautifulSoup(content, 'html.parser')
        try:
            synonym = soup.find('td', attrs={'width': '90%'}).find_all('a')
            synonym = [x.getText() for x in synonym]
            thesaurus[x] = [x] + synonym
            list_synonym.append(thesaurus[x])
        except:
            list_synonym.append([x])
qs = []
for x in itertools.product(*list_synonym):
    x = [stemmer.stem(y) for y in x]
    qs.append([' '.join(x)])
for x in qs:
    print(x, end=' ')

['kembang aplikasi'] ['kembang operasi'] ['kembang laksana'] ['kembang terap'] ['kembang guna'] ['kembang praktik'] ['kembang rekayasa'] ['kembang lamar'] ['kembang minta'] ['kembang mohon'] ['bunga aplikasi'] ['bunga operasi'] ['bunga laksana'] ['bunga terap'] ['bunga guna'] ['bunga praktik'] ['bunga rekayasa'] ['bunga lamar'] ['bunga minta'] ['bunga mohon'] ['kesuma aplikasi'] ['kesuma operasi'] ['kesuma laksana'] ['kesuma terap'] ['kesuma guna'] ['kesuma praktik'] ['kesuma rekayasa'] ['kesuma lamar'] ['kesuma minta'] ['kesuma mohon'] ['kusuma aplikasi'] ['kusuma operasi'] ['kusuma laksana'] ['kusuma terap'] ['kusuma guna'] ['kusuma praktik'] ['kusuma rekayasa'] ['kusuma lamar'] ['kusuma minta'] ['kusuma mohon'] ['puspa aplikasi'] ['puspa operasi'] ['puspa laksana'] ['puspa terap'] ['puspa guna'] ['puspa praktik'] ['puspa rekayasa'] ['puspa lamar'] ['puspa minta'] ['puspa mohon'] ['puspita aplikasi'] ['puspita operasi'] ['puspita laksana'] ['puspita terap'] ['puspita guna'] ['puspita

In [30]:
max_result = []
for x in qs:
    paper_tfidf = vectorizer.fit_transform(x + processed_paper)
    q = paper_tfidf[0]
    result = cosine_similarity(paper_tfidf, q)
    idx = np.argsort(-result,axis=0).flatten()    
    final = [[num, y[0], x] for num, y in enumerate(result) if y[0] > 0.0]
    max_result += final
max_result = sorted(max_result, key=lambda x: x[1], reverse=True)
set_result = set()
new_result = []
for item in max_result:
    if item[0] not in set_result:
        set_result.add(item[0])
        new_result.append(item)
    else:
        pass
new_result

[[0, 1.0000000000000002, ['kembang lamar']],
 [151, 0.39806499910346327, ['kembang guna']],
 [152, 0.39806499910346327, ['kembang guna']],
 [310, 0.2640909710096908, ['kembang guna']],
 [311, 0.2640909710096908, ['kembang guna']],
 [171, 0.26281909713892165, ['kembang rekayasa']],
 [172, 0.26281909713892165, ['kembang rekayasa']],
 [61, 0.23826645914020597, ['kembang rekayasa']],
 [62, 0.23826645914020597, ['kembang rekayasa']],
 [1, 0.23342788528981984, ['kembang aplikasi']],
 [2, 0.23342788528981984, ['kembang aplikasi']],
 [105, 0.2284184757189848, ['kembang aplikasi']],
 [106, 0.2284184757189848, ['kembang aplikasi']],
 [145, 0.21765653528701104, ['kembang minta']],
 [146, 0.21765653528701104, ['kembang minta']],
 [107, 0.2138000317050487, ['kembang aplikasi']],
 [108, 0.2138000317050487, ['kembang aplikasi']],
 [327, 0.21062707873244524, ['kembang aplikasi']],
 [328, 0.21062707873244524, ['kembang aplikasi']],
 [94, 0.20206441208785525, ['kembang aplikasi']],
 [248, 0.178323993901

In [31]:
for x in new_result[1:5]: 
    print('Result', x[0]+1) 
    print('QUERY', x[2]) 
    print(paper[x[0]-1][1]) 
    print(paper[x[0]-1][2][:200] + '...')
    print()

Result 152
QUERY ['kembang guna']
DEVELOPMENT OF A TOOL TO FACILITATE COMMUNICATION AMONG SOFTWARE DEVELOPERS
Dalam sebuah proyek pengembangan perangkat lunak yang
melibatkan banyak pengembang komunikasi di antara pengembang
sangat penting. Komunikasi dilakukan agar para pengembang dapat
membantu pekerjaan...

Result 153
QUERY ['kembang guna']
DEVELOPMENT OF A TOOL TO FACILITATE COMMUNICATION AMONG SOFTWARE DEVELOPERS
Dalam sebuah proyek pengembangan perangkat lunak yang
melibatkan banyak pengembang komunikasi di antara pengembang
sangat penting. Komunikasi dilakukan agar para pengembang dapat
membantu pekerjaan...

Result 311
QUERY ['kembang guna']
CONTROLLING ROBOT BASED ON IP INTERNET
PROTOCOL THROUGH THE WIRELESS NETWORK
WITH ANDROID MOBILE DEVICE
Pengembangan teknologi informasi pada saat ini telah
mengalami kemajuan yang sangat pesat khususnya dalam
penelitian mengenai sistem jaringan komputer. Salah satu contoh
hasil pengembangan teknolog...

Result 312
QUERY ['kembang guna']
CO

In [32]:
idx_res = [x[0]-1 for x in new_result[1:]]
file = []
for i,x in enumerate(paper):
    if i in idx_res:
        file.append([x[1],x[2],'ok_ok'])
    else:
        file.append([x[1],x[2],''])
df = pd.DataFrame(file)
df.to_excel('hasil expansion ' +init_query+ '.xlsx', header=False, index=False)