In [1]:
from bs4 import BeautifulSoup
import urllib.request
import string
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [2]:
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity 
import numpy as np

In [3]:
import itertools

In [4]:
# scrap dari digilib

In [5]:
docs_num = 400

In [6]:
digilib_url = 'http://digilib.its.ac.id/publisher/51100/'

In [7]:
paper = []

In [8]:
for i in range(0,docs_num,20):
    page = urllib.request.urlopen(digilib_url)
    soup = BeautifulSoup(page, 'html.parser')
    docs = soup.find_all('span', attrs={'class': 'style5'})
    link = []
    for x in docs:
        try:
            link.append(x.find('a').get('href'))
        except:
            pass
    for x in tqdm(link[:20], desc='paper', unit='paper'):
        page = urllib.request.urlopen(x)
        soup = BeautifulSoup(page, 'html.parser')
        try:
            title = soup.find('h2', attrs={'class': 'isi'}).find('i').getText()
            abstract = soup.find('span', attrs={'class': 'teks'}).find('p').getText()
            paper.append([x, title, abstract])
        except:
            pass

paper: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:02<00:00,  7.20paper/s]
paper: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:05<00:00,  3.89paper/s]
paper: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:02<00:00,  7.12paper/s]
paper: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:02<00:00,  7.15paper/s]
paper: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:02<00:00,  7.00paper/s]
paper: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:05<00:00,  3.87p

In [9]:
len(paper)

360

In [10]:
# preprocessing

In [11]:
factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()
stemmer = StemmerFactory().create_stemmer()

In [12]:
words = []
processed_paper = []

In [13]:
for x in tqdm(paper, desc='paper', unit='paper'):
    text = x[2]
    text = text.lower()
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
    text = text.translate(remove_punctuation_map)
    text = stopword.remove(text)
    text = text.split()
    text = [stemmer.stem(x) for x in text]
    processed_paper.append(' '.join(text))
    text = list(set(text))
    words += text

paper: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 360/360 [00:22<00:00, 15.85paper/s]


In [14]:
len(words)

29320

In [15]:
# generate thesaurus

In [16]:
thesaurus = {}

In [17]:
words = list(set(words))

In [18]:
for x in tqdm(words, desc='word', unit='word'):
    name = x
    data = { "q": name }
    encoded_data = urllib.parse.urlencode(data).encode("utf-8")
    content = urllib.request.urlopen("http://www.sinonimkata.com/search.php", encoded_data)
    soup = BeautifulSoup(content, 'html.parser')
    try:
        synonym = soup.find('td', attrs={'width': '90%'}).find_all('a')
        synonym = [x.getText() for x in synonym]
        thesaurus[x] = [x] + synonym
    except:
        thesaurus[x] = [name]

word: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 502/502 [07:22<00:00,  1.14word/s]


In [19]:
# tf_idf

In [20]:
vectorizer = TfidfVectorizer(use_idf=True)

In [38]:
query = 'pengenalan kendaraan'

In [39]:
query = query.lower()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
query = query.translate(remove_punctuation_map)
query = stopword.remove(query)
query = query.split()
query = [stemmer.stem(x) for x in query]

In [40]:
query

['kenal', 'kendara']

In [41]:
product_query = []
list_synonym = []

In [42]:
for x in query:
    if x in words:
        list_synonym.append(thesaurus[x])
    else:
        name = x
        data = { "q": name }
        encoded_data = urllib.parse.urlencode(data).encode("utf-8")
        content = urllib.request.urlopen("http://www.sinonimkata.com/search.php", encoded_data)
        soup = BeautifulSoup(content, 'html.parser')
        try:
            synonym = soup.find('td', attrs={'width': '90%'}).find_all('a')
            synonym = [x.getText() for x in synonym]
            thesaurus[x] = [x] + synonym
            list_synonym.append(thesaurus[x])
        except:
            list_synonym.append([x])

In [56]:
max_score = 0
max_q = ''
max_result = 0

In [57]:
for x in itertools.product(*list_synonym):
    x = [stemmer.stem(y) for y in x]
    q = [' '.join(x)]
    paper_tfidf = vectorizer.fit_transform(q + processed_paper)
    q = paper_tfidf[0]
    result = cosine_similarity(paper_tfidf, q)
    idx = np.argsort(-result,axis=0).flatten()
    final = [r for r in result if r[0] > 0.0]
    if len(final) >= max_result:
        if len(final) == max_result:
            if result[idx[1]][0] > max_score:
                max_score = result[idx[1]][0]
                max_q = [' '.join(x)]
                max_result = len(final)
        else:
            max_score = result[idx[1]][0]
            max_q = [' '.join(x)]
            max_result = len(final)

In [58]:
print(max_result, max_score, max_q)

61 0.021086044691723642 ['kenal kendali']


In [59]:
paper_tfidf = vectorizer.fit_transform(max_q + processed_paper) 
max_q = paper_tfidf[0]
result = cosine_similarity(paper_tfidf, max_q)
idx = np.argsort(-result,axis=0)[1:6].flatten()
for x in idx:
    print('Paper', x, 'score :', result[x][0])

Paper 72 score : 0.021086044691723642
Paper 90 score : 0.021086044691723642
Paper 108 score : 0.021086044691723642
Paper 126 score : 0.021086044691723642
Paper 144 score : 0.021086044691723642


In [60]:
for x in range(len(idx)): 
    print('Result', x+1) 
    print(paper[idx[x]-1][1]) 
    print(paper[idx[x]-1][2])
    print()

Result 1
IMPLEMENTASI MODUL HOUSE EDITOR DAN FITUR SOSIAL PADA APLIKASI PERMAINAN SOSIAL CARD WARLOCK SAGA PADA PERANGKAT ANDROID
Card Warlock Saga adalah aplikasi permainan sosial yang juga
mengandung unsur RPG turn-based game dan card game sehingga bisa
dikatakan permainan ini bergenre turn-based collectible card yaitu bertema
turn-based yang menggunakan kartu sebagai mekanisme pertarungan
permainan. Pemain memiliki satu hero yang dikendalikan dan set kartu
sebagai mekanisme pertarungan.
Dikarenakan permainan ini merupakan permainan sosial maka
permainan ini dibuat agar antar pemain dapat terjadi interaksi sehingga
diimplementasikanlah fitur-fitur yang mendukung interaksi antar pemain
seperti menambah teman saling bertukar barang dan mengirim pesan. Dan
permainan ini akan diintegrasikan dengan jejaring sosial Facebook.
Integrasi antara aplikasi permainan ini dengan jejaring sosial
Facebook akan dibuat menggunakan kakas Facebook SDK for Unity dengan
menggunakan layanan Facebook API. D