# Import Libraries

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
import numpy as np
import numpy.linalg as LA
import pandas as pd
import string 
from openpyxl import load_workbook

# Custom Libraries

In [2]:
class Engine:
    """Rank documents against queries by cosine similarity of term counts.

    Documents are registered with ``addDocument`` and queries with
    ``setQuery``; ``process_score`` then builds a bag-of-words vocabulary from
    the documents and scores every (query, document) pair.

    NOTE(review): only raw term counts from ``CountVectorizer`` are compared.
    Earlier revisions instantiated a ``TfidfTransformer`` (and loaded an
    English stop-word list) without ever applying them, so no TF-IDF weighting
    or stop-word removal happens here -- confirm whether that is intended.
    """

    def __init__(self):
        self.cosine_score = []  # Not populated by any method of this class.
        self.train_set = []  # Documents
        self.test_set = []  # Query

    def addDocument(self, word):
        """Append one document string to the corpus."""
        self.train_set.append(word)

    def setQuery(self, word):
        """Append one query string to the list of queries to score."""
        self.test_set.append(word)

    @staticmethod
    def _cosine(a, b):
        """Return the cosine similarity of two 1-D vectors, rounded to 3 places.

        When either vector has zero length the similarity is undefined
        (0 / 0); 0.0 is returned in that case instead of ``nan`` so that
        callers comparing scores against a threshold get a plain number.
        """
        denominator = LA.norm(a) * LA.norm(b)
        if denominator == 0:
            return 0.0
        return round(np.inner(a, b) / denominator, 3)

    def process_score(self):
        """Score every registered document against every registered query.

        Returns:
            list[list[float]]: one inner list per query (in ``setQuery``
            order); each inner list holds that query's similarity to every
            document (in ``addDocument`` order).
        """
        vectorizer = CountVectorizer()

        # The vocabulary is fitted on the documents only; the queries are then
        # projected onto that same vocabulary.
        trainVectorizerArray = vectorizer.fit_transform(self.train_set).toarray()
        testVectorizerArray = vectorizer.transform(self.test_set).toarray()

        return [
            [self._cosine(vector, testV) for vector in trainVectorizerArray]
            for testV in testVectorizerArray
        ]

    def check_tag(self, tag, tags):
        """Return True when ``tag`` is contained in ``tags``, otherwise False."""
        return tag in tags

# Load Dataset Khotbah

In [3]:
# `sep` and `encoding` are CSV-reader options; read_excel does not accept them
# (current pandas raises TypeError), so the workbook is read with defaults.
data = pd.read_excel('preprocessed-dataset.xlsx')
dt = data['preprocessed']
dt

0      keliru masalah syarat sah shalat ibadallah sun...
1      allah maha tahu apa kerja khotib wasiat diri k...
2      duduk ulama tengah masyarakat antara nikmat al...
3      pimpin sesat ibadallah hidup zaman penuh fitna...
4      dzikir ingat allah kuat jiwa raga sungguh oran...
5      banding nikmat dunia nikmat akhirat hadirin ra...
6      rahasia baca surat al fatihah ibadallah takwa ...
7      butuh taubat ibadallah khotib wasiat diri khot...
8      utama taubat istighfar ibadallah mari takwa al...
9      metadabburi alquran surat yunus khotib wasiat ...
10     jaga waktu kaum muslimin khotib wasiat diri kh...
11     hal tunjuk kuat imam ibadallah khotib wasiat d...
12     khotbah masjid haram utama syukur jadi orang s...
13     khawarij kontemporer ibadallah abad ini sunggu...
14     sejarah awal mula ronta islam ibadallah catat ...
15     cara allah laku orang taat dosa takwa allah be...
16     mimpi pandang islam ayyuhal mukminun takwa all...
17     utama ilmu agama kiat da

# Load Dataset Khotbah Query

In [8]:
# `sep` and `encoding` are CSV-reader options; read_excel does not accept them
# (current pandas raises TypeError), so the workbook is read with defaults.
queri = pd.read_excel('trend.xlsx')
# Positional slice of length one: keeps a Series (not a scalar) holding only
# the row at position 10, so the cells below can iterate over it.
queries = queri['trend'].iloc[10:11]
queries

10    indonesia memilih
Name: trend, dtype: object

# TF-IDF dan Cosine Similarity

In [9]:
# A new Engine is created every time this cell runs, so its document and
# query lists always start empty.
engine = Engine()

# Each entry of the 'preprocessed' column is coerced to str before use.
list_dokumenkhotbah = list(map(str, data['preprocessed']))

# Human-readable, 1-based identifiers: Document_1, Document_2, ...
columnNames = [
    "Document_{}".format(position)
    for position in range(1, len(list_dokumenkhotbah) + 1)
]

for doc in list_dokumenkhotbah:
    engine.addDocument(doc)

for query in queries:
    engine.setQuery(query)

# process_score() returns one row per query; transposing puts documents on
# the rows and one column per query.
doc_score = engine.process_score()
docScoreDf = pd.DataFrame(doc_score).transpose()
docScoreDf.columns = queries
docScoreDf["Documents"] = columnNames
docScoreDf["Link"] = data["Link"].values
docScoreDf

trend,indonesia memilih,Documents,Link
0,0.000,Document_1,https://khotbahjumat.com/5254-kekeliruan-dalam...
1,0.000,Document_2,https://khotbahjumat.com/5251-allah-maha-menge...
2,0.000,Document_3,https://khotbahjumat.com/5232-kedudukan-ulama-...
3,0.000,Document_4,https://khotbahjumat.com/5223-pemimpin-yang-me...
4,0.000,Document_5,https://khotbahjumat.com/5213-berdzikir-mengin...
5,0.000,Document_6,https://khotbahjumat.com/5190-perbandingan-nik...
6,0.000,Document_7,https://khotbahjumat.com/5179-rahasia-dalam-ba...
7,0.000,Document_8,https://khotbahjumat.com/5176-kita-butuh-berta...
8,0.000,Document_9,https://khotbahjumat.com/5173-keutamaan-taubat...
9,0.000,Document_10,https://khotbahjumat.com/5156-menadabburi-alqu...


# Label Relevan

In [10]:
# For every query build a table of (score, document id, label, link), where
# the label is 1 for a strictly positive score and 0 otherwise, and store the
# table sorted from the highest score to the lowest.
df_listed = []
for i in queries:
    labels = [1 if score > 0.000 else 0 for score in docScoreDf[i]]
    datadf = pd.DataFrame(docScoreDf[i]).assign(
        Documents=docScoreDf['Documents'],
        Label=labels,
        Link=docScoreDf["Link"].values,
    )
    df_listed.append(datadf.sort_values(by=[i], ascending=False))
df_listed

[     indonesia memilih     Documents  Label  \
 287              0.130  Document_288      1   
 325              0.084  Document_326      1   
 346              0.066  Document_347      1   
 122              0.060  Document_123      1   
 336              0.059  Document_337      1   
 310              0.051  Document_311      1   
 246              0.045  Document_247      1   
 343              0.041  Document_344      1   
 193              0.035  Document_194      1   
 329              0.031  Document_330      1   
 348              0.030  Document_349      1   
 303              0.027  Document_304      1   
 335              0.026  Document_336      1   
 125              0.024  Document_126      1   
 357              0.021  Document_358      1   
 345              0.021  Document_346      1   
 347              0.018  Document_348      1   
 316              0.018  Document_317      1   
 314              0.017  Document_315      1   
 315              0.016  Document_316   