In [1]:
# This is an implementation of TF-IDF based on the following article:
# https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76
# If you cannot access the web page, a PDF copy is available at the following link:
# https://drive.google.com/file/d/1J7J6p8hZ3lDmncxq1f6HHNstaGbtREHj/view?usp=sharing

import pandas as pd

In [2]:
# Simple documents
# Change the content, so you can build some intuition about TF-IDF

documentA = 'Bukan aku yang mencarimu Bukan kamu yang mencari aku Cinta yang mempertemukan Dua hati yang berbeda ini'
documentB = 'Saat kau terlalu rapuh Pundak siapa yang tersandar Tangan siapa yang tak melepas Ku yakin aku'
documentC = 'Aku yang minta maaf walau kau yang salah Aku kan menahan walau kau ingin pisah Karna kamu penting Lebih penting Dari semua yang ku punya'

# documentA = 'The TF-IDF are than obtained by multiplying the TF of each document (item) with the IDF which applicable globally'
# documentB = 'The TF-IDF values represent the vector of each item'
# documentC = 'By using the cosine similarity or other distance techniques we can calculate similar items, or group the items into several clusters'
# documentD = 'Therefore, we can use TF-IDF as an algorithm in Content Based Filtering'

# Tokenize each document on whitespace.
# split() with no argument collapses runs of spaces and also splits on
# newlines/tabs, so pasted text never produces empty-string tokens
# (split(' ') would). Tokens stay case-sensitive: 'Aku' and 'aku' differ.
bagOfWordsA = documentA.split()
bagOfWordsB = documentB.split()
bagOfWordsC = documentC.split()

print(bagOfWordsA)
print(bagOfWordsB)
print(bagOfWordsC)

['Bukan', 'aku', 'yang', 'mencarimu', 'Bukan', 'kamu', 'yang', 'mencari', 'aku', 'Cinta', 'yang', 'mempertemukan', 'Dua', 'hati', 'yang', 'berbeda', 'ini']
['Saat', 'kau', 'terlalu', 'rapuh', 'Pundak', 'siapa', 'yang', 'tersandar', 'Tangan', 'siapa', 'yang', 'tak', 'melepas', 'Ku', 'yakin', 'aku']
['Aku', 'yang', 'minta', 'maaf', 'walau', 'kau', 'yang', 'salah', 'Aku', 'kan', 'menahan', 'walau', 'kau', 'ingin', 'pisah', 'Karna', 'kamu', 'penting', 'Lebih', 'penting', 'Dari', 'semua', 'yang', 'ku', 'punya']


In [3]:
# Build the vocabulary: every distinct word that occurs in any document
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB) | set(bagOfWordsC)
print(uniqueWords)

{'berbeda', 'tersandar', 'tak', 'pisah', 'punya', 'ingin', 'Cinta', 'aku', 'yakin', 'Dua', 'mencari', 'kau', 'penting', 'Karna', 'ku', 'Saat', 'maaf', 'kan', 'salah', 'Dari', 'Aku', 'walau', 'menahan', 'mempertemukan', 'minta', 'melepas', 'Pundak', 'yang', 'mencarimu', 'Tangan', 'hati', 'Ku', 'semua', 'ini', 'terlalu', 'Lebih', 'Bukan', 'siapa', 'kamu', 'rapuh'}


In [4]:
# Per-document word counts over the shared vocabulary: every word in
# uniqueWords gets an entry, with 0 for words the document does not contain.

countsPerDocument = []
for bag in (bagOfWordsA, bagOfWordsB, bagOfWordsC):
    counts = dict.fromkeys(uniqueWords, 0)
    for word in bag:
        counts[word] += 1
    print(counts)
    countsPerDocument.append(counts)

numOfWordsA, numOfWordsB, numOfWordsC = countsPerDocument

{'berbeda': 1, 'tersandar': 0, 'tak': 0, 'pisah': 0, 'punya': 0, 'ingin': 0, 'Cinta': 1, 'aku': 2, 'yakin': 0, 'Dua': 1, 'mencari': 1, 'kau': 0, 'penting': 0, 'Karna': 0, 'ku': 0, 'Saat': 0, 'maaf': 0, 'kan': 0, 'salah': 0, 'Dari': 0, 'Aku': 0, 'walau': 0, 'menahan': 0, 'mempertemukan': 1, 'minta': 0, 'melepas': 0, 'Pundak': 0, 'yang': 4, 'mencarimu': 1, 'Tangan': 0, 'hati': 1, 'Ku': 0, 'semua': 0, 'ini': 1, 'terlalu': 0, 'Lebih': 0, 'Bukan': 2, 'siapa': 0, 'kamu': 1, 'rapuh': 0}
{'berbeda': 0, 'tersandar': 1, 'tak': 1, 'pisah': 0, 'punya': 0, 'ingin': 0, 'Cinta': 0, 'aku': 1, 'yakin': 1, 'Dua': 0, 'mencari': 0, 'kau': 1, 'penting': 0, 'Karna': 0, 'ku': 0, 'Saat': 1, 'maaf': 0, 'kan': 0, 'salah': 0, 'Dari': 0, 'Aku': 0, 'walau': 0, 'menahan': 0, 'mempertemukan': 0, 'minta': 0, 'melepas': 1, 'Pundak': 1, 'yang': 2, 'mencarimu': 0, 'Tangan': 1, 'hati': 0, 'Ku': 1, 'semua': 0, 'ini': 0, 'terlalu': 1, 'Lebih': 0, 'Bukan': 0, 'siapa': 2, 'kamu': 0, 'rapuh': 1}
{'berbeda': 0, 'tersandar': 0,

In [5]:
# Download NLTK's stop-word corpus and display its corpus reader.
# A concrete word list is obtained with stopwords.words('<language>'),
# e.g. stopwords.words('english').
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


<WordListCorpusReader in '/root/nltk_data/corpora/stopwords'>

In [6]:
# Compute the Term Frequency
def computeTF(wordDict, bagOfWords):
    """Return the term frequency of every word in ``wordDict``.

    Args:
        wordDict: mapping of word -> number of occurrences in the document.
        bagOfWords: the document's token list; only its length is used, as
            the normalising denominator.

    Returns:
        dict mapping each key of ``wordDict`` to ``count / len(bagOfWords)``
        as a float. For an empty ``bagOfWords`` every frequency is 0.0
        instead of raising ZeroDivisionError.
    """
    totalWords = len(bagOfWords)
    if totalWords == 0:
        # An empty document contains no terms, so every frequency is zero.
        return {word: 0.0 for word in wordDict}
    return {word: count / float(totalWords) for word, count in wordDict.items()}

# Term frequencies for each document, paired with its own token list
tfA, tfB, tfC = (
    computeTF(counts, bag)
    for counts, bag in (
        (numOfWordsA, bagOfWordsA),
        (numOfWordsB, bagOfWordsB),
        (numOfWordsC, bagOfWordsC),
    )
)

for tf in (tfA, tfB, tfC):
    print(tf)

{'berbeda': 0.058823529411764705, 'tersandar': 0.0, 'tak': 0.0, 'pisah': 0.0, 'punya': 0.0, 'ingin': 0.0, 'Cinta': 0.058823529411764705, 'aku': 0.11764705882352941, 'yakin': 0.0, 'Dua': 0.058823529411764705, 'mencari': 0.058823529411764705, 'kau': 0.0, 'penting': 0.0, 'Karna': 0.0, 'ku': 0.0, 'Saat': 0.0, 'maaf': 0.0, 'kan': 0.0, 'salah': 0.0, 'Dari': 0.0, 'Aku': 0.0, 'walau': 0.0, 'menahan': 0.0, 'mempertemukan': 0.058823529411764705, 'minta': 0.0, 'melepas': 0.0, 'Pundak': 0.0, 'yang': 0.23529411764705882, 'mencarimu': 0.058823529411764705, 'Tangan': 0.0, 'hati': 0.058823529411764705, 'Ku': 0.0, 'semua': 0.0, 'ini': 0.058823529411764705, 'terlalu': 0.0, 'Lebih': 0.0, 'Bukan': 0.11764705882352941, 'siapa': 0.0, 'kamu': 0.058823529411764705, 'rapuh': 0.0}
{'berbeda': 0.0, 'tersandar': 0.0625, 'tak': 0.0625, 'pisah': 0.0, 'punya': 0.0, 'ingin': 0.0, 'Cinta': 0.0, 'aku': 0.0625, 'yakin': 0.0625, 'Dua': 0.0, 'mencari': 0.0, 'kau': 0.0625, 'penting': 0.0, 'Karna': 0.0, 'ku': 0.0, 'Saat': 0

In [7]:
# Compute the inverse document frequency
def computeIDF(documents):
    """Return the inverse document frequency of every word in the corpus.

    Args:
        documents: list of dicts, one per document, each mapping
            word -> occurrence count in that document.

    Returns:
        dict mapping word -> ``log(N / df)`` (natural log), where ``N`` is the
        number of documents and ``df`` is the number of documents whose count
        for the word is greater than zero. Keys are the union of all document
        keys, in first-seen order. A word that occurs in no document gets 0.0
        (its term frequency is zero everywhere, so its TF-IDF is zero either
        way). An empty ``documents`` list yields an empty dict.
    """
    import math
    N = len(documents)

    # Document frequency: in how many documents does each word occur?
    docFreq = {}
    for document in documents:
        for word, val in document.items():
            # Register the word even when absent here, so the result covers
            # the whole vocabulary and not only the first document's keys.
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    idfDict = {}
    for word, freq in docFreq.items():
        # Guard against division by zero for words that never occur.
        idfDict[word] = math.log(N / float(freq)) if freq > 0 else 0.0
    return idfDict

# Corpus-wide IDF, computed once from the per-document count dictionaries
allWordCounts = [numOfWordsA, numOfWordsB, numOfWordsC]
idfs = computeIDF(allWordCounts)
print(idfs)

{'berbeda': 1.0986122886681098, 'tersandar': 1.0986122886681098, 'tak': 1.0986122886681098, 'pisah': 1.0986122886681098, 'punya': 1.0986122886681098, 'ingin': 1.0986122886681098, 'Cinta': 1.0986122886681098, 'aku': 0.4054651081081644, 'yakin': 1.0986122886681098, 'Dua': 1.0986122886681098, 'mencari': 1.0986122886681098, 'kau': 0.4054651081081644, 'penting': 1.0986122886681098, 'Karna': 1.0986122886681098, 'ku': 1.0986122886681098, 'Saat': 1.0986122886681098, 'maaf': 1.0986122886681098, 'kan': 1.0986122886681098, 'salah': 1.0986122886681098, 'Dari': 1.0986122886681098, 'Aku': 1.0986122886681098, 'walau': 1.0986122886681098, 'menahan': 1.0986122886681098, 'mempertemukan': 1.0986122886681098, 'minta': 1.0986122886681098, 'melepas': 1.0986122886681098, 'Pundak': 1.0986122886681098, 'yang': 0.0, 'mencarimu': 1.0986122886681098, 'Tangan': 1.0986122886681098, 'hati': 1.0986122886681098, 'Ku': 1.0986122886681098, 'semua': 1.0986122886681098, 'ini': 1.0986122886681098, 'terlalu': 1.098612288668

In [8]:
# Compute the TFxIDF
def computeTFIDF(tfBagOfWords, idfs):
    """Weight each term frequency by the term's inverse document frequency.

    Args:
        tfBagOfWords: mapping of word -> term frequency for one document.
        idfs: mapping of word -> inverse document frequency; must contain
            every key of ``tfBagOfWords`` (a missing key raises KeyError).

    Returns:
        dict mapping each word of ``tfBagOfWords`` to ``tf * idf``.
    """
    return {term: weight * idfs[term] for term, weight in tfBagOfWords.items()}

tfidfA, tfidfB, tfidfC = (computeTFIDF(tf, idfs) for tf in (tfA, tfB, tfC))

# One row per document (A, B, C), one column per vocabulary word
tfidfRows = [tfidfA, tfidfB, tfidfC]
df = pd.DataFrame(tfidfRows)

df

Unnamed: 0,berbeda,tersandar,tak,pisah,punya,ingin,Cinta,aku,yakin,Dua,mencari,kau,penting,Karna,ku,Saat,maaf,kan,salah,Dari,Aku,walau,menahan,mempertemukan,minta,melepas,Pundak,yang,mencarimu,Tangan,hati,Ku,semua,ini,terlalu,Lebih,Bukan,siapa,kamu,rapuh
0,0.064624,0.0,0.0,0.0,0.0,0.0,0.064624,0.047702,0.0,0.064624,0.064624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.064624,0.0,0.0,0.0,0.0,0.064624,0.0,0.064624,0.0,0.0,0.064624,0.0,0.0,0.129249,0.0,0.023851,0.0
1,0.0,0.068663,0.068663,0.0,0.0,0.0,0.0,0.025342,0.068663,0.0,0.0,0.025342,0.0,0.0,0.0,0.068663,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068663,0.068663,0.0,0.0,0.068663,0.0,0.068663,0.0,0.0,0.068663,0.0,0.0,0.137327,0.0,0.068663
2,0.0,0.0,0.0,0.043944,0.043944,0.043944,0.0,0.0,0.0,0.0,0.0,0.032437,0.087889,0.043944,0.043944,0.0,0.043944,0.043944,0.043944,0.043944,0.087889,0.087889,0.043944,0.0,0.043944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043944,0.0,0.0,0.043944,0.0,0.0,0.016219,0.0


In [9]:
# Obtaining TF-IDF using sklearn library
# Note: with its default settings TfidfVectorizer lowercases tokens, smooths
# the IDF and L2-normalises each row, so the numbers differ from the manual
# computation above.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA, documentB, documentC])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
# Rebinds `df`: from here on it holds the sklearn TF-IDF table.
df = pd.DataFrame(denselist, columns=feature_names)
df

Unnamed: 0,aku,berbeda,bukan,cinta,dari,dua,hati,ingin,ini,kamu,kan,karna,kau,ku,lebih,maaf,melepas,mempertemukan,menahan,mencari,mencarimu,minta,penting,pisah,pundak,punya,rapuh,saat,salah,semua,siapa,tak,tangan,terlalu,tersandar,walau,yakin,yang
0,0.26712,0.226137,0.452274,0.226137,0.0,0.226137,0.226137,0.0,0.226137,0.171983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.226137,0.0,0.226137,0.226137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.534241
1,0.148113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.190723,0.190723,0.0,0.0,0.250778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.250778,0.0,0.250778,0.250778,0.0,0.0,0.501555,0.250778,0.250778,0.250778,0.250778,0.0,0.250778,0.296227
2,0.223211,0.0,0.0,0.0,0.188965,0.0,0.0,0.188965,0.0,0.143713,0.188965,0.188965,0.287425,0.143713,0.188965,0.188965,0.0,0.0,0.188965,0.0,0.0,0.188965,0.37793,0.188965,0.0,0.188965,0.0,0.0,0.188965,0.188965,0.0,0.0,0.0,0.0,0.0,0.37793,0.0,0.334817


In [10]:
# Major Assignment 1: Implement Content-Based Filtering with TF-IDF.
# a. Copy the chorus ("reff") lyrics of Indonesian-language songs (3 songs each),
# b. Paste them into the following sheet (make sure each song occupies exactly one lirik_ref cell):
# https://docs.google.com/spreadsheets/d/1j98gjzwL-88GPiTGkRgEWhWWcKryvNAdlaDR_oBWgpY/edit?usp=sharing
# c. Read up on k-NN and how it is applied to the TF-IDF vectors of the dataset above
# d. Write a procedure that takes a song ID and returns the IDs of the 5 most similar items

In [11]:
# Turn the columns of `df` (one per vocabulary word) into a frame with one
# word per row; the 'vectorizer' column is created but left unfilled here.
# NOTE(review): the rows are vocabulary words, not documents/songs - confirm
# this is the intended item set for the recommender below.
df2 = df.T
newdf = pd.DataFrame(columns=['words', 'vectorizer'])
newdf['words'] = df2.index

In [12]:
# Replace the missing values of the 'vectorizer' column with empty strings.
newdf['vectorizer'] = newdf['vectorizer'].fillna('')

# Fit a TF-IDF model that treats every row of newdf['words'] as a document.
# NOTE(review): each row is a single token, so rows presumably share no
# terms; English stop words are also applied to Indonesian text - confirm.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(newdf['words'])

tfidf_matrix.shape

(38, 38)

In [13]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between all rows of the TF-IDF matrix; when the
# second argument is omitted, linear_kernel compares the matrix with itself.
cosine_sim = linear_kernel(tfidf_matrix)

In [14]:
# Map each word to its row position in newdf. Series.drop_duplicates() compares
# values (the already-unique row numbers), so it never removed repeated words;
# de-duplicate on the index labels instead, keeping the first occurrence.
indices = pd.Series(newdf.index, index=newdf['words'])
indices = indices[~indices.index.duplicated(keep='first')]

In [15]:
def get_recommendations(title, cosine_sim = cosine_sim, top_n = 5):
  """Return the entries of newdf['words'] most similar to ``title``.

  Relies on the notebook-level ``indices`` (label -> row position) and
  ``newdf`` objects.

  Args:
    title: label to look up in ``indices``; an unknown label raises KeyError.
    cosine_sim: square similarity matrix whose row/column order matches
      ``newdf``. The default is bound when this cell is executed, so re-run
      the cell after recomputing the matrix.
    top_n: number of neighbours to return (default 5).

  Returns:
    pandas Series with up to ``top_n`` words, most similar first; ties keep
    their original row order.
  """
  idx = indices[title]
  # A label that occurs more than once yields a Series; use its first match.
  if hasattr(idx, 'iloc'):
    idx = idx.iloc[0]
  # Exclude the query item by position in the matrix, not by assuming it
  # sorts first: on tied scores another item may be ranked ahead of it.
  sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
  item_indices = [i for i, _ in sim_scores[:top_n]]
  return newdf['words'].iloc[item_indices]

In [16]:
# Show the five nearest neighbours of the vocabulary entry 'kau'
get_recommendations(title='kau')

0        aku
1    berbeda
2      bukan
3      cinta
4       dari
Name: words, dtype: object