In [1]:
import pandas as pd 
import numpy as np

data_komentar= pd.read_csv("comment_text_Preprocessing.csv",usecols=["Comments_tokens_stemmed"])
data_komentar.columns = ["comments"]

data_komentar.head()

Unnamed: 0,comments
0,['dapat']
1,"['senang', 'lihat', 'kalau', 'rusuh', 'teroris..."
2,"['lamban', 'langgar', 'ham', 'lihat', 'laku', ..."
3,"['terbang', 'curah', 'tinggi', 'kaki', 'selama..."
4,"['selamat', 'habis', 'pangacau', 'banyak', 'ny..."


In [2]:
# convert list formated string to list
import ast

def convert_text_list(texts):
    texts = ast.literal_eval(texts)
    return [text for text in texts]

data_komentar["comments_list"] = data_komentar["comments"].apply(convert_text_list)


print(data_komentar["comments_list"][90])

print("\ntype : ", type(data_komentar["comments_list"][90]))

['selamat', 'tni', 'polri', 'jaya', 'tni', 'polriindonesia', 'raya']

type :  <class 'list'>


In [3]:
def calc_TF(document):
    # Counts the number of times the word appears in review
    TF_dict = {}
    for term in document:
        if term in TF_dict:
            TF_dict[term] += 1
        else:
            TF_dict[term] = 1
    # Computes tf for each word
    for term in TF_dict:
        TF_dict[term] = TF_dict[term] / len(document)
    return TF_dict

data_komentar["TF_dict"] = data_komentar['comments_list'].apply(calc_TF)

data_komentar["TF_dict"].head()

0                                       {'dapat': 1.0}
1    {'senang': 0.05555555555555555, 'lihat': 0.055...
2    {'lamban': 0.125, 'langgar': 0.125, 'ham': 0.1...
3    {'terbang': 0.125, 'curah': 0.125, 'tinggi': 0...
4    {'selamat': 0.125, 'habis': 0.125, 'pangacau':...
Name: TF_dict, dtype: object

In [4]:
# Check TF result
index = 90

print('%20s' % "term", "\t", "TF\n")
for key in data_komentar["TF_dict"][index]:
    print('%20s' % key, "\t", data_komentar["TF_dict"][index][key])

                term 	 TF

             selamat 	 0.14285714285714285
                 tni 	 0.2857142857142857
               polri 	 0.14285714285714285
                jaya 	 0.14285714285714285
      polriindonesia 	 0.14285714285714285
                raya 	 0.14285714285714285


In [5]:
def calc_DF(tfDict):
    count_DF = {}
    # Run through each document's tf dictionary and increment countDict's (term, doc) pair
    for document in tfDict:
        for term in document:
            if term in count_DF:
                count_DF[term] += 1
            else:
                count_DF[term] = 1
    return count_DF

DF = calc_DF(data_komentar["TF_dict"])

data_komentar["DF_dict"] = data_komentar["TF_dict"].apply(calc_TF)

data_komentar["DF_dict"].head()

0                                       {'dapat': 1.0}
1    {'senang': 0.058823529411764705, 'lihat': 0.05...
2    {'lamban': 0.125, 'langgar': 0.125, 'ham': 0.1...
3    {'terbang': 0.14285714285714285, 'curah': 0.14...
4    {'selamat': 0.125, 'habis': 0.125, 'pangacau':...
Name: DF_dict, dtype: object

In [6]:
# Check TF result
index = 90

print('%20s' % "term", "\t", "DF\n")
for key in data_komentar["DF_dict"][index]:
    print('%20s' % key, "\t", data_komentar["DF_dict"][index][key])

                term 	 DF

             selamat 	 0.16666666666666666
                 tni 	 0.16666666666666666
               polri 	 0.16666666666666666
                jaya 	 0.16666666666666666
      polriindonesia 	 0.16666666666666666
                raya 	 0.16666666666666666


In [7]:
n_document = len(data_komentar)

def calc_IDF(__n_document, __DF):
    IDF_Dict = {}
    for term in __DF:
        IDF_Dict[term] = np.log(__n_document / (__DF[term] + 1))
    return IDF_Dict
  
#Stores the idf dictionary
IDF = calc_IDF(n_document, DF)

In [8]:
#calc TF-IDF
def calc_TF_IDF(TF):
    TF_IDF_Dict = {}
    #For each word in the review, we multiply its tf and its idf.
    for key in TF:
        TF_IDF_Dict[key] = TF[key] * IDF[key]
    return TF_IDF_Dict

#Stores the TF-IDF Series
data_komentar["TF-IDF_dict"] = data_komentar["TF_dict"].apply(calc_TF_IDF)

In [9]:
# Check TF-IDF result
index = 90

print('%20s' % "term", "\t", '%10s' % "TF", "\t", '%20s' % "TF-IDF\n")
for key in data_komentar["TF-IDF_dict"][index]:
    print('%20s' % key, "\t", data_komentar["TF_dict"][index][key] ,"\t" , data_komentar["TF-IDF_dict"][index][key])

                term 	         TF 	              TF-IDF

             selamat 	 0.14285714285714285 	 0.31651053085777364
                 tni 	 0.2857142857142857 	 0.45983940355260006
               polri 	 0.14285714285714285 	 0.32894072757057796
                jaya 	 0.14285714285714285 	 0.6714971951132024
      polriindonesia 	 0.14285714285714285 	 0.6714971951132024
                raya 	 0.14285714285714285 	 0.6714971951132024


In [10]:
# sort descending by value for DF dictionary 
sorted_DF = sorted(DF.items(), key=lambda kv: kv[1], reverse=True)[:20]

# Create a list of unique words from sorted dictionay `sorted_DF`
unique_term = [item[0] for item in sorted_DF]

def calc_TF_IDF_Vec(__TF_IDF_Dict):
    TF_IDF_vector = [0.0] * len(unique_term)

    # For each unique word, if it is in the review, store its TF-IDF value.
    for i, term in enumerate(unique_term):
        if term in __TF_IDF_Dict:
            TF_IDF_vector[i] = __TF_IDF_Dict[term]
    return TF_IDF_vector

data_komentar["TF_IDF_Vec"] = data_komentar["TF-IDF_dict"].apply(calc_TF_IDF_Vec)

print("print first row matrix TF_IDF_Vec Series\n")
print(data_komentar["TF_IDF_Vec"][1])

print("\nmatrix size : ", len(data_komentar["TF_IDF_Vec"][1]))

print first row matrix TF_IDF_Vec Series

[0.0, 0.0, 0.0, 0.12308742866691198, 0.25090370338035684, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.15714878827171247, 0.0, 0.16159560536468673, 0.0, 0.0, 0.0, 0.0, 0.17172458074212865]

matrix size :  20


In [11]:
# Convert Series to List
TF_IDF_Vec_List = np.array(data_komentar["TF_IDF_Vec"].to_list())

# Sum element vector in axis=0 
sums = TF_IDF_Vec_List.sum(axis=0)

data = []

for col, term in enumerate(unique_term):
    data.append((term, sums[col]))
    
ranking = pd.DataFrame(data, columns=['term', 'rank'])
ranking.sort_values('rank', ascending=False)

Unnamed: 0,term,rank
0,tni,14.546705
17,mantap,10.417302
3,selamat,10.077091
5,polri,8.388508
2,saja,7.372248
16,ham,7.201928
1,kkb,7.125604
7,bunuh,6.721416
4,tangkap,6.548336
8,tembak,5.847388
