# Ekstraksi Fitur Term Weighting Manual

In [93]:
import pandas as pd
import numpy as np
import string
import re
import nltk
import ast

In [94]:
data = pd.read_csv("data/data/1.2_data_cleaner_pure.csv")
data.head()

Unnamed: 0,rating,review,aspect_category,polarity,word_count,avg_word
0,3,"'membantu','kondisi','pandemi','tolong','evalu...",pelayanan,konflik,43,7.822222
1,5,"'konsultasi','dokter','bagus','durasi','cepat'...",pelayanan,konflik,12,7.923077
2,1,"'astaga','obat','beli','obat','harga','masuk',...",harga,negatif,33,7.393939
3,5,"'terima,kasih','pelayanan','cepat','pengiriman...",pelayanan,positif,11,9.0
4,1,"'menelepon','nomor','hadiah','konfirmasi','apl...",sistem,netral,10,8.909091


In [95]:
data['review'] = data['review'].apply(lambda x: x.replace("'",""))
data.head()

Unnamed: 0,rating,review,aspect_category,polarity,word_count,avg_word
0,3,"membantu,kondisi,pandemi,tolong,evaluasi,keter...",pelayanan,konflik,43,7.822222
1,5,"konsultasi,dokter,bagus,durasi,cepat,dokter,di...",pelayanan,konflik,12,7.923077
2,1,"astaga,obat,beli,obat,harga,masuk,akal,kali,li...",harga,negatif,33,7.393939
3,5,"terima,kasih,pelayanan,cepat,pengiriman,thanks...",pelayanan,positif,11,9.0
4,1,"menelepon,nomor,hadiah,konfirmasi,aplikasi,men...",sistem,netral,10,8.909091


In [96]:
data['review']= data['review'].apply(lambda x: x.replace(","," "))
data.head()

Unnamed: 0,rating,review,aspect_category,polarity,word_count,avg_word
0,3,membantu kondisi pandemi tolong evaluasi keter...,pelayanan,konflik,43,7.822222
1,5,konsultasi dokter bagus durasi cepat dokter di...,pelayanan,konflik,12,7.923077
2,1,astaga obat beli obat harga masuk akal kali li...,harga,negatif,33,7.393939
3,5,terima kasih pelayanan cepat pengiriman thanks...,pelayanan,positif,11,9.0
4,1,menelepon nomor hadiah konfirmasi aplikasi men...,sistem,netral,10,8.909091


In [97]:
data['review']= data['review'].apply(lambda x: x.split())
data.head()

Unnamed: 0,rating,review,aspect_category,polarity,word_count,avg_word
0,3,"[membantu, kondisi, pandemi, tolong, evaluasi,...",pelayanan,konflik,43,7.822222
1,5,"[konsultasi, dokter, bagus, durasi, cepat, dok...",pelayanan,konflik,12,7.923077
2,1,"[astaga, obat, beli, obat, harga, masuk, akal,...",harga,negatif,33,7.393939
3,5,"[terima, kasih, pelayanan, cepat, pengiriman, ...",pelayanan,positif,11,9.0
4,1,"[menelepon, nomor, hadiah, konfirmasi, aplikas...",sistem,netral,10,8.909091


In [98]:
print(data['review'])

0      [membantu, kondisi, pandemi, tolong, evaluasi,...
1      [konsultasi, dokter, bagus, durasi, cepat, dok...
2      [astaga, obat, beli, obat, harga, masuk, akal,...
3      [terima, kasih, pelayanan, cepat, pengiriman, ...
4      [menelepon, nomor, hadiah, konfirmasi, aplikas...
                             ...                        
795                 [semudah, diklik, praktis, membantu]
796                 [pelayanan, ramah, paham, bidangnya]
797    [bagus, aplikasi, dokter, pribadi, terima, kasih]
798                      [memudahkan, konsultasi, rumah]
799             [membantu, pelayanan, dokter, memuaskan]
Name: review, Length: 800, dtype: object


# Menghitung Term Frequency (TF) atau frekuensi kemunculan term pada dokumen

![image.png](attachment:image.png)

In [99]:
def calc_TF(document):
    # Counts the number of times the word appears in review
    TF_dict = {}
    for term in document:
        if term in TF_dict:
            TF_dict[term] += 1
        else:
            TF_dict[term] = 1
    # Computes tf for each word
    for term in TF_dict:
        TF_dict[term] = TF_dict[term] / len(document)
    return TF_dict

data["TF_dict"] = data['review'].apply(calc_TF)

data["TF_dict"].head()

0    {'membantu': 0.022222222222222223, 'kondisi': ...
1    {'konsultasi': 0.07692307692307693, 'dokter': ...
2    {'astaga': 0.030303030303030304, 'obat': 0.121...
3    {'terima': 0.08333333333333333, 'kasih': 0.083...
4    {'menelepon': 0.09090909090909091, 'nomor': 0....
Name: TF_dict, dtype: object

# Simpan pencarian TF pada data

In [100]:
# Check TF result
index = 355

print('%20s' % "term", "\t", "TF\n")
for key in data["TF_dict"][index]:
    print('%20s' % key, "\t", data["TF_dict"][index][key])

                term 	 TF

            membantu 	 0.2
              respon 	 0.2
               cepat 	 0.2
              terima 	 0.2
               kasih 	 0.2


# Menghitung Inverse Document Frequency (IDF) atau mengurangi bobot suatu term jika kemunculannya banyak tersebar diseluruh dokumen

In [101]:
def calc_DF(tfDict):
    count_DF = {}
    # Run through each document's tf dictionary and increment countDict's (term, doc) pair
    for document in tfDict:
        for term in document:
            if term in count_DF:
                count_DF[term] += 1
            else:
                count_DF[term] = 1
    return count_DF

DF = calc_DF(data["TF_dict"])

# Hasil term value

In [102]:
DF

{'membantu': 300,
 'kondisi': 25,
 'pandemi': 70,
 'tolong': 24,
 'evaluasi': 3,
 'ketersedian': 1,
 'obat': 125,
 'apotek': 34,
 'rekanan': 2,
 'kota': 5,
 'manado': 1,
 'resep': 30,
 'batuk': 2,
 'suka': 13,
 'habis': 6,
 'sia': 2,
 'konsultasi': 148,
 'dokter': 243,
 'ujung': 1,
 'rugi': 2,
 'chat': 44,
 'dobel': 1,
 'biaya': 13,
 'masukan': 4,
 'pakai': 33,
 'fitur': 9,
 'us': 1,
 'customer': 21,
 'service': 15,
 'mengubah': 1,
 'produk': 2,
 'menyarankan': 2,
 'bagus': 150,
 'durasi': 1,
 'cepat': 119,
 'di': 22,
 'akhir': 1,
 'chatingan': 1,
 'oke': 6,
 'bantu': 1,
 'terimakasih': 14,
 'astaga': 1,
 'beli': 25,
 'harga': 11,
 'masuk': 12,
 'akal': 1,
 'kali': 37,
 'lipat': 1,
 'replay': 1,
 'bantuan': 7,
 'belas': 12,
 'coba': 12,
 'pembayaran': 21,
 'bank': 6,
 'berbeda': 3,
 'gagal': 6,
 'solusi': 14,
 'penyelesain': 1,
 'berulang': 2,
 'semoga': 45,
 'banding': 1,
 'pesaing': 1,
 'terima': 167,
 'kasih': 183,
 'pelayanan': 55,
 'pengiriman': 13,
 'thanks': 8,
 'tertawa': 21,
 

In [103]:
n_document = len(data)

def calc_IDF(__n_document, __DF):
    IDF_Dict = {}
    for term in __DF:
        IDF_Dict[term] = np.log(__n_document / (__DF[term] + 1))
    return IDF_Dict
  
#Stores the idf dictionary
IDF = calc_IDF(n_document, DF)

In [104]:
IDF

{'membantu': 0.9775014629190516,
 'kondisi': 3.4265151896464454,
 'pandemi': 2.4219318506266116,
 'tolong': 3.4657359027997265,
 'evaluasi': 5.298317366548036,
 'ketersedian': 5.991464547107982,
 'obat': 1.8483298207164491,
 'apotek': 3.1292636661785136,
 'rekanan': 5.585999438999818,
 'kota': 4.892852258439873,
 'manado': 5.991464547107982,
 'resep': 3.250624523182781,
 'batuk': 5.585999438999818,
 'suka': 4.045554398052669,
 'habis': 4.738701578612614,
 'sia': 5.585999438999818,
 'konsultasi': 1.6806654217224681,
 'dokter': 1.1874435023747254,
 'ujung': 5.991464547107982,
 'rugi': 5.585999438999818,
 'chat': 2.8779492378976075,
 'dobel': 5.991464547107982,
 'biaya': 4.045554398052669,
 'masukan': 5.075173815233827,
 'pakai': 3.158251203051766,
 'fitur': 4.382026634673881,
 'us': 5.991464547107982,
 'customer': 3.5935692743096115,
 'service': 3.912023005428146,
 'mengubah': 5.991464547107982,
 'produk': 5.585999438999818,
 'menyarankan': 5.585999438999818,
 'bagus': 1.667331890853003,

# Menggabungkan hasil TF-IDF

In [105]:
#calc TF-IDF
def calc_TF_IDF(TF):
    TF_IDF_Dict = {}
    #For each word in the review, we multiply its tf and its idf.
    for key in TF:
        TF_IDF_Dict[key] = TF[key] * IDF[key]
    return TF_IDF_Dict

#Stores the TF-IDF Series
data["TF-IDF_dict"] = data["TF_dict"].apply(calc_TF_IDF)

In [115]:
# Check TF-IDF result
index = 43

print('%20s' % "term", "\t", '%10s' % "TF", "\t", '%20s' % "TF-IDF\n")
for key in data["TF-IDF_dict"][index]:
    print('%20s' % key, "\t", data["TF_dict"][index][key] ,"\t" , data["TF-IDF_dict"][index][key])

                term 	         TF 	              TF-IDF

            aplikasi 	 0.18181818181818182 	 0.2903664349882806
              lambat 	 0.09090909090909091 	 0.40794428639379166
                coba 	 0.09090909090909091 	 0.37451476092785374
            uninstal 	 0.09090909090909091 	 0.5446785951916348
             telepon 	 0.18181818181818182 	 0.7002542515657657
             genggam 	 0.18181818181818182 	 0.8896095015345223
               berat 	 0.09090909090909091 	 0.4816652151407306
               jalan 	 0.09090909090909091 	 0.5078181308181653


In [60]:
# sort descending by value for DF dictionary 
sorted_DF = sorted(DF.items(), key=lambda kv: kv[1], reverse=True)[:50]

# Create a list of unique words from sorted dictionay `sorted_DF`
unique_term = [item[0] for item in sorted_DF]

def calc_TF_IDF_Vec(__TF_IDF_Dict):
    TF_IDF_vector = [0.0] * len(unique_term)

    # For each unique word, if it is in the review, store its TF-IDF value.
    for i, term in enumerate(unique_term):
        if term in __TF_IDF_Dict:
            TF_IDF_vector[i] = __TF_IDF_Dict[term]
    return TF_IDF_vector

data["TF_IDF_Vec"] = data["TF-IDF_dict"].apply(calc_TF_IDF_Vec)

print("print first row matrix TF_IDF_Vec Series\n")
print(data["TF_IDF_Vec"][0])

print("\nmatrix size : ", len(data["TF_IDF_Vec"][0]))

print first row matrix TF_IDF_Vec Series

[0.021722254731534482, 0.05277526677221002, 0.0, 0.0, 0.0, 0.0, 0.07469624096544303, 0.16429598406368437, 0.0, 0.053820707791702484, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.12790885501767144, 0.0, 0.0, 0.20861757774523423, 0.07018336006781703, 0.0, 0.14447220103034583, 0.0, 0.07614478199214324, 0.0, 0.0, 0.07701635339554948, 0.0, 0.0, 0.0, 0.15971418996931608, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.17386768913013984]

matrix size :  50


In [61]:
# Convert Series to List
TF_IDF_Vec_List = np.array(data["TF_IDF_Vec"].to_list())

# Sum element vector in axis=0 
sums = TF_IDF_Vec_List.sum(axis=0)

data_list = []

for col, term in enumerate(unique_term):
    data_list.append((term, sums[col]))
    
ranking = pd.DataFrame(data_list, columns=['term', 'rank'])
ranking.sort_values('rank', ascending=False)

Unnamed: 0,term,rank
0,membantu,101.996204
5,bagus,51.543878
1,dokter,49.24195
8,cepat,41.571886
4,aplikasi,41.455913
2,kasih,38.949074
3,terima,37.334848
6,konsultasi,36.557713
9,pandemi,34.929028
11,ramah,28.868241


# POS TAGGING UNTUK MENCARI ASPEK TERM

In [98]:
dataset = pd.read_csv("data/data/1.2_data_cleaner_pure.csv")
dataset.head()

Unnamed: 0,rating,review,aspect_category,polarity,word_count,avg_word
0,3,"'membantu','kondisi','pandemi','tolong','dieva...",pelayanan,konflik,47,7.959184
1,5,"'konsultasi','dokter','halo','doc','bagus','si...",pelayanan,konflik,19,6.52381
2,1,"'astaga','obat','beli','obat','halo','doc','ha...",harga,negatif,38,7.078947
3,5,"'terima,kasih','halodoc','pelayanan','cepat','...",pelayanan,positif,14,9.0
4,1,"'menelepon','nomor','halodoc','hadiah','halodo...",sistem,netral,13,8.928571


In [99]:
dataset['review'] = dataset['review'].apply(lambda x: x.replace("'",""))
dataset.head()

Unnamed: 0,rating,review,aspect_category,polarity,word_count,avg_word
0,3,"membantu,kondisi,pandemi,tolong,dievaluasi,ket...",pelayanan,konflik,47,7.959184
1,5,"konsultasi,dokter,halo,doc,bagus,sih,durasi,ce...",pelayanan,konflik,19,6.52381
2,1,"astaga,obat,beli,obat,halo,doc,harga,masuk,aka...",harga,negatif,38,7.078947
3,5,"terima,kasih,halodoc,pelayanan,cepat,pengirima...",pelayanan,positif,14,9.0
4,1,"menelepon,nomor,halodoc,hadiah,halodoc,konfirm...",sistem,netral,13,8.928571


In [100]:
dataset['review'] = dataset['review'].apply(lambda x: x.replace(","," "))
dataset.head()

Unnamed: 0,rating,review,aspect_category,polarity,word_count,avg_word
0,3,membantu kondisi pandemi tolong dievaluasi ket...,pelayanan,konflik,47,7.959184
1,5,konsultasi dokter halo doc bagus sih durasi ce...,pelayanan,konflik,19,6.52381
2,1,astaga obat beli obat halo doc harga masuk aka...,harga,negatif,38,7.078947
3,5,terima kasih halodoc pelayanan cepat pengirima...,pelayanan,positif,14,9.0
4,1,menelepon nomor halodoc hadiah halodoc konfirm...,sistem,netral,13,8.928571


In [101]:
from nltk import word_tokenize
from nltk.tag import CRFTagger

In [102]:
def preprocessing(text):
    tokenized = word_tokenize(text)
    return tokenized

In [103]:
dataset['review'] = dataset['review'].apply(preprocessing)
dataset.head()

Unnamed: 0,rating,review,aspect_category,polarity,word_count,avg_word
0,3,"[membantu, kondisi, pandemi, tolong, dievaluas...",pelayanan,konflik,47,7.959184
1,5,"[konsultasi, dokter, halo, doc, bagus, sih, du...",pelayanan,konflik,19,6.52381
2,1,"[astaga, obat, beli, obat, halo, doc, harga, m...",harga,negatif,38,7.078947
3,5,"[terima, kasih, halodoc, pelayanan, cepat, pen...",pelayanan,positif,14,9.0
4,1,"[menelepon, nomor, halodoc, hadiah, halodoc, k...",sistem,netral,13,8.928571


In [104]:
dataset = dataset['review']

In [105]:
ct = CRFTagger()
ct.set_model_file("data/external/all_indo_man_tag_corpus_model.crf.tagger")

In [106]:
pos_ulasan = ct.tag_sents(dataset)

In [107]:
pos_ulasan

[[('membantu', 'VB'),
  ('kondisi', 'NN'),
  ('pandemi', 'NN'),
  ('tolong', 'VB'),
  ('dievaluasi', 'NN'),
  ('ketersedian', 'NN'),
  ('obat', 'NN'),
  ('apotek', 'NN'),
  ('rekanan', 'NN'),
  ('halodoc', 'NN'),
  ('kota', 'NN'),
  ('manado', 'NN'),
  ('resep', 'NN'),
  ('obat', 'NN'),
  ('batuk', 'VB'),
  ('apotek', 'NN'),
  ('suka', 'VB'),
  ('habis', 'NN'),
  ('sia', 'NN'),
  ('sia', 'NN'),
  ('konsultasi', 'NN'),
  ('dokter', 'NN'),
  ('ujung', 'NN'),
  ('ujung', 'NN'),
  ('obat', 'NN'),
  ('diresepkan', 'VB'),
  ('apotek', 'NN'),
  ('rugi', 'JJ'),
  ('chat', 'VB'),
  ('dobel', 'NN'),
  ('biaya', 'NN'),
  ('masukan', 'VB'),
  ('halodoc', 'NN'),
  ('pakai', 'VB'),
  ('fitur', 'NN'),
  ('chat', 'VB'),
  ('us', 'FW'),
  ('customer', 'FW'),
  ('service', 'FW'),
  ('halodoc', 'FW'),
  ('sb', 'FW'),
  ('customer', 'FW'),
  ('service', 'FW'),
  ('mengubah', 'VB'),
  ('produk', 'NN'),
  ('obat', 'NN'),
  ('menyarankan', 'VB'),
  ('konsultasi', 'NN'),
  ('dokter', 'NN')],
 [('konsultasi', 

In [108]:
pos_ulasan_nn = [pos for pos in pos_ulasan if pos[1] == "NN"]

IndexError: list index out of range

In [79]:
# list of name, degree, score 
nme = ["aparna", "pankaj", "sudhir", "Geeku"] 
deg = ["MBA", "BCA", "M.Tech", "MBA"] 
scr = [90, 40, 80, 98] 
     
# dictionary of lists  
dict = {'postag': pos_ulasan}  
       
df = pd.DataFrame(dict) 
    
# saving the dataframe 
df.to_csv('data/data/pos_tag.csv') 

In [80]:
df.head(800)

Unnamed: 0,postag
0,"[(membantu, VB), (kondisi, NN), (pandemi, NN),..."
1,"[(konsultasi, NN), (dokter, FW), (halo, FW), (..."
2,"[(astaga, NN), (obat, NN), (beli, VB), (obat, ..."
3,"[(terima, CD), (kasih, NN), (halodoc, NN), (pe..."
4,"[(menelepon, NN), (nomor, NN), (halodoc, FW), ..."
...,...
795,"[(semudah, MD), (diklik, VB), (praktis, JJ), (..."
796,"[(pelayanan, NN), (ramah, NN), (paham, NN), (b..."
797,"[(bagus, JJ), (aplikasi, NN), (pu, NN), (dokte..."
798,"[(memudahkan, VB), (konsultasi, NN), (rumah, NN)]"


In [81]:
df.to_excel('data/data/pos_tag_excel.xlsx',index=False)