In [77]:
# Load data ke dalam data frame 
import pandas as pd

# spesifikasi encoding diperlukan karena data tidak menggunakan UTF-8
df = pd.read_csv('scraping_skingame.csv', encoding='latin-1')

df.head()

Unnamed: 0,Review,Product,Rating,Username
0,No review found,Skin Game Kind Watery Moisturizer - Pelembab W...,bintang 5,Nurul
1,"pengiriman sengaja pakai gosend biar cepat, ta...",Skin Game Kind Watery Moisturizer - Pelembab W...,bintang 3,Firman
2,"cepat meresap, ringan di kulit wajah, cuman bu...",Skin Game Daily Kind Moisturizer 50 ml,bintang 5,I***y
3,Seller sangat cepat memproses barang. Packing ...,Skin Game Kind Cream Moisturizer - Pelembab Wa...,bintang 5,Hesty
4,Cocok di kulit normal,Skin Game Acne Warrior Paste 15 ml - Krim Jerawat,bintang 5,F***i


In [78]:
# Drop kolom yang tidak terpakai
df = df.drop(df.iloc[:,1:], axis=1)

# Cek data teratas
df.head()

Unnamed: 0,Review
0,No review found
1,"pengiriman sengaja pakai gosend biar cepat, ta..."
2,"cepat meresap, ringan di kulit wajah, cuman bu..."
3,Seller sangat cepat memproses barang. Packing ...
4,Cocok di kulit normal


PREPROCESSING

In [79]:
# PROSES CASE FOLDING (menjadikan semua huruf di kolom content menjadi huruf kecil)
df['Review'] = df['Review'].str.lower()

print('Case Folding Result: \n')
df.head()

Case Folding Result: 



Unnamed: 0,Review
0,no review found
1,"pengiriman sengaja pakai gosend biar cepat, ta..."
2,"cepat meresap, ringan di kulit wajah, cuman bu..."
3,seller sangat cepat memproses barang. packing ...
4,cocok di kulit normal


In [80]:
# import hasil case folding menjadi csv
df.to_csv("casefolding.csv")

In [81]:
# CLEANING
import string
import re #regex library

def remove_tweet_special(text):
    #remove tab, new line, and back slice
    text = text.replace('\\t'," ").replace('\\n', " ").replace('\\u', " ").replace('\\',"")
    #remove non ASCII (emoticon, chinese word, etc)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, link, hashtag
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    # remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")

df['Review'] = df['Review'].apply(remove_tweet_special)

#remove number
def remove_number(text):
    return  re.sub(r"\d+", "", text)

df['Review'] = df['Review'].apply(remove_number)

#remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans("","",string.punctuation))

df['Review'] = df['Review'].apply(remove_punctuation)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    return text.strip()

df['Review'] = df['Review'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

df['Review'] = df['Review'].apply(remove_whitespace_multiple)

# remove single char
def remove_single_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

df['Review'] = df['Review'].apply(remove_single_char)

print('Cleaning Result : \n') 
print(df.head())

Cleaning Result : 

                                              Review
0                                    no review found
1  pengiriman sengaja pakai gosend biar cepat tap...
2  cepat meresap ringan di kulit wajah cuman buat...
3  seller sangat cepat memproses barang packing s...
4                              cocok di kulit normal


In [82]:
# import hasil cleaning menjadi csv
df.to_csv("cleaning.csv")

In [83]:
# REMOVE DUPLICATE
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Review
0,no review found
1,pengiriman sengaja pakai gosend biar cepat tap...
2,cepat meresap ringan di kulit wajah cuman buat...
3,seller sangat cepat memproses barang packing s...
4,cocok di kulit normal


In [84]:
# import hasil remove duplicate menjadi csv
df.to_csv("removedup.csv")

In [85]:
# NORMALIZATION
import pandas as pd
import re
import csv
import string

slang_dictionary = pd.read_csv('colloquial-indonesian-lexicon2.csv')
slang_dict = pd.Series(slang_dictionary['formal'].values,index=slang_dictionary['slang']).to_dict()

slang_dictionary.head()

Unnamed: 0,slang,formal,In-dictionary,context,category1,category2,category3
0,woww,wow,1.0,wow,elongasi,0,0
1,aminn,amin,1.0,Selamat ulang tahun kakak tulus semoga panjang...,elongasi,0,0
2,met,selamat,1.0,Met hari netaas kak!? Wish you all the best @t...,abreviasi,0,0
3,netaas,menetas,1.0,Met hari netaas kak!? Wish you all the best @t...,afiksasi,elongasi,0
4,keberpa,keberapa,0.0,Birthday yg keberpa kak?,abreviasi,0,0


In [86]:
def Slangwords(text):
    for word in text.split():
        if word in slang_dict.keys():
            text = text.replace(word, slang_dict[word])
    
    text = re.sub('@[\w]+', '',text)
    return text

df['Review_norm'] = df['Review'].apply(Slangwords)
df.head()

Unnamed: 0,Review,Review_norm
0,no review found,no review found
1,pengiriman sengaja pakai gosend biar cepat tap...,pengiriman senenggakja pakai gosend biar cepat...
2,cepat meresap ringan di kulit wajah cuman buat...,cepat meresap ringan di kulit wajah cuman buat...
3,seller sangat cepat memproses barang packing s...,seller sangat cepat memproses barang packing s...
4,cocok di kulit normal,cocok di kulit normal


In [87]:
# import hasil replace slang words menjadi csv
df.to_csv("normalization.csv")

In [88]:
# PROSES TOKENIZING (word_tokenize() untuk memecah string kedalam tokens)

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Tokenizing
# NLTK word tokenize 
def word_tokenize_wrapper(text):
    return word_tokenize(text)

df['review_tokens'] = df['Review_norm'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n') 
print(df.head())

Tokenizing Result : 

                                              Review  \
0                                    no review found   
1  pengiriman sengaja pakai gosend biar cepat tap...   
2  cepat meresap ringan di kulit wajah cuman buat...   
3  seller sangat cepat memproses barang packing s...   
4                              cocok di kulit normal   

                                         Review_norm  \
0                                    no review found   
1  pengiriman senenggakja pakai gosend biar cepat...   
2  cepat meresap ringan di kulit wajah cuman buat...   
3  seller sangat cepat memproses barang packing s...   
4                              cocok di kulit normal   

                                       review_tokens  
0                                [no, review, found]  
1  [pengiriman, senenggakja, pakai, gosend, biar,...  
2  [cepat, meresap, ringan, di, kulit, wajah, cum...  
3  [seller, sangat, cepat, memproses, barang, pac...  
4                         [co

In [89]:
# import hasil tokenizing menjadi csv
df.to_csv("tokenizing.csv")

In [90]:
# PROSES FILTERING (Stopword Removal)
from nltk.corpus import stopwords

# ----------------------- get stopword from NLTK stopword -------------------------------
# get stopword indonesia
list_stopwords = stopwords.words('indonesian')

# convert list to dictionary
list_stopwords = set(list_stopwords)


#remove stopword pada list token
def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]

df['review_tokens_SR'] = df['review_tokens'].apply(stopwords_removal) 

print(df.head())

                                              Review  \
0                                    no review found   
1  pengiriman sengaja pakai gosend biar cepat tap...   
2  cepat meresap ringan di kulit wajah cuman buat...   
3  seller sangat cepat memproses barang packing s...   
4                              cocok di kulit normal   

                                         Review_norm  \
0                                    no review found   
1  pengiriman senenggakja pakai gosend biar cepat...   
2  cepat meresap ringan di kulit wajah cuman buat...   
3  seller sangat cepat memproses barang packing s...   
4                              cocok di kulit normal   

                                       review_tokens  \
0                                [no, review, found]   
1  [pengiriman, senenggakja, pakai, gosend, biar,...   
2  [cepat, meresap, ringan, di, kulit, wajah, cum...   
3  [seller, sangat, cepat, memproses, barang, pac...   
4                         [cocok, di, kulit, n

In [91]:
# import hasil sr menjadi csv
df.to_csv("stopword.csv")

In [92]:
# PROSES LEMMATIZATION 
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter


# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    return stemmer.stem(term)

term_dict = {}

for document in df['review_tokens_SR']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))
print("------------------------")

for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term,":" ,term_dict[term])
    
print(term_dict)
print("------------------------")


# apply stemmed term to dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

df['review_tokens_stemmed'] = df['review_tokens_SR'].swifter.apply(get_stemmed_term)
print(df['review_tokens_stemmed'])
print(df.head())

1231
------------------------
no : no


review : review
found : found
pengiriman : kirim
senenggakja : senenggakja
pakai : pakai
gosend : gosend
biar : biar
cepat : cepat
dikirim : kirim
disuruh : suruh
reorder : reorder
denenggakn : denenggakn
alasan : alas
kompensasi : kompensasi
taunya : tau
pas : pas
meresap : resap
ringan : ringan
kulit : kulit
wajah : wajah
cuman : cuman
dryskin : dryskin
krg : krg
melembabkan : melembabkan
enak : enak
prep : prep
makeup : makeup
seller : seller
memproses : proses
barang : barang
packing : packing
aman : aman
reliable : reliable
cocok : cocok
normal : normal
selangnya : selang
sependek : pendek
sih : sih
hilang : hilang
kebawah : bawah
wtf : wtf
annoying : annoying
banget : banget
pakainya : pakai
coba : coba
cari : cari
tube : tube
ajalah : aja
kalo : kalo
packaging : packaging
kayak : kayak
gi : gi
mendaratpelembab : mendaratpelembab
kesayangan : sayang
mantap : mantap
pesanan : pesan
sesuai : sesuai
responsif : responsif
tebal : tebal
bravo : bravo
packagingnya : packagingnya
diperb

Pandas Apply: 100%|██████████| 457/457 [00:00<00:00, 57149.58it/s]

0                                    [no, review, found]
1      [kirim, senenggakja, pakai, gosend, biar, cepa...
2      [cepat, resap, ringan, kulit, wajah, cuman, dr...
3      [seller, cepat, proses, barang, packing, aman,...
4                                 [cocok, kulit, normal]
                             ...                        
452    [serum, ampuh, muka, sensitif, minyak, acne, p...
453    [tekstur, enak, banget, engenggak, bau, aneh, ...
454    [pakai, bulan, cocok, kirim, cepat, bagus, all...
455    [for, the, first, time, cowok, moist, skingame...
456    [pikir, kental, cair, ya, cepat, resap, sih, m...
Name: review_tokens_stemmed, Length: 457, dtype: object
                                              Review  \
0                                    no review found   
1  pengiriman sengaja pakai gosend biar cepat tap...   
2  cepat meresap ringan di kulit wajah cuman buat...   
3  seller sangat cepat memproses barang packing s...   
4                              cocok 




In [93]:
# import hasil preprocessing menjadi csv
df.to_csv("lemma.csv")