In [36]:
# Load data ke dalam data frame 
import pandas as pd

# Spesifikasi encoding diperlukan karena data tidak menggunakan UTF-8
df = pd.read_csv('scraping_avoskin.csv', encoding='latin-1')

df.head()

Unnamed: 0,Review,Product,Rating,Username,Date
0,"it's beautiful package, bintang 5 buat safety ...",Toner Avoskin Miraculous Refining 100ml-AHA BH...,bintang 5,Dafa,2024-01-30
1,tipe kulitku kering,Cream Malam Avoskin Ultra Brightening Cream 10...,bintang 5,F***d,2024-01-30
2,"pengiriman agak lama karena beda provinsi, tap...",Toner Avoskin Miraculous Refining 100ml-AHA BH...,bintang 5,A***a,2024-01-30
3,top..,Face Mist Avoskin Hydrating Treatment Essence ...,bintang 5,Dina,2024-01-30
4,tipe kulitku berminyak tipe kulitku kering tip...,Toner Avoskin Your Skin Bae Marine Collagen 10...,bintang 5,i***a,2024-01-30


In [37]:
# Drop kolom yang tidak terpakai
df = df.drop(df.iloc[:,1:], axis=1)

# Cek data teratas
df.head()

Unnamed: 0,Review
0,"it's beautiful package, bintang 5 buat safety ..."
1,tipe kulitku kering
2,"pengiriman agak lama karena beda provinsi, tap..."
3,top..
4,tipe kulitku berminyak tipe kulitku kering tip...


PREPROCESSING

In [38]:
# PROSES CASE FOLDING (menjadikan semua huruf di kolom Review menjadi huruf kecil)
df['Review'] = df['Review'].str.lower()

print('Case Folding Result: \n')
df.head()

Case Folding Result: 



Unnamed: 0,Review
0,"it's beautiful package, bintang 5 buat safety ..."
1,tipe kulitku kering
2,"pengiriman agak lama karena beda provinsi, tap..."
3,top..
4,tipe kulitku berminyak tipe kulitku kering tip...


In [39]:
# import hasil case folding menjadi csv
df.to_csv("casefolding_avoskin.csv")

In [40]:
# CLEANING
import re # regex library

def remove_tweet_special(text):
    # remove tab, new line, and back slice
    text = text.replace('\\t'," ").replace('\\n', " ").replace('\\u', " ").replace('\\',"")
    # remove non ASCII (emoticon, chinese word, etc)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, link, hashtag
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    # remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")

df['Review'] = df['Review'].apply(remove_tweet_special)

# remove number
def remove_number(text):
    return  re.sub(r"\d+", "", text)

df['Review'] = df['Review'].apply(remove_number)

# remove punctuation and replace with space
def remove_punctuation(text):
    return re.sub(r'[.,]', ' ', text)

df['Review'] = df['Review'].apply(remove_punctuation)

# remove punctuation
def remove_symbol(text):
    # Menghapus simbol-simbol tidak standar dan menggantinya dengan spasi
    cleaned_text = re.sub(r'[^\w\s]', ' ', text)
    
    # Menghapus multiple whitespace
    cleaned_text = re.sub('\s+', ' ', cleaned_text).strip()
    
    return cleaned_text

df['Review'] = df['Review'].apply(remove_symbol)

# Menentukan ambang batas panjang string acak
threshold_length = 20

# Fungsi untuk menghapus string acak berdasarkan panjangnya
def hapus_string_acak_dengan_panjang(text):
    # Memeriksa panjang string dan menghapus jika melebihi ambang batas
    return ' '.join(word for word in text.split() if len(word) <= threshold_length)

# Menggunakan fungsi untuk menghapus string acak pada kolom 'Text'
df['Review'] = df['Review'].apply(hapus_string_acak_dengan_panjang)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    return text.strip()

df['Review'] = df['Review'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

df['Review'] = df['Review'].apply(remove_whitespace_multiple)

# remove single char
def remove_single_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

df['Review'] = df['Review'].apply(remove_single_char)

def remove_laughter(text):
    laughter_patterns = r'\b((ha)+h*|(he)+h*|(hi)+h*|(wk)+w*k*|(eh)+e*|(ah)+a*|(ih)+i*|(kw)+k*w*|(hem)+m*)\b'
    return re.sub(laughter_patterns, '', text, flags=re.IGNORECASE)

df['Review'] = df['Review'].apply(remove_laughter)

print('Cleaning Result : \n') 
print(df.head())

Cleaning Result : 

                                              Review
0      it  beautiful package bintang buat safety nya
1                                tipe kulitku kering
2  pengiriman agak lama karena beda provinsi tapi...
3                                                top
4  tipe kulitku berminyak tipe kulitku kering tip...


In [41]:
# import hasil cleaning menjadi csv
df.to_csv("cleaning_avoskin.csv")

In [42]:
# REMOVE DUPLICATE
df = df.drop_duplicates()
df = df.reset_index(drop=True)
# Menghapus baris yang kosong
df = df.dropna(subset=['Review'])
# Menghapus baris yang hanya berisi spasi atau whitespace
df = df[df['Review'].str.strip() != '']
df.head()

Unnamed: 0,Review
0,it beautiful package bintang buat safety nya
1,tipe kulitku kering
2,pengiriman agak lama karena beda provinsi tapi...
3,top
4,tipe kulitku berminyak tipe kulitku kering tip...


In [43]:
# import hasil remove duplicate menjadi csv
df.to_csv("removedup_avoskin.csv")

In [44]:
# NORMALIZATION
import pandas as pd
import re

slang_dictionary = pd.read_csv('../colloquial-indonesian-lexicon2.csv')
slang_dict = pd.Series(slang_dictionary['formal'].values,index=slang_dictionary['slang']).to_dict()

slang_dictionary.head()

Unnamed: 0,slang,formal,In-dictionary,context,category1,category2,category3
0,woww,wow,1.0,wow,elongasi,0,0
1,aminn,amin,1.0,Selamat ulang tahun kakak tulus semoga panjang...,elongasi,0,0
2,met,selamat,1.0,Met hari netaas kak!? Wish you all the best @t...,abreviasi,0,0
3,netaas,menetas,1.0,Met hari netaas kak!? Wish you all the best @t...,afiksasi,elongasi,0
4,keberpa,keberapa,0.0,Birthday yg keberpa kak?,abreviasi,0,0


In [45]:
# Normalisasi kata menggunakan kamus colloquial-indonesian-lexicon2.csv
def Slangwords(text, slang_dict):
    for word in text.split():
        if word in slang_dict.keys():
            # menambahkan \b untuk menandakan batas kata di sekitar kata slang
            text = re.sub(r'\b{}\b'.format(re.escape(word)), slang_dict[word], text)
    
    text = re.sub('@[\w]+', '', text)
    return text

df['Review_norm'] = df['Review'].apply(lambda x: Slangwords(x, slang_dict))
print(df.head())

                                              Review  \
0      it  beautiful package bintang buat safety nya   
1                                tipe kulitku kering   
2  pengiriman agak lama karena beda provinsi tapi...   
3                                                top   
4  tipe kulitku berminyak tipe kulitku kering tip...   

                                         Review_norm  
0     itu  beautiful package bintang buat safety nya  
1                                tipe kulitku kering  
2  pengiriman agak lama karena beda provinsi tapi...  
3                                                top  
4  tipe kulitku berminyak tipe kulitku kering tip...  


In [46]:
# import hasil replace slang words menjadi csv
df.to_csv("normalization_avoskin.csv")

In [47]:
# PROSES TOKENIZING (word_tokenize() untuk memecah string kedalam tokens)

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Tokenizing
# NLTK word tokenize 
def word_tokenize_wrapper(text):
    return word_tokenize(text)

df['review_tokens'] = df['Review_norm'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n') 
print(df.head())

Tokenizing Result : 

                                              Review  \
0      it  beautiful package bintang buat safety nya   
1                                tipe kulitku kering   
2  pengiriman agak lama karena beda provinsi tapi...   
3                                                top   
4  tipe kulitku berminyak tipe kulitku kering tip...   

                                         Review_norm  \
0     itu  beautiful package bintang buat safety nya   
1                                tipe kulitku kering   
2  pengiriman agak lama karena beda provinsi tapi...   
3                                                top   
4  tipe kulitku berminyak tipe kulitku kering tip...   

                                       review_tokens  
0  [itu, beautiful, package, bintang, buat, safet...  
1                            [tipe, kulitku, kering]  
2  [pengiriman, agak, lama, karena, beda, provins...  
3                                              [top]  
4  [tipe, kulitku, berminyak,

In [48]:
# import hasil tokenizing menjadi csv
df.to_csv("tokenizing_avoskin.csv")

In [49]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
# PROSES FILTERING (Stopword Removal) menggunakan library sastrawi
factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()

# remove stopword pada list token
def stopwords_removal(words):
    filtered_words = [stopword.remove(w) for w in words]
    filtered_words = [word for word in filtered_words if word != '']  # Memfilter kata-kata kosong
    return filtered_words

df['review_tokens_SR'] = df['review_tokens'].apply(stopwords_removal) 

print(df.head())

                                              Review  \
0      it  beautiful package bintang buat safety nya   
1                                tipe kulitku kering   
2  pengiriman agak lama karena beda provinsi tapi...   
3                                                top   
4  tipe kulitku berminyak tipe kulitku kering tip...   

                                         Review_norm  \
0     itu  beautiful package bintang buat safety nya   
1                                tipe kulitku kering   
2  pengiriman agak lama karena beda provinsi tapi...   
3                                                top   
4  tipe kulitku berminyak tipe kulitku kering tip...   

                                       review_tokens  \
0  [itu, beautiful, package, bintang, buat, safet...   
1                            [tipe, kulitku, kering]   
2  [pengiriman, agak, lama, karena, beda, provins...   
3                                              [top]   
4  [tipe, kulitku, berminyak, tipe, kulitku, k

In [50]:
# import hasil stopword removal menjadi csv
df.to_csv("stopword_avoskin.csv")

In [51]:
# PROSES LEMMATIZATION 
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter

# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    return stemmer.stem(term)

term_dict = {}

for document in df['review_tokens_SR']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))
print("------------------------")

for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term,":" ,term_dict[term])
    
print(term_dict)
print("------------------------")


# apply stemmed term to dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

df['review_tokens_stemmed'] = df['review_tokens_SR'].swifter.apply(get_stemmed_term)
print(df['review_tokens_stemmed'])
print(df.head())

1039
------------------------


beautiful : beautiful
package : package
bintang : bintang
buat : buat
safety : safety
nya : nya
tipe : tipe
kulitku : kulit
kering : kering
pengiriman : kirim
lama : lama
beda : beda
provinsi : provinsi
gapapa : gapapa
packingnya : packingnya
aman : aman
banget : banget
semoga : moga
cocok : cocok
memperbaiki : baik
kulit : kulit
aku : aku
berminyak : minyak
top : top
normal : normal
kombinasi : kombinasi
masalah : masalah
kerutan : kerut
jerawat : jerawat
pori : pori
besar : besar
sensitif : sensitif
exp : exp
no : no
review : review
found : found
pertama : pertama
dulu : dulu
sih : sih
beli : beli
bagus : bagus
muka : muka
waktu : waktu
kemasan : kemas
biru : biru
mungkin : mungkin
tahun : tahun
lalu : lalu
viral : viral
perskincare : perskincare
an : an
semenjak : semenjak
hamil : hamil
stop : stop
pakai : pakai
produk : produk
bayi : bayi
mau : mau
coba : coba
kangen : kangen
sensasinya : sensasi
memakai : pakai
memang : memang
feelnya : feelnya
ngaruh : ngaruh
suka : suka
muncul :

Pandas Apply: 100%|██████████| 401/401 [00:00<00:00, 21404.68it/s]

0       [beautiful, package, bintang, buat, safety, nya]
1                                  [tipe, kulit, kering]
2      [kirim, lama, beda, provinsi, gapapa, packingn...
3                                                  [top]
4      [tipe, kulit, minyak, tipe, kulit, kering, tip...
                             ...                        
397    [melembabkan, banget, tekstur, kental, cepat, ...
398         [beli, cocok, kulit, enggak, bikin, iritasi]
399         [order, polypeptide, terima, aqua, ceramide]
400    [produk, sesuai, deskripsi, jual, recommended,...
401    [selalu, pakai, cocok, enggak, bikin, kulit, k...
Name: review_tokens_stemmed, Length: 401, dtype: object
                                              Review  \
0      it  beautiful package bintang buat safety nya   
1                                tipe kulitku kering   
2  pengiriman agak lama karena beda provinsi tapi...   
3                                                top   
4  tipe kulitku berminyak tipe kulitk




In [52]:
# Menghapus baris yang kosong
df = df.dropna(subset=['review_tokens_stemmed'])
# Menghapus baris yang hanya berisi spasi atau whitespace
df = df[df['review_tokens_stemmed'].str.strip() != '']
# Menghapus baris yang memiliki list kosong
df = df[df['review_tokens_stemmed'].apply(lambda x: x != [])]
df.head(15)

Unnamed: 0,Review,Review_norm,review_tokens,review_tokens_SR,review_tokens_stemmed
0,it beautiful package bintang buat safety nya,itu beautiful package bintang buat safety nya,"[itu, beautiful, package, bintang, buat, safet...","[beautiful, package, bintang, buat, safety, nya]","[beautiful, package, bintang, buat, safety, nya]"
1,tipe kulitku kering,tipe kulitku kering,"[tipe, kulitku, kering]","[tipe, kulitku, kering]","[tipe, kulit, kering]"
2,pengiriman agak lama karena beda provinsi tapi...,pengiriman agak lama karena beda provinsi tapi...,"[pengiriman, agak, lama, karena, beda, provins...","[pengiriman, lama, beda, provinsi, gapapa, pac...","[kirim, lama, beda, provinsi, gapapa, packingn..."
3,top,top,[top],[top],[top]
4,tipe kulitku berminyak tipe kulitku kering tip...,tipe kulitku berminyak tipe kulitku kering tip...,"[tipe, kulitku, berminyak, tipe, kulitku, keri...","[tipe, kulitku, berminyak, tipe, kulitku, keri...","[tipe, kulit, minyak, tipe, kulit, kering, tip..."
5,exp,exp,[exp],[exp],[exp]
6,no review found,no review found,"[no, review, found]","[no, review, found]","[no, review, found]"
7,pertama banget dulu sih beli bagus bgt ya di m...,pertama banget dulu sih beli bagus banget ya d...,"[pertama, banget, dulu, sih, beli, bagus, bang...","[pertama, banget, dulu, sih, beli, bagus, bang...","[pertama, banget, dulu, sih, beli, bagus, bang..."
8,tipe kulitku berminyak,tipe kulitku berminyak,"[tipe, kulitku, berminyak]","[tipe, kulitku, berminyak]","[tipe, kulit, minyak]"
9,tipe kulitku berminyak masalah kulitku pori besar,tipe kulitku berminyak masalah kulitku pori besar,"[tipe, kulitku, berminyak, masalah, kulitku, p...","[tipe, kulitku, berminyak, masalah, kulitku, p...","[tipe, kulit, minyak, masalah, kulit, pori, be..."


In [53]:
# import hasil preprocessing menjadi csv
df.to_csv("lemma_avoskin.csv")