In [37]:
# Load data ke dalam data frame 
import pandas as pd

# Spesifikasi encoding diperlukan karena data tidak menggunakan UTF-8
df = pd.read_csv('scraping_somethinc.csv', encoding='latin-1')

df.head()

Unnamed: 0,Review,Product,Rating,Username,Date
0,masalah kulitku kerutan,SOMETHINC BAKUCHIOL Skinpair Oil Serum,bintang 5,A***y,1/30/2024
1,tipe kulitku kering,SOMETHINC CERAMIC SKIN Saviour Moisturizer Gel,bintang 5,A***y,1/30/2024
2,masalah kulitku kerutan,SOMETHINC HYALuronic9+ Advanced + B5 Serum,bintang 5,A***y,1/30/2024
3,"bagus banget lip tint nya warnanya jg bagus2, ...",SOMETHINC Holiday Lip Tint Kit (Ombrella Lip T...,bintang 5,rohimah,1/30/2024
4,"expire 2026, pengiriman cepat. cocok asal peng...","SOMETHINC AHA 7%, BHA 1%, PHA 3% Weekly Peelin...",bintang 5,n***a,1/30/2024


In [38]:
# Drop kolom yang tidak terpakai
df = df.drop(df.iloc[:,1:], axis=1)

# Cek data teratas
df.head()

Unnamed: 0,Review
0,masalah kulitku kerutan
1,tipe kulitku kering
2,masalah kulitku kerutan
3,"bagus banget lip tint nya warnanya jg bagus2, ..."
4,"expire 2026, pengiriman cepat. cocok asal peng..."


PREPROCESSING

In [39]:
# PROSES CASE FOLDING (menjadikan semua huruf di kolom Review menjadi huruf kecil)
df['Review'] = df['Review'].str.lower()

print('Case Folding Result: \n')
df.head()

Case Folding Result: 



Unnamed: 0,Review
0,masalah kulitku kerutan
1,tipe kulitku kering
2,masalah kulitku kerutan
3,"bagus banget lip tint nya warnanya jg bagus2, ..."
4,"expire 2026, pengiriman cepat. cocok asal peng..."


In [40]:
# import hasil case folding menjadi csv
df.to_csv("casefolding_somethinc.csv")

In [41]:
# CLEANING
import re # regex library

def remove_tweet_special(text):
    # remove tab, new line, and back slice
    text = text.replace('\\t'," ").replace('\\n', " ").replace('\\u', " ").replace('\\',"")
    # remove non ASCII (emoticon, chinese word, etc)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, link, hashtag
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    # remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")

df['Review'] = df['Review'].apply(remove_tweet_special)

# remove number
def remove_number(text):
    return  re.sub(r"\d+", "", text)

df['Review'] = df['Review'].apply(remove_number)

# remove punctuation and replace with space
def remove_punctuation(text):
    return re.sub(r'[.,]', ' ', text)

df['Review'] = df['Review'].apply(remove_punctuation)

# remove punctuation
def remove_symbol(text):
    # Menghapus simbol-simbol tidak standar dan menggantinya dengan spasi
    cleaned_text = re.sub(r'[^\w\s]', ' ', text)
    
    # Menghapus multiple whitespace
    cleaned_text = re.sub('\s+', ' ', cleaned_text).strip()
    
    return cleaned_text

df['Review'] = df['Review'].apply(remove_symbol)

# Menentukan ambang batas panjang string acak
threshold_length = 20

# Fungsi untuk menghapus string acak berdasarkan panjangnya
def hapus_string_acak_dengan_panjang(text):
    # Memeriksa panjang string dan menghapus jika melebihi ambang batas
    return ' '.join(word for word in text.split() if len(word) <= threshold_length)

# Menggunakan fungsi untuk menghapus string acak pada kolom 'Text'
df['Review'] = df['Review'].apply(hapus_string_acak_dengan_panjang)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    return text.strip()

df['Review'] = df['Review'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

df['Review'] = df['Review'].apply(remove_whitespace_multiple)

# remove single char
def remove_single_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

df['Review'] = df['Review'].apply(remove_single_char)

print('Cleaning Result : \n') 
print(df.head())

def remove_laughter(text):
    laughter_patterns = r'\b((ha)+h*|(he)+h*|(hi)+h*|(wk)+w*k*|(eh)+e*|(ah)+a*|(ih)+i*|(kw)+k*w*|(hem)+m*)\b'
    return re.sub(laughter_patterns, '', text, flags=re.IGNORECASE)

df['Review'] = df['Review'].apply(remove_laughter)

Cleaning Result : 

                                              Review
0                            masalah kulitku kerutan
1                                tipe kulitku kering
2                            masalah kulitku kerutan
3  bagus banget lip tint nya warnanya jg bagus ke...
4  expire pengiriman cepat cocok asal penggunaan ...


In [42]:
# import hasil cleaning menjadi csv
df.to_csv("cleaning_somethinc.csv")

In [25]:
# REMOVE DUPLICATE
df = df.drop_duplicates()
df = df.reset_index(drop=True)
# Menghapus baris yang kosong
df = df.dropna(subset=['Review'])
# Menghapus baris yang hanya berisi spasi atau whitespace
df = df[df['Review'].str.strip() != '']
df.head()

Unnamed: 0,Review
0,it beautiful package bintang buat safety nya
1,tipe kulitku kering
2,pengiriman agak lama karena beda provinsi tapi...
3,top
4,tipe kulitku berminyak tipe kulitku kering tip...


In [26]:
# import hasil remove duplicate menjadi csv
df.to_csv("removedup_avoskin.csv")

In [27]:
# NORMALIZATION
import pandas as pd
import re

slang_dictionary = pd.read_csv('../colloquial-indonesian-lexicon2.csv')
slang_dict = pd.Series(slang_dictionary['formal'].values,index=slang_dictionary['slang']).to_dict()

slang_dictionary.head()

Unnamed: 0,slang,formal,In-dictionary,context,category1,category2,category3
0,woww,wow,1.0,wow,elongasi,0,0
1,aminn,amin,1.0,Selamat ulang tahun kakak tulus semoga panjang...,elongasi,0,0
2,met,selamat,1.0,Met hari netaas kak!? Wish you all the best @t...,abreviasi,0,0
3,netaas,menetas,1.0,Met hari netaas kak!? Wish you all the best @t...,afiksasi,elongasi,0
4,keberpa,keberapa,0.0,Birthday yg keberpa kak?,abreviasi,0,0


In [28]:
# Normalisasi kata menggunakan kamus colloquial-indonesian-lexicon2.csv
def Slangwords(text, slang_dict):
    for word in text.split():
        if word in slang_dict.keys():
            # menambahkan \b untuk menandakan batas kata di sekitar kata slang
            text = re.sub(r'\b{}\b'.format(re.escape(word)), slang_dict[word], text)
    
    text = re.sub('@[\w]+', '', text)
    return text

df['Review_norm'] = df['Review'].apply(lambda x: Slangwords(x, slang_dict))
print(df.head())

                                              Review  \
0      it  beautiful package bintang buat safety nya   
1                                tipe kulitku kering   
2  pengiriman agak lama karena beda provinsi tapi...   
3                                                top   
4  tipe kulitku berminyak tipe kulitku kering tip...   

                                         Review_norm  
0     itu  beautiful package bintang buat safety nya  
1                                tipe kulitku kering  
2  pengiriman agak lama karena beda provinsi tapi...  
3                                                top  
4  tipe kulitku berminyak tipe kulitku kering tip...  


In [29]:
# import hasil replace slang words menjadi csv
df.to_csv("normalization_avoskin.csv")

In [30]:
# PROSES TOKENIZING (word_tokenize() untuk memecah string kedalam tokens)

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Tokenizing
# NLTK word tokenize 
def word_tokenize_wrapper(text):
    return word_tokenize(text)

df['review_tokens'] = df['Review_norm'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n') 
print(df.head())

Tokenizing Result : 

                                              Review  \
0      it  beautiful package bintang buat safety nya   
1                                tipe kulitku kering   
2  pengiriman agak lama karena beda provinsi tapi...   
3                                                top   
4  tipe kulitku berminyak tipe kulitku kering tip...   

                                         Review_norm  \
0     itu  beautiful package bintang buat safety nya   
1                                tipe kulitku kering   
2  pengiriman agak lama karena beda provinsi tapi...   
3                                                top   
4  tipe kulitku berminyak tipe kulitku kering tip...   

                                       review_tokens  
0  [itu, beautiful, package, bintang, buat, safet...  
1                            [tipe, kulitku, kering]  
2  [pengiriman, agak, lama, karena, beda, provins...  
3                                              [top]  
4  [tipe, kulitku, berminyak,

In [31]:
# import hasil tokenizing menjadi csv
df.to_csv("tokenizing_avoskin.csv")

In [32]:
# PROSES FILTERING (Stopword Removal)
from nltk.corpus import stopwords

# get stopword indonesia from NLTK stopword
list_stopwords = stopwords.words('indonesian')

# convert list to dictionary
list_stopwords = set(list_stopwords)

# remove stopword pada list token
def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]

df['review_tokens_SR'] = df['review_tokens'].apply(stopwords_removal) 

print(df.head())

                                              Review  \
0      it  beautiful package bintang buat safety nya   
1                                tipe kulitku kering   
2  pengiriman agak lama karena beda provinsi tapi...   
3                                                top   
4  tipe kulitku berminyak tipe kulitku kering tip...   

                                         Review_norm  \
0     itu  beautiful package bintang buat safety nya   
1                                tipe kulitku kering   
2  pengiriman agak lama karena beda provinsi tapi...   
3                                                top   
4  tipe kulitku berminyak tipe kulitku kering tip...   

                                       review_tokens  \
0  [itu, beautiful, package, bintang, buat, safet...   
1                            [tipe, kulitku, kering]   
2  [pengiriman, agak, lama, karena, beda, provins...   
3                                              [top]   
4  [tipe, kulitku, berminyak, tipe, kulitku, k

In [33]:
# import hasil stopword removal menjadi csv
df.to_csv("stopword_avoskin.csv")

In [34]:
# PROSES LEMMATIZATION 
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter

# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    return stemmer.stem(term)

term_dict = {}

for document in df['review_tokens_SR']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))
print("------------------------")

for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term,":" ,term_dict[term])
    
print(term_dict)
print("------------------------")


# apply stemmed term to dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

df['review_tokens_stemmed'] = df['review_tokens_SR'].swifter.apply(get_stemmed_term)
print(df['review_tokens_stemmed'])
print(df.head())

910
------------------------


beautiful : beautiful
package : package
bintang : bintang
safety : safety
nya : nya
tipe : tipe
kulitku : kulit
kering : kering
pengiriman : kirim
beda : beda
provinsi : provinsi
gapapa : gapapa
packingnya : packingnya
aman : aman
banget : banget
semoga : moga
cocok : cocok
memperbaiki : baik
kulit : kulit
berminyak : minyak
top : top
normal : normal
kombinasi : kombinasi
kerutan : kerut
jerawat : jerawat
pori : pori
sensitif : sensitif
exp : exp
no : no
review : review
found : found
sih : sih
beli : beli
bagus : bagus
ya : ya
muka : muka
kemasan : kemas
biru : biru
viral : viral
perskincare : perskincare
an : an
semenjak : semenjak
hamil : hamil
stop : stop
pakai : pakai
produk : produk
bayi : bayi
coba : coba
kangen : kangen
sensasinya : sensasi
memakai : pakai
feelnya : feelnya
ngaruh : ngaruh
suka : suka
muncul : muncul
barang : barang
sesuai : sesuai
pesanan : pesan
oke : oke
saran : saran
facial : facial
wash : wash
fragrance : fragrance
biar : biar
baunya : bau
enak : enak
nyama

Pandas Apply: 100%|██████████| 401/401 [00:00<00:00, 65006.61it/s]

0             [beautiful, package, bintang, safety, nya]
1                                  [tipe, kulit, kering]
2      [kirim, beda, provinsi, gapapa, packingnya, am...
3                                                  [top]
4      [tipe, kulit, minyak, tipe, kulit, kering, tip...
                             ...                        
397    [melembabkan, banget, tekstur, kental, cepat, ...
398                 [beli, cocok, kulit, bikin, iritasi]
399         [order, polypeptide, terima, aqua, ceramide]
400    [produk, sesuai, deskripsi, jual, recommended,...
401                 [pakai, cocok, bikin, kulit, kering]
Name: review_tokens_stemmed, Length: 401, dtype: object
                                              Review  \
0      it  beautiful package bintang buat safety nya   
1                                tipe kulitku kering   
2  pengiriman agak lama karena beda provinsi tapi...   
3                                                top   
4  tipe kulitku berminyak tipe kulitk




In [35]:
# import hasil preprocessing menjadi csv
df.to_csv("lemma_avoskin.csv")