In [18]:
# Load data ke dalam data frame 
import pandas as pd

# Spesifikasi encoding diperlukan karena data tidak menggunakan UTF-8
df = pd.read_csv('scraping_somethinc.csv', encoding='latin-1')

df.head()

Unnamed: 0,Review,Product,Rating,Username,Date
0,masalah kulitku kerutan,SOMETHINC BAKUCHIOL Skinpair Oil Serum,bintang 5,A***y,1/30/2024
1,tipe kulitku kering,SOMETHINC CERAMIC SKIN Saviour Moisturizer Gel,bintang 5,A***y,1/30/2024
2,masalah kulitku kerutan,SOMETHINC HYALuronic9+ Advanced + B5 Serum,bintang 5,A***y,1/30/2024
3,"bagus banget lip tint nya warnanya jg bagus2, ...",SOMETHINC Holiday Lip Tint Kit (Ombrella Lip T...,bintang 5,rohimah,1/30/2024
4,"expire 2026, pengiriman cepat. cocok asal peng...","SOMETHINC AHA 7%, BHA 1%, PHA 3% Weekly Peelin...",bintang 5,n***a,1/30/2024


In [19]:
# Drop kolom yang tidak terpakai
df = df.drop(df.iloc[:,1:], axis=1)

# Cek data teratas
df.head()

Unnamed: 0,Review
0,masalah kulitku kerutan
1,tipe kulitku kering
2,masalah kulitku kerutan
3,"bagus banget lip tint nya warnanya jg bagus2, ..."
4,"expire 2026, pengiriman cepat. cocok asal peng..."


PREPROCESSING

In [20]:
# PROSES CASE FOLDING (menjadikan semua huruf di kolom Review menjadi huruf kecil)
df['Review'] = df['Review'].str.lower()

print('Case Folding Result: \n')
df.head()

Case Folding Result: 



Unnamed: 0,Review
0,masalah kulitku kerutan
1,tipe kulitku kering
2,masalah kulitku kerutan
3,"bagus banget lip tint nya warnanya jg bagus2, ..."
4,"expire 2026, pengiriman cepat. cocok asal peng..."


In [21]:
# import hasil case folding menjadi csv
df.to_csv("casefolding_somethinc.csv")

In [22]:
# CLEANING
import re # regex library

def remove_tweet_special(text):
    # remove tab, new line, and back slice
    text = text.replace('\\t'," ").replace('\\n', " ").replace('\\u', " ").replace('\\',"")
    # remove non ASCII (emoticon, chinese word, etc)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, link, hashtag
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    # remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")

df['Review'] = df['Review'].apply(remove_tweet_special)

# remove number
def remove_number(text):
    return  re.sub(r"\d+", "", text)

df['Review'] = df['Review'].apply(remove_number)

# remove punctuation and replace with space
def remove_punctuation(text):
    return re.sub(r'[.,]', ' ', text)

df['Review'] = df['Review'].apply(remove_punctuation)

# remove punctuation
def remove_symbol(text):
    # Menghapus simbol-simbol tidak standar dan menggantinya dengan spasi
    cleaned_text = re.sub(r'[^\w\s]', ' ', text)
    
    # Menghapus multiple whitespace
    cleaned_text = re.sub('\s+', ' ', cleaned_text).strip()
    
    return cleaned_text

df['Review'] = df['Review'].apply(remove_symbol)

# Menentukan ambang batas panjang string acak
threshold_length = 20

# Fungsi untuk menghapus string acak berdasarkan panjangnya
def hapus_string_acak_dengan_panjang(text):
    # Memeriksa panjang string dan menghapus jika melebihi ambang batas
    return ' '.join(word for word in text.split() if len(word) <= threshold_length)

# Menggunakan fungsi untuk menghapus string acak pada kolom 'Text'
df['Review'] = df['Review'].apply(hapus_string_acak_dengan_panjang)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    return text.strip()

df['Review'] = df['Review'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

df['Review'] = df['Review'].apply(remove_whitespace_multiple)

# remove single char
def remove_single_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

df['Review'] = df['Review'].apply(remove_single_char)

def remove_laughter(text):
    laughter_patterns = r'\b((ha)+h*|(he)+h*|(hi)+h*|(wk)+w*k*|(eh)+e*|(ah)+a*|(ih)+i*|(kw)+k*w*|(hem)+m*)\b'
    return re.sub(laughter_patterns, '', text, flags=re.IGNORECASE)

df['Review'] = df['Review'].apply(remove_laughter)

print('Cleaning Result : \n') 
print(df.head())

Cleaning Result : 

                                              Review
0                            masalah kulitku kerutan
1                                tipe kulitku kering
2                            masalah kulitku kerutan
3  bagus banget lip tint nya warnanya jg bagus ke...
4  expire pengiriman cepat cocok asal penggunaan ...


In [23]:
# import hasil cleaning menjadi csv
df.to_csv("cleaning_somethinc.csv")

In [24]:
# REMOVE DUPLICATE
df = df.drop_duplicates()
df = df.reset_index(drop=True)
# Menghapus baris yang kosong
df = df.dropna(subset=['Review'])
# Menghapus baris yang hanya berisi spasi atau whitespace
df = df[df['Review'].str.strip() != '']
df.head()

Unnamed: 0,Review
0,masalah kulitku kerutan
1,tipe kulitku kering
2,bagus banget lip tint nya warnanya jg bagus ke...
3,expire pengiriman cepat cocok asal penggunaan ...
4,no review found


In [25]:
# import hasil remove duplicate menjadi csv
df.to_csv("removedup_somethinc.csv")

In [26]:
# NORMALIZATION
import pandas as pd
import re

slang_dictionary = pd.read_csv('../colloquial-indonesian-lexicon2.csv')
slang_dict = pd.Series(slang_dictionary['formal'].values,index=slang_dictionary['slang']).to_dict()

slang_dictionary.head()

Unnamed: 0,slang,formal,In-dictionary,context,category1,category2,category3
0,woww,wow,1.0,wow,elongasi,0,0
1,aminn,amin,1.0,Selamat ulang tahun kakak tulus semoga panjang...,elongasi,0,0
2,met,selamat,1.0,Met hari netaas kak!? Wish you all the best @t...,abreviasi,0,0
3,netaas,menetas,1.0,Met hari netaas kak!? Wish you all the best @t...,afiksasi,elongasi,0
4,keberpa,keberapa,0.0,Birthday yg keberpa kak?,abreviasi,0,0


In [27]:
# Normalisasi kata menggunakan kamus colloquial-indonesian-lexicon2.csv
def Slangwords(text, slang_dict):
    for word in text.split():
        if word in slang_dict.keys():
            # menambahkan \b untuk menandakan batas kata di sekitar kata slang
            text = re.sub(r'\b{}\b'.format(re.escape(word)), slang_dict[word], text)
    
    text = re.sub('@[\w]+', '', text)
    return text

df['Review_norm'] = df['Review'].apply(lambda x: Slangwords(x, slang_dict))
print(df.head())

                                              Review  \
0                            masalah kulitku kerutan   
1                                tipe kulitku kering   
2  bagus banget lip tint nya warnanya jg bagus ke...   
3  expire pengiriman cepat cocok asal penggunaan ...   
4                                    no review found   

                                         Review_norm  
0                            masalah kulitku kerutan  
1                                tipe kulitku kering  
2  bagus banget lip tint nya warnanya juga bagus ...  
3  expire pengiriman cepat cocok asal penggunaan ...  
4                                    no review found  


In [28]:
# import hasil replace slang words menjadi csv
df.to_csv("normalization_somethinc.csv")

In [29]:
# PROSES TOKENIZING (word_tokenize() untuk memecah string kedalam tokens)

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Tokenizing
# NLTK word tokenize 
def word_tokenize_wrapper(text):
    return word_tokenize(text)

df['review_tokens'] = df['Review_norm'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n') 
print(df.head())

Tokenizing Result : 

                                              Review  \
0                            masalah kulitku kerutan   
1                                tipe kulitku kering   
2  bagus banget lip tint nya warnanya jg bagus ke...   
3  expire pengiriman cepat cocok asal penggunaan ...   
4                                    no review found   

                                         Review_norm  \
0                            masalah kulitku kerutan   
1                                tipe kulitku kering   
2  bagus banget lip tint nya warnanya juga bagus ...   
3  expire pengiriman cepat cocok asal penggunaan ...   
4                                    no review found   

                                       review_tokens  
0                        [masalah, kulitku, kerutan]  
1                            [tipe, kulitku, kering]  
2  [bagus, banget, lip, tint, nya, warnanya, juga...  
3  [expire, pengiriman, cepat, cocok, asal, pengg...  
4                            

In [30]:
# import hasil tokenizing menjadi csv
df.to_csv("tokenizing_somethinc.csv")

In [31]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
# PROSES FILTERING (Stopword Removal) menggunakan library sastrawi
factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()

# remove stopword pada list token
def stopwords_removal(words):
    filtered_words = [stopword.remove(w) for w in words]
    filtered_words = [word for word in filtered_words if word != '']  # Memfilter kata-kata kosong
    return filtered_words

df['review_tokens_SR'] = df['review_tokens'].apply(stopwords_removal) 

print(df.head())

                                              Review  \
0                            masalah kulitku kerutan   
1                                tipe kulitku kering   
2  bagus banget lip tint nya warnanya jg bagus ke...   
3  expire pengiriman cepat cocok asal penggunaan ...   
4                                    no review found   

                                         Review_norm  \
0                            masalah kulitku kerutan   
1                                tipe kulitku kering   
2  bagus banget lip tint nya warnanya juga bagus ...   
3  expire pengiriman cepat cocok asal penggunaan ...   
4                                    no review found   

                                       review_tokens  \
0                        [masalah, kulitku, kerutan]   
1                            [tipe, kulitku, kering]   
2  [bagus, banget, lip, tint, nya, warnanya, juga...   
3  [expire, pengiriman, cepat, cocok, asal, pengg...   
4                                [no, review, 

In [32]:
# import hasil stopword removal menjadi csv
df.to_csv("stopword_somethinc.csv")

In [33]:
# PROSES LEMMATIZATION 
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter

# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    return stemmer.stem(term)

term_dict = {}

for document in df['review_tokens_SR']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))
print("------------------------")

for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term,":" ,term_dict[term])
    
print(term_dict)
print("------------------------")


# apply stemmed term to dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

df['review_tokens_stemmed'] = df['review_tokens_SR'].swifter.apply(get_stemmed_term)
print(df['review_tokens_stemmed'])
print(df.head())

1054
------------------------
masalah : masalah
kulitku : kulit
kerutan : kerut
tipe : tipe
kering : kering
bagus : bagus
banget : banget
lip : lip
tint : tint
nya : nya
warnanya : warna
kemasannya : kemas
gemas : gemas
kecil : kecil
terima : terima
kasih : kasih
expire : expire
pengiriman : kirim
cepat : cepat
cocok : cocok
asal : asal
penggunaan : guna
sesuai : sesuai
instruksi : instruksi
no : no
review : review
found : found
gift : gift
pembelian : beli
lucu : lucu
suka : suka
ringan : ringan
kayak : kayak
enggak : enggak
pakai : pakai
apa : apa
terus : terus
benaran : benar
transferproof : transferproof
mana : mana
design : design
sensitif : sensitif
bikin : bikin
sama : sama
sekali : sekali
datang : datang
estimasi : estimasi
packing : packing
aman : aman
jne : jne
tokopedia : tokopedia
pertama : pertama
coba : coba
nih : nih
serum : serum
moga : moga
kurir : kurir
top : top
berminyak : minyak
jerawat : jerawat
pori : pori
besar : besar
retinolnya : retinolnya
somethinc : somethi

Pandas Apply: 100%|██████████| 394/394 [00:00<00:00, 66210.82it/s]

0                                [masalah, kulit, kerut]
1                                  [tipe, kulit, kering]
2      [bagus, banget, lip, tint, nya, warna, bagus, ...
3      [expire, kirim, cepat, cocok, asal, guna, sesu...
4                                    [no, review, found]
                             ...                        
390    [tipe, kulit, minyak, alhamdulillah, cocok, th...
391    [beli, serum, pas, harga, flash, sale, lumayan...
392    [jujur, buat, aku, enggak, cocok, aku, gapaham...
393    [hasil, akhir, bagus, kayak, kulit, bersih, se...
394                                  [bagus, tahan, air]
Name: review_tokens_stemmed, Length: 394, dtype: object
                                              Review  \
0                            masalah kulitku kerutan   
1                                tipe kulitku kering   
2  bagus banget lip tint nya warnanya jg bagus ke...   
3  expire pengiriman cepat cocok asal penggunaan ...   
4                                    




In [34]:
# Menghapus baris yang kosong
df = df.dropna(subset=['review_tokens_stemmed'])
# Menghapus baris yang hanya berisi spasi atau whitespace
df = df[df['review_tokens_stemmed'].str.strip() != '']
# Menghapus baris yang memiliki list kosong
df = df[df['review_tokens_stemmed'].apply(lambda x: x != [])]
df.head(15)

Unnamed: 0,Review,Review_norm,review_tokens,review_tokens_SR,review_tokens_stemmed
0,masalah kulitku kerutan,masalah kulitku kerutan,"[masalah, kulitku, kerutan]","[masalah, kulitku, kerutan]","[masalah, kulit, kerut]"
1,tipe kulitku kering,tipe kulitku kering,"[tipe, kulitku, kering]","[tipe, kulitku, kering]","[tipe, kulit, kering]"
2,bagus banget lip tint nya warnanya jg bagus ke...,bagus banget lip tint nya warnanya juga bagus ...,"[bagus, banget, lip, tint, nya, warnanya, juga...","[bagus, banget, lip, tint, nya, warnanya, bagu...","[bagus, banget, lip, tint, nya, warna, bagus, ..."
3,expire pengiriman cepat cocok asal penggunaan ...,expire pengiriman cepat cocok asal penggunaan ...,"[expire, pengiriman, cepat, cocok, asal, pengg...","[expire, pengiriman, cepat, cocok, asal, pengg...","[expire, kirim, cepat, cocok, asal, guna, sesu..."
4,no review found,no review found,"[no, review, found]","[no, review, found]","[no, review, found]"
5,dapet gift setiap pembelian lucu bgt sukaaa ma...,dapat gift setiap pembelian lucu banget suka t...,"[dapat, gift, setiap, pembelian, lucu, banget,...","[gift, pembelian, lucu, banget, suka, terima, ...","[gift, beli, lucu, banget, suka, terima, kasih]"
6,ringan banget kaya ga pake apa apa terus bener...,ringan banget kayak enggak pakai apa apa terus...,"[ringan, banget, kayak, enggak, pakai, apa, ap...","[ringan, banget, kayak, enggak, pakai, apa, ap...","[ringan, banget, kayak, enggak, pakai, apa, ap..."
7,tipe kulitku sensitif dan cocok banget pake in...,tipe kulitku sensitif dan cocok banget pakai i...,"[tipe, kulitku, sensitif, dan, cocok, banget, ...","[tipe, kulitku, sensitif, cocok, banget, pakai...","[tipe, kulit, sensitif, cocok, banget, pakai, ..."
8,datang sesuai estimasi pengiriman packing aman...,datang sesuai estimasi pengiriman packing aman...,"[datang, sesuai, estimasi, pengiriman, packing...","[datang, sesuai, estimasi, pengiriman, packing...","[datang, sesuai, estimasi, kirim, packing, ama..."
9,pertama coba nih serum moga cocok packing aman...,pertama coba nih serum moga cocok packing aman...,"[pertama, coba, nih, serum, moga, cocok, packi...","[pertama, coba, nih, serum, moga, cocok, packi...","[pertama, coba, nih, serum, moga, cocok, packi..."


In [35]:
# import hasil preprocessing menjadi csv
df.to_csv("lemma_somethinc.csv")