In [30]:
%pip install openpyxl
%pip install xlrd
%pip install googletrans==3.0.0
%pip install langdetect




Note: you may need to restart the kernel to use updated packages.


In [31]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

# Membaca data dari file Excel
data = pd.read_excel('merge_11_cabang.xlsx')

def clean_text(content):
    if isinstance(content, str):
        # Menghapus newline karakter
        content = content.replace('\n', ' ')
        
        # Menghapus spasi berlebihan
        content = re.sub(r'\s+', ' ', content)

        # Menghapus emoji
        content = re.sub(r':[^:]+:', '', content)

        # Menghapus hashtag
        content = re.sub(r'#\w+', '', content)

        # Menghapus mention
        content = re.sub(r'@[A-Za-z0-9]+', '', content)

        # Menghapus URL
        content = re.sub(r'https?://[A-Za-z0-9./]+', '', content)

        # Menghapus karakter selain huruf dan spasi, menambahkan spasi di sekitar
        content = re.sub(r'[^A-Za-z\s]', ' ', content)

        # Menghapus tanggal, angka, dan datetime
        content = re.sub(r'\b\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}\b', ' ', content)  # Menghapus tanggal
        content = re.sub(r'\b\d+\b', ' ', content)  # Menghapus angka
        content = re.sub(r'\b\d{1,2}:\d{1,2}(:\d{1,2})?\b', ' ', content)  # Menghapus format waktu
        
        # Menghilangkan huruf yang berlebihan (contoh: "baguuuus" -> "bagus")
        content = re.sub(r'(.)\1{2,}', r'\1', content)
        
        # Menghapus spasi berlebihan setelah pembersihan
        content = re.sub(r'\s+', ' ', content).strip()

        return content
    else:
        return ""

# Menggunakan fungsi clean_text pada DataFrame
tqdm.pandas()
data['Clean_Review'] = data['snippet'].progress_apply(lambda x: clean_text(x))

# Menghapus baris yang kosong
data['Clean_Review'] = data['Clean_Review'].apply(lambda x: np.nan if pd.isnull(x) or x.strip() == '' else x)

# Menghapus baris dengan nilai yang hilang pada kolom penting
data.dropna(subset=['user_name', 'date', 'snippet', 'Clean_Review'], inplace=True)

# Menyimpan hasil ke file Excel
data.to_excel('clean_master_data_cleansing.xlsx', index=False)

# Menampilkan hasil pembersihan
print(data[['user_name', 'date', 'snippet', 'Clean_Review']].head())


100%|██████████| 1048/1048 [00:00<00:00, 14415.31it/s]


            user_name          date  \
0       Lucky Rivanto  7 months ago   
1  Novita Catur putri   2 years ago   
2       aditiya risky   2 years ago   
3          Fita Julia   2 years ago   
4      M.choirul arif  6 months ago   

                                             snippet  \
0                                                 Ok   
1                                             Nice 🤗   
2                              very friendly service   
3                                        Very good♥️   
4  Harga lmyan terjangkau, tempatnya bersih ada p...   

                                        Clean_Review  
0                                                 Ok  
1                                               Nice  
2                              very friendly service  
3                                          Very good  
4  Harga lmyan terjangkau tempatnya bersih ada pl...  


In [32]:
from googletrans import Translator
from langdetect import detect
import pandas as pd
import re

translator = Translator()

# Fungsi untuk mendeteksi bahasa dan menerjemahkan hanya teks yang bukan bahasa Indonesia
def translate_review(text):
    try:
        # Pisahkan kalimat yang mungkin dalam bahasa Inggris atau campuran
        sentences = re.split(r'(?<=[.!?]) +', text)  # Memisahkan kalimat berdasarkan tanda baca
        translated_sentences = []

        for sentence in sentences:
            try:
                # Deteksi bahasa dari kalimat
                lang = detect(sentence)
            except:
                # Jika gagal mendeteksi, asumsi bahasa Indonesia
                lang = 'id'
            
            if lang == 'en':  # Jika terdeteksi bahasa Inggris, terjemahkan ke Indonesia
                translated = translator.translate(sentence, src='en', dest='id')
                translated_sentences.append(translated.text.lower())  # Convert to lowercase
            else:
                translated_sentences.append(sentence.lower())  # Kembalikan kalimat asli dalam lowercase

        return ' '.join(translated_sentences)  # Gabungkan kembali kalimat
    except Exception as e:
        return text.lower()  # Jika terjadi error, kembalikan teks asli dalam lowercase

# Mengimpor dataset
data = pd.read_excel('clean_master_data_cleansing.xlsx')

# Melakukan terjemahan menggunakan Google Translate hanya untuk kalimat bahasa Inggris
data['Translate_Review'] = data['Clean_Review'].apply(translate_review)

# Menyimpan data yang telah diterjemahkan ke file Excel yang sama
data.to_excel('clean_master_data_translate.xlsx', index=False)

# Menampilkan beberapa baris pertama data
print(data[['Clean_Review', 'Translate_Review']].head())

                                        Clean_Review  \
0                                                 Ok   
1                                               Nice   
2                              very friendly service   
3                                          Very good   
4  Harga lmyan terjangkau tempatnya bersih ada pl...   

                                    Translate_Review  
0                                                 ok  
1                                               nice  
2                              very friendly service  
3                                          very good  
4  harga lmyan terjangkau tempatnya bersih ada pl...  


# Tokenization

In [33]:
import pandas as pd
import re

# Memuat dataset
data = pd.read_excel('clean_master_data_translate.xlsx')

# Fungsi untuk melakukan tokenisasi menggunakan regex
def tokenize(text):
    if isinstance(text, str):  # Pastikan hanya memproses string
        # Tokenisasi dengan regex untuk menangkap kata-kata dan angka
        return re.findall(r'\b\w+\b', text.lower())  # Menambahkan .lower() untuk konsistensi
    else:
        return []  # Jika bukan string (misalnya NaN), kembalikan list kosong

# Memastikan kolom 'Corrected_Review' ada sebelum menerapkan tokenisasi
if 'Translate_Review' in data.columns:
    # Menerapkan tokenisasi pada kolom 'Translate_Review'
    data['Tokens_Review'] = data['Translate_Review'].apply(tokenize)

    # Menyimpan data yang telah diubah ke file Excel
    data.to_excel('clean_master_data_token.xlsx', index=False)

    # Tampilkan hasil tokenisasi untuk memverifikasi
    print(data[['Translate_Review', 'Tokens_Review']].head())
else:
    print("Kolom 'Translate_Review' tidak ditemukan dalam dataset.")


                                    Translate_Review  \
0                                                 ok   
1                                               nice   
2                              very friendly service   
3                                          very good   
4  harga lmyan terjangkau tempatnya bersih ada pl...   

                                       Tokens_Review  
0                                               [ok]  
1                                             [nice]  
2                          [very, friendly, service]  
3                                       [very, good]  
4  [harga, lmyan, terjangkau, tempatnya, bersih, ...  


# Normalization

In [34]:
import pandas as pd
import re

# Load normalization words from GitHub raw CSV file
url = 'https://raw.githubusercontent.com/adeariniputri/text-preprocesing/master/slang.csv'
normalization_df = pd.read_csv(url)
WORDS = dict(zip(normalization_df['slang'].str.lower(), normalization_df['formal']))

# Remove repeated characters and normalize text
def remove_repeated_chars(word):
    return re.sub(r'(.)\1{2,}', r'\1', word)

def correct_text(tokens):
    corrected_words = []
    for word in tokens:
        normalized_word = remove_repeated_chars(word.lower())
        corrected_word = WORDS.get(normalized_word, normalized_word)  # Get formal equivalent
        print(f"Normalized: {normalized_word} -> Corrected: {corrected_word}")  # Debugging
        corrected_words.append(corrected_word)
    return corrected_words

# Load dataset
data = pd.read_excel('clean_master_data_token.xlsx')

# Apply corrections on 'Tokens_Review' column
data['Corrected_Review'] = data['Tokens_Review'].apply(lambda x: correct_text(eval(x)) if pd.notna(x) else [])

# Save to Excel
data.to_excel('clean_master_data_corrected.xlsx', index=False)

# Print sample for verification
print(data[['Tokens_Review', 'Corrected_Review']].head())


Normalized: ok -> Corrected: ok
Normalized: nice -> Corrected: baik
Normalized: very -> Corrected: sangat
Normalized: friendly -> Corrected: friendly
Normalized: service -> Corrected: layan
Normalized: very -> Corrected: sangat
Normalized: good -> Corrected: good
Normalized: harga -> Corrected: harga
Normalized: lmyan -> Corrected: lmyan
Normalized: terjangkau -> Corrected: terjangkau
Normalized: tempatnya -> Corrected: tempatnya
Normalized: bersih -> Corrected: bersih
Normalized: ada -> Corrected: ada
Normalized: playground -> Corrected: playground
Normalized: untuk -> Corrected: untuk
Normalized: anak -> Corrected: anak
Normalized: pelayanan -> Corrected: pelayanan
Normalized: ramah -> Corrected: ramah
Normalized: n -> Corrected: n
Normalized: sat -> Corrected: sat
Normalized: set -> Corrected: setengah
Normalized: tempat -> Corrected: tempat
Normalized: parkir -> Corrected: parkir
Normalized: lumayan -> Corrected: lumayan
Normalized: luas -> Corrected: luas
Normalized: tempatnya -> 

In [35]:
data.head()

Unnamed: 0,Source.Name,user_name,rating,snippet,date,iso_date,iso_date_of_last_edit,Clean_Review,Translate_Review,Tokens_Review,Corrected_Review
0,reviews_bogo.xlsx,Lucky Rivanto,5,Ok,7 months ago,2024-03-18 09:57:31,2024-03-18 09:57:31,Ok,ok,['ok'],[ok]
1,reviews_bogo.xlsx,Novita Catur putri,5,Nice 🤗,2 years ago,2022-07-07 11:04:13,2022-07-07 11:04:13,Nice,nice,['nice'],[baik]
2,reviews_bogo.xlsx,aditiya risky,5,very friendly service,2 years ago,2022-09-25 20:13:30,2022-09-25 20:13:30,very friendly service,very friendly service,"['very', 'friendly', 'service']","[sangat, friendly, layan]"
3,reviews_bogo.xlsx,Fita Julia,5,Very good♥️,2 years ago,2022-07-07 12:59:32,2022-07-07 12:59:32,Very good,very good,"['very', 'good']","[sangat, good]"
4,reviews_bogo.xlsx,M.choirul arif,5,"Harga lmyan terjangkau, tempatnya bersih ada p...",6 months ago,2024-05-01 17:00:28,2024-05-01 17:00:28,Harga lmyan terjangkau tempatnya bersih ada pl...,harga lmyan terjangkau tempatnya bersih ada pl...,"['harga', 'lmyan', 'terjangkau', 'tempatnya', ...","[harga, lmyan, terjangkau, tempatnya, bersih, ..."


# Stopwords

In [36]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
import ast
import requests

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Load additional stopwords from GitHub
url = 'https://raw.githubusercontent.com/nolimitid/nolimit-kamus/master/indonesian-stopwords-complete.txt'
response = requests.get(url)
additional_stopwords = set(response.text.splitlines())

# Load data from Excel file
data = pd.read_excel('clean_master_data_corrected.xlsx')

# Combine NLTK's stopwords and additional stopwords
stop_words = set(stopwords.words('indonesian'))
stop_words.update(additional_stopwords)  # Add additional stopwords

# Define words to keep
keep_words = {'kurang'}  # Add more words as needed

# Function to remove stopwords from the Corrected_Review column
def remove_stopwords(Corrected):
    token_list = ast.literal_eval(Corrected) if isinstance(Corrected, str) else Corrected
    return [word for word in token_list if word.lower() not in stop_words or word.lower() in keep_words]

# Apply stopword removal
tqdm.pandas()  # Initialize tqdm
data['Stopwords_Review'] = data['Corrected_Review'].progress_apply(remove_stopwords)

# Drop rows with NaN in Stopwords_Review
data.dropna(subset=['Stopwords_Review'], inplace=True)

# Save the results to a new file
data.to_excel('clean_master_data_no_stopwords.xlsx', index=False)

# Display sample results
print(data.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 760/760 [00:00<00:00, 6987.32it/s]


         Source.Name           user_name  rating  \
0  reviews_bogo.xlsx       Lucky Rivanto       5   
1  reviews_bogo.xlsx  Novita Catur putri       5   
2  reviews_bogo.xlsx       aditiya risky       5   
3  reviews_bogo.xlsx          Fita Julia       5   
4  reviews_bogo.xlsx      M.choirul arif       5   

                                             snippet          date  \
0                                                 Ok  7 months ago   
1                                             Nice 🤗   2 years ago   
2                              very friendly service   2 years ago   
3                                        Very good♥️   2 years ago   
4  Harga lmyan terjangkau, tempatnya bersih ada p...  6 months ago   

             iso_date iso_date_of_last_edit  \
0 2024-03-18 09:57:31   2024-03-18 09:57:31   
1 2022-07-07 11:04:13   2022-07-07 11:04:13   
2 2022-09-25 20:13:30   2022-09-25 20:13:30   
3 2022-07-07 12:59:32   2022-07-07 12:59:32   
4 2024-05-01 17:00:28   2024-05

In [37]:
data.head()

Unnamed: 0,Source.Name,user_name,rating,snippet,date,iso_date,iso_date_of_last_edit,Clean_Review,Translate_Review,Tokens_Review,Corrected_Review,Stopwords_Review
0,reviews_bogo.xlsx,Lucky Rivanto,5,Ok,7 months ago,2024-03-18 09:57:31,2024-03-18 09:57:31,Ok,ok,['ok'],['ok'],[ok]
1,reviews_bogo.xlsx,Novita Catur putri,5,Nice 🤗,2 years ago,2022-07-07 11:04:13,2022-07-07 11:04:13,Nice,nice,['nice'],['baik'],[]
2,reviews_bogo.xlsx,aditiya risky,5,very friendly service,2 years ago,2022-09-25 20:13:30,2022-09-25 20:13:30,very friendly service,very friendly service,"['very', 'friendly', 'service']","['sangat', 'friendly', 'layan']","[friendly, layan]"
3,reviews_bogo.xlsx,Fita Julia,5,Very good♥️,2 years ago,2022-07-07 12:59:32,2022-07-07 12:59:32,Very good,very good,"['very', 'good']","['sangat', 'good']",[good]
4,reviews_bogo.xlsx,M.choirul arif,5,"Harga lmyan terjangkau, tempatnya bersih ada p...",6 months ago,2024-05-01 17:00:28,2024-05-01 17:00:28,Harga lmyan terjangkau tempatnya bersih ada pl...,harga lmyan terjangkau tempatnya bersih ada pl...,"['harga', 'lmyan', 'terjangkau', 'tempatnya', ...","['harga', 'lmyan', 'terjangkau', 'tempatnya', ...","[harga, lmyan, terjangkau, tempatnya, bersih, ..."


# Stemming

In [38]:
# Mengimpor library
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Membuat stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Mengimpor dataset
data = pd.read_excel('clean_master_data_no_stopwords.xlsx')

# Fungsi untuk stemming
def stemming(text):
    if isinstance(text, str):  # Cek apakah tipe data adalah string
        return stemmer.stem(text)
    return text  # Kembalikan nilai asli jika bukan string (misal NaN)

# Menerapkan stemming pada kolom 'Stopword_Review'
data['Stemming_Review'] = data['Stopwords_Review'].apply(stemming)

# Menyimpan data yang telah diubah ke file Excel yang sama
data.to_excel('clean_master_data_stemming.xlsx', index=False)

# Tampilkan hasil
data.head()

Unnamed: 0,Source.Name,user_name,rating,snippet,date,iso_date,iso_date_of_last_edit,Clean_Review,Translate_Review,Tokens_Review,Corrected_Review,Stopwords_Review,Stemming_Review
0,reviews_bogo.xlsx,Lucky Rivanto,5,Ok,7 months ago,2024-03-18 09:57:31,2024-03-18 09:57:31,Ok,ok,['ok'],['ok'],['ok'],ok
1,reviews_bogo.xlsx,Novita Catur putri,5,Nice 🤗,2 years ago,2022-07-07 11:04:13,2022-07-07 11:04:13,Nice,nice,['nice'],['baik'],[],
2,reviews_bogo.xlsx,aditiya risky,5,very friendly service,2 years ago,2022-09-25 20:13:30,2022-09-25 20:13:30,very friendly service,very friendly service,"['very', 'friendly', 'service']","['sangat', 'friendly', 'layan']","['friendly', 'layan']",friendly layan
3,reviews_bogo.xlsx,Fita Julia,5,Very good♥️,2 years ago,2022-07-07 12:59:32,2022-07-07 12:59:32,Very good,very good,"['very', 'good']","['sangat', 'good']",['good'],good
4,reviews_bogo.xlsx,M.choirul arif,5,"Harga lmyan terjangkau, tempatnya bersih ada p...",6 months ago,2024-05-01 17:00:28,2024-05-01 17:00:28,Harga lmyan terjangkau tempatnya bersih ada pl...,harga lmyan terjangkau tempatnya bersih ada pl...,"['harga', 'lmyan', 'terjangkau', 'tempatnya', ...","['harga', 'lmyan', 'terjangkau', 'tempatnya', ...","['harga', 'lmyan', 'terjangkau', 'tempatnya', ...",harga lmyan jangkau tempat bersih playground a...


Filtering 
<!-- masih error -->

In [39]:
# from nltk.corpus import stopwords

# # ----------------------- get stopword from NLTK stopword -------------------------------
# # get stopword indonesia
# list_stopwords = stopwords.words('indonesian')


# # ---------------------------- manualy add stopword  ------------------------------------
# # append additional stopword
# list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", 'klo', 
#                        'kalo', 'amp', 'biar', 'bikin', 'bilang', 
#                        'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
#                        'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
#                        'jd', 'jgn', 'sdh', 'aja', 
#                        'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
#                        '&amp', 'yah', 'sdgkan', 'sdg', 'emg', 'sm', 'pls', 'mlu', 'ken',
#                        'allah', 'brb', 'btw', 'b/c', 'cod', 'cmiiw', 'fyi',
#                        'gg', 'ggwp', 'idk', 'ikr', 'lol', 'ootd', 'lmao', 'oot',
#                        'pap', 'otw', 'tfl', 'vc', 'ygy'])

# # ----------------------- add stopword from txt file ------------------------------------
# # read txt stopword using pandas
# txt_stopword = pd.read_csv("stopwordbahasa.txt", names= ["stopwords"], header = None)

# # convert stopword string to list & append additional stopword
# list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))

# # ---------------------------------------------------------------------------------------

# # convert list to dictionary
# list_stopwords = set(list_stopwords)


# #remove stopword pada list token
# def stopwords_removal(words):
#     return [word for word in words if word not in list_stopwords]

# data['Text Filtering'] = data['Stemming_Review'].apply(stopwords_removal) 


# print(data['Text Filtering'].head())

View Preprocessing Results

In [40]:
data.head()

Unnamed: 0,Source.Name,user_name,rating,snippet,date,iso_date,iso_date_of_last_edit,Clean_Review,Translate_Review,Tokens_Review,Corrected_Review,Stopwords_Review,Stemming_Review
0,reviews_bogo.xlsx,Lucky Rivanto,5,Ok,7 months ago,2024-03-18 09:57:31,2024-03-18 09:57:31,Ok,ok,['ok'],['ok'],['ok'],ok
1,reviews_bogo.xlsx,Novita Catur putri,5,Nice 🤗,2 years ago,2022-07-07 11:04:13,2022-07-07 11:04:13,Nice,nice,['nice'],['baik'],[],
2,reviews_bogo.xlsx,aditiya risky,5,very friendly service,2 years ago,2022-09-25 20:13:30,2022-09-25 20:13:30,very friendly service,very friendly service,"['very', 'friendly', 'service']","['sangat', 'friendly', 'layan']","['friendly', 'layan']",friendly layan
3,reviews_bogo.xlsx,Fita Julia,5,Very good♥️,2 years ago,2022-07-07 12:59:32,2022-07-07 12:59:32,Very good,very good,"['very', 'good']","['sangat', 'good']",['good'],good
4,reviews_bogo.xlsx,M.choirul arif,5,"Harga lmyan terjangkau, tempatnya bersih ada p...",6 months ago,2024-05-01 17:00:28,2024-05-01 17:00:28,Harga lmyan terjangkau tempatnya bersih ada pl...,harga lmyan terjangkau tempatnya bersih ada pl...,"['harga', 'lmyan', 'terjangkau', 'tempatnya', ...","['harga', 'lmyan', 'terjangkau', 'tempatnya', ...","['harga', 'lmyan', 'terjangkau', 'tempatnya', ...",harga lmyan jangkau tempat bersih playground a...


Save Preprocessing Results

In [41]:
data.to_csv("preprocessing_5_results", index=False)

Delete Unnecessary Data

In [42]:
# df= data.drop(columns=['Text', 'Text Case Folding', 'Text Tokenizing', 'Text Normalization', 'Text Stemming'])
# df.head()
df = data.filter(items=['Stemming_Review'])
df.head()

Unnamed: 0,Stemming_Review
0,ok
1,
2,friendly layan
3,good
4,harga lmyan jangkau tempat bersih playground a...


# Labeling Using Inset Lexicon

In [None]:
lexicon_positive = pd.read_excel('kamus_positive.xlsx')
lexicon_positive_dict = {}
for index, row in lexicon_positive.iterrows():
    if row[0] not in lexicon_positive_dict:
        lexicon_positive_dict[row[0]] = row[1]

lexicon_negative = pd.read_excel('kamus_negative.xlsx')
lexicon_negative_dict = {}
for index, row in lexicon_negative.iterrows():
    if row[0] not in lexicon_negative_dict:
        lexicon_negative_dict[row[0]] = row[1]

def sentiment_analysis_lexicon_indonesia(text):
    score = 0
    for word in text:
        if (word in lexicon_positive_dict):
            score = score + lexicon_positive_dict[word]
    for word in text:
        if (word in lexicon_negative_dict):
            score = score + lexicon_negative_dict[word]
    sentimen=''
    if (score > 0):
        sentimen = 'Positive'
    elif (score < 0):
        sentimen = 'Negative'
    else:
        sentimen = 'Neutral'
    return score, sentimen

results = df['Stemming_Review'].apply(sentiment_analysis_lexicon_indonesia)
results = list(zip(*results))
df['Polarity Score'] = results[0]
df['Indonesia Sentiment'] = results[1]
#data['sentimen'] = results[1]
#data

df[['Stemming_Review','Polarity Score','Indonesia Sentiment']]

  if row[0] not in lexicon_positive_dict:
  lexicon_positive_dict[row[0]] = row[1]
  if row[0] not in lexicon_negative_dict:
  lexicon_negative_dict[row[0]] = row[1]


KeyError: "['Corrected_Review'] not in index"

## Sentiment Cumulative Results

In [None]:
inset_counts = df['Indonesia Sentiment'].value_counts()
inset_counts

Indonesia Sentiment
Neutral    760
Name: count, dtype: int64

## Save Labeling Results

In [None]:
# aaa