In [18]:
import nltk
import time

import pandas as pd
import numpy as np

from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [9]:
# Init NLTK
# Download only the NLTK resources this notebook relies on instead of opening
# the interactive downloader, so the cell also works unattended
# (e.g. under "Restart Kernel -> Run All").
#   - "punkt" / "punkt_tab": tokenizer models loaded by word_tokenize
#     (which of the two is needed depends on the installed NLTK release)
#   - "stopwords": the stopword lists read through nltk.corpus.stopwords
for nama_resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(nama_resource, quiet=True)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [2]:
# Load dataset.csv into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer="dataset.csv")

# Report, column by column, whether any value is missing
print(df.isnull().any())

Unnamed: 0                 False
clothing_id                False
age                        False
title                       True
review_text                 True
rating                     False
recommended_ind            False
positive_feedback_count    False
division_name               True
department_name             True
class_name                  True
dtype: bool


In [3]:
# Cleaning
# Only the "review_text" column is used in the following steps, so only that
# column is cleaned of NaN: every row whose "review_text" is NaN is dropped.
# NaN values in the other columns are left as they are.
df.dropna(subset=["review_text"], axis="index", inplace=True)

# Re-check which columns still contain missing values
print(df.isnull().any())

Unnamed: 0                 False
clothing_id                False
age                        False
title                       True
review_text                False
rating                     False
recommended_ind            False
positive_feedback_count    False
division_name               True
department_name             True
class_name                  True
dtype: bool


In [4]:
# Case Folding
# Convert every letter in the "review_text" column to lower case
teks_huruf_kecil = df["review_text"].str.lower()
df["review_text"] = teks_huruf_kecil

print(df["review_text"])

0        absolutely wonderful - silky and sexy and comf...
1        love this dress!  it's sooo pretty.  i happene...
2        i had such high hopes for this dress and reall...
3        i love, love, love this jumpsuit. it's fun, fl...
4        this shirt is very flattering to all due to th...
                               ...                        
23481    i was very happy to snag this dress at such a ...
23482    it reminds me of maternity clothes. soft, stre...
23483    this fit well, but the top was very see throug...
23484    i bought this dress for a wedding i have this ...
23485    this dress in a lovely platinum is feminine an...
Name: review_text, Length: 22641, dtype: object


In [5]:
# Tokenizing
# Word tokens are created only after the unimportant characters have been
# removed from the text of each document.

In [6]:
import string
import re

def hapus_karakter_spesial(text):
    """Remove every non-ASCII character from ``text``.

    Characters that cannot be encoded as ASCII are dropped outright (error
    handler ``'ignore'``) rather than being replaced by a ``?`` placeholder,
    so no artificial punctuation is injected into the text.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` containing only its ASCII characters.
    """
    return text.encode('ascii', 'ignore').decode('ascii')

def hapus_karakter_angka(text):
    """Return ``text`` with every run of digit characters deleted."""
    pola_angka = re.compile(r'\d+')
    return pola_angka.sub("", text)

def hapus_tanda_baca(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translation table that maps a code point to None deletes that character.
    tabel_hapus = {ord(tanda): None for tanda in string.punctuation}
    return text.translate(tabel_hapus)

def hapus_whitespace(text):
    """Collapse every run of whitespace in ``text`` into a single space."""
    pola_spasi = re.compile(r'\s+')
    return pola_spasi.sub(' ', text)

def hapus_whitespace_kiri_kanan(text):
    """Return ``text`` without its leading and trailing whitespace."""
    tanpa_spasi_kiri = text.lstrip()
    return tanpa_spasi_kiri.rstrip()

In [7]:
# Remove the unimportant characters.
# The cleaners run in a fixed order: non-ASCII characters, digits, punctuation,
# repeated whitespace, then leading/trailing whitespace.
df["review_text"] = (
    df["review_text"]
    .apply(hapus_karakter_spesial)
    .apply(hapus_karakter_angka)
    .apply(hapus_tanda_baca)
    .apply(hapus_whitespace)
    .apply(hapus_whitespace_kiri_kanan)
)

# Build the word tokens from the cleaned text
df['review_text_tokens'] = df['review_text'].apply(word_tokenize)

print(df["review_text"])

0        absolutely wonderful silky and sexy and comfor...
1        love this dress its sooo pretty i happened to ...
2        i had such high hopes for this dress and reall...
3        i love love love this jumpsuit its fun flirty ...
4        this shirt is very flattering to all due to th...
                               ...                        
23481    i was very happy to snag this dress at such a ...
23482    it reminds me of maternity clothes soft stretc...
23483    this fit well but the top was very see through...
23484    i bought this dress for a wedding i have this ...
23485    this dress in a lovely platinum is feminine an...
Name: review_text, Length: 22641, dtype: object


In [17]:
# Stopwords
# This step removes the stopwords from the token lists.
# The stopwords come from the NLTK library.

# Fetch NLTK's English stopword list and keep it as a set
list_stopwords = set(stopwords.words('english'))


# Eliminate every token that is a stopword
df['review_text_tokens_wsw'] = df['review_text_tokens'].apply(
    lambda tokens: [
        token
        for token in tokens
        if token not in list_stopwords
    ]
)

print(df['review_text_tokens_wsw'].head())

0    [absolutely, wonderful, silky, sexy, comfortable]
1    [love, dress, sooo, pretty, happened, find, st...
2    [high, hopes, dress, really, wanted, work, ini...
3    [love, love, love, jumpsuit, fun, flirty, fabu...
4    [shirt, flattering, due, adjustable, front, ti...
Name: review_text_tokens_wsw, dtype: object


In [20]:
# Stemming
# This step reduces each word to its base form (stem)
# using NLTK's Snowball stemmer for English.

# Time the stemming step. time.perf_counter() is used instead of time.time()
# because it is a monotonic clock intended for measuring short durations, so
# the reported value cannot be skewed by system clock adjustments.
start_time = time.perf_counter()

stemmer = SnowballStemmer(language="english")
df["review_text_tokens_stemmed"] = df["review_text_tokens_wsw"].apply(
    lambda words: [stemmer.stem(word) for word in words]
)

print("--- %s seconds ---" % (time.perf_counter() - start_time))
print(df["review_text_tokens_stemmed"])

--- 11.880393505096436 seconds ---
0                  [absolut, wonder, silki, sexi, comfort]
1        [love, dress, sooo, pretti, happen, find, stor...
2        [high, hope, dress, realli, want, work, initi,...
3        [love, love, love, jumpsuit, fun, flirti, fabu...
4        [shirt, flatter, due, adjust, front, tie, perf...
                               ...                        
23481    [happi, snag, dress, great, price, easi, slip,...
23482    [remind, matern, cloth, soft, stretchi, shini,...
23483    [fit, well, top, see, never, would, work, im, ...
23484    [bought, dress, wed, summer, cute, unfortun, f...
23485    [dress, love, platinum, feminin, fit, perfect,...
Name: review_text_tokens_stemmed, Length: 22641, dtype: object


In [21]:
# Save the preprocessing result to a CSV file
# (called without index=..., so pandas' default handling of the index applies)
df.to_csv(path_or_buf="text_preprocessing.csv")