# Preparation

Install and import the required libraries

In [1]:
!pip install Sastrawi
import re
import pandas as pd
import pickle
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from google.colab import drive
drive.mount('/content/drive')

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Pre-Processing Code

In [2]:
# Emoji Processing
# Emoji groups, one compiled alternation per sentiment class.
empos = re.compile(r"(❤)|(♥)|(💕)|(🤍)|(💯)|(🎉)|(👍)|(👌)|(👏)|(😍)|(😘)|(😊)|(😂)|(🤣)|(😁)|(😉)|(🤩)|(🤗)|(🙂)|(😆)|(🤪)|(😀)")
emneg = re.compile(r"(😭)|(😩)|(😒)|(😔)|(😡)|(😴)|(🔫)|(😞)|(😪)|(😫)|(🥲)|(💀)|(😟)|(😢)|(😑)|(🙄)|(😤)")
emnet = re.compile(r"(☯)|(✨)|(★)|(█)|(🔥)|(♫)|(©)|(👀)|(🐓)|(🍀)|(😳)|(🙏🏼)|(😬)|(😎)|(👋)|(🐵)|(🐒)|(🐴)|(💪)")
def emoji(txt):
    """Replace every known emoji in ``txt`` with a space-padded sentiment word.

    Emoji matched by ``empos`` become " positif ", those matched by ``emneg``
    become " negatif " and those matched by ``emnet`` become " netral ".
    Text without any listed emoji is returned unchanged.
    """
    # Substitutions run in a fixed order: positive, negative, neutral.
    replacements = (
        (empos, " positif "),
        (emneg, " negatif "),
        (emnet, " netral "),
    )
    for pattern, label in replacements:
        txt = pattern.sub(label, txt)
    return txt

# Remove Unnecessary Character
# Matches an "@mention" or an http(s) URL up to the next whitespace.
mark1 = re.compile(r"(?:\@|https?\://)\S+")
# Matches a single digit, a single non-word/non-space symbol, or an underscore.
mark2 = re.compile(r"[0-9]|[^\w\s]|_")
def remuch(txt):
    """Blank out mentions, links, digits and symbols in ``txt``.

    Each match is replaced by one space, so the result may contain runs of
    spaces; no whitespace collapsing is done here.
    """
    # Links/mentions must go first, otherwise mark2 would break them into pieces.
    without_links = mark1.sub(" ", txt)
    return mark2.sub(" ", without_links)

# Text Normalization
def load(filename):
    """Read a "key = value" mapping file into a dictionary.

    Each non-blank line must have the form ``<key> = <value>``; the text
    before the first " = " becomes the key and the remainder of the line
    (without its line terminator) becomes the value.

    Args:
        filename: Path of the text file to read.

    Returns:
        dict mapping every key to its value, later lines overriding earlier
        ones with the same key.

    Raises:
        ValueError: If a non-blank line does not contain " = ".
    """
    dic = {}
    # The context manager closes the file even when a malformed line raises.
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            # Strip only the terminator so a final line without "\n" keeps
            # its last character (slicing with [:-1] would cut it off).
            line = line.rstrip("\n")
            if not line.strip():
                continue  # tolerate blank lines instead of failing to unpack
            # Split once so a value may itself contain " = ".
            a, b = line.split(" = ", 1)
            dic[a] = b
    return dic

# Lookup table built once from the abbreviations file on Drive; norm() consults
# it for every token. This relies on the dict-returning load() defined just above.
abfile = load("/content/drive/MyDrive/Skripsi/Library/abbreviations_rev2.txt")
def norm(text):
    """Replace every whitespace-separated token that is a key of ``abfile``.

    Tokens not present in ``abfile`` are kept as they are. The tokens are
    re-joined with single spaces.
    """
    normalized = (abfile[token] if token in abfile else token for token in text.split())
    return " ".join(normalized)

# Stopwords Removal
def hilang_repetisi(text):
    """Collapse consecutive repeated characters inside every word of ``text``.

    The text is split on whitespace, each run of identical adjacent
    characters in a word is reduced to a single character, and the words are
    joined back with single spaces (e.g. "baguuus" -> "bagus").
    """
    squeezed_words = []
    for token in text.split():
        kept = token[0]  # split() never yields an empty token
        for ch in token[1:]:
            # Append only when the character differs from the last one kept.
            if ch != kept[-1]:
                kept += ch
        squeezed_words.append(kept)
    return " ".join(squeezed_words)

def load(filename):
    """Read a one-entry-per-line word list into a set.

    Every line (without its line terminator) is passed through
    ``hilang_repetisi`` before being added, so the stored entries have
    consecutive repeated characters collapsed.

    NOTE: this definition replaces the earlier ``load`` that returns a
    dictionary; anything that needs the dictionary form must have been
    loaded before this point.

    Args:
        filename: Path of the text file to read.

    Returns:
        set of the processed lines.
    """
    ret = set()
    # The original wrote `f.close` without parentheses, so the file was never
    # closed; the context manager guarantees it is.
    with open(filename, "r") as f:
        for line in f:
            # Remove only the terminator; slicing with [:-1] would truncate a
            # final line that has no trailing newline.
            ret.add(hilang_repetisi(line.rstrip("\n")))
    return ret

# Stopword set built once from the file on Drive; load() has already run each
# entry through hilang_repetisi. stopwords() below tests tokens against it.
stoplist=load("/content/drive/MyDrive/Skripsi/Library/stopwords_uci.txt")
def stopwords(text):
    """Blank out every token of ``text`` that is a member of ``stoplist``.

    Removed tokens are replaced by empty strings before joining, so the
    result can contain consecutive spaces where stopwords used to be.
    """
    filtered = ("" if token in stoplist else token for token in text.split())
    return " ".join(filtered)

# Stemming
# Build the Sastrawi stemmer once. The factory is used inline instead of being
# stored under the name `stem`, which is the name of the function defined below.
stemmer = StemmerFactory().create_stemmer()

def stem(text):
    """Stem every whitespace-separated token of ``text`` with ``stemmer``.

    Args:
        text: Input string.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    return " ".join(stemmer.stem(w) for w in text.split())

In [9]:
# Preprocessing
def prep(text):
    """Run the full preprocessing pipeline on one piece of text.

    Steps, in order: case folding, emoji processing, removal of unnecessary
    characters, text normalization, stopword removal and stemming. Whitespace
    is collapsed to single spaces in the returned string.
    """
    text = text.lower()  # Case Folding
    pipeline = (
        emoji,      # Emoji Processing
        remuch,     # Remove Unnecessary Character
        norm,       # Text Normalization
        stopwords,  # Stopwords Removal
        stem,       # Stemming
    )
    for step in pipeline:
        text = step(text)
    # Collapse the runs of spaces left behind by the earlier steps.
    return " ".join(text.split())

# Quick manual check of prep() on one sample review; the commented-out line
# is an alternative input containing emoji, a hashtag, a mention and a link.
# text = '❤😒✨ 2 jam merasakan #Nuansa Alam di Punti-Kayu yg asri @jokofamily https://skola.com'
text = "Wisata yg mengasyikkan dan sejuk ditengah kota, kalau mau kesini jangan lupa bawa lotion anti nyamuk ya gaes, penjual makanan dan snack udah ada jd gk perlu repot2 nenteng dari rumah, hanya saja tiketnya agak mahal saat weekend, dan banyak wahana yg gk dipelihara oleh petugasnya 😢 …"
# The sample is overwritten with its preprocessed form before printing.
text = prep(text)
print(text)

wisata asyik sejuk tengah kota mau kesini lupa bawa lotion anti nyamuk ya gaes jual makan snack jadi tidak repot nenteng rumah tiket agak mahal saat weekend wahana tidak pelihara tugas negatif


In [10]:
# Read the raw dataset CSV from Drive
df = pd.read_csv('/content/drive/My Drive/Skripsi/Dataset/raw-dataset.csv')

# Run every entry of the "text" column through prep(), overwriting the column
df["text"] = df["text"].apply(prep)
# Show the first preprocessed rows
print(df["text"].head())

0    letsguide wisata alam ubah terang benderang ka...
1                  nuansa alam dalam kota metropolitan
2    enak kumpul santa bareng keluarga nikmat panor...
3    hutan kota lestari tidak gerogot lahan netral ...
4                                              positif
Name: text, dtype: object


In [None]:
# Persist the preprocessed text column and the label columns as pickle files.
# Context managers close each file even if pickle.dump raises.
with open('/content/drive/MyDrive/Skripsi/Library/text.pkl', 'wb') as X:
    pickle.dump(df['text'], X)  # the preprocessed review text

# Columns at positions 1..6 are taken as the labels and renumbered 0..5
# (presumably one column per aspect — confirm against the dataset schema).
y = df.iloc[:,1:7]
y.columns = range(y.shape[1])
with open('/content/drive/MyDrive/Skripsi/Library/label_data.pkl','wb') as label:
    pickle.dump(y, label)

In [11]:
# Lift the row limit so the whole preprocessed column is rendered.
# This changes the option for the rest of the session.
pd.options.display.max_rows = None
df["text"]

0       letsguide wisata alam ubah terang benderang ka...
1                     nuansa alam dalam kota metropolitan
2       enak kumpul santa bareng keluarga nikmat panor...
3       hutan kota lestari tidak gerogot lahan netral ...
4                                                 positif
5       november instagrammmebel bayank spot poto seju...
6       dewasa anak anak mobil sayang pandemi tidak li...
7       wahana masuk tiap wahana mesti pakai tiket tik...
8       bangun tiru lambang negara piramida unta asal ...
9                                            wahana bagus
10                                  lapang tenis olahraga
11      miniatur dunia logo logo kenal dunia menara ef...
12                                         monyet wkwkwwk
13      potensi jadi bagus kebun raya bogor jalan keci...
14                                          tempat renang
15                                           waterboomnya
16      punti kayu sangat senang kurang awat farm zone...
17            