In [1]:
import pandas as pd

# Preprocessing

In [2]:
import re
from tqdm import tqdm
from nlp_id.tokenizer import Tokenizer
from nlp_id.lemmatizer import Lemmatizer 
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

In [3]:
# Import kamus bahasa slang
slang_dict = pd.read_csv('https://raw.githubusercontent.com/muhammadariffaizin/sistem-temu-kembali-informasi/master/list/slang.txt', delimiter = ";")
dict_slang = dict(slang_dict.values)
print (slang_dict)

          &                                          dan
0         +                                       tambah
1         /                                         atau
2     22nya                                   dua-duanya
3        3m  mencuci tangan memakai masker menjaga jarak
4       7an                                       tujuan
...     ...                                          ...
5593  yyaaa                                           ya
5594      z                                         saja
5595     za                                         saja
5596   zama                                        zaman
5597   zonk                                        bodoh

[5598 rows x 2 columns]


In [4]:
bungs = pd.read_csv("https://raw.githubusercontent.com/muhammadariffaizin/sistem-temu-kembali-informasi/master/data/bungs.csv")
print(bungs.shape)

catatanali07 = pd.read_csv("https://raw.githubusercontent.com/muhammadariffaizin/sistem-temu-kembali-informasi/master/data/catatanali07.csv")
print(catatanali07.shape)

  exec(code_obj, self.user_global_ns, self.user_ns)


(67950, 17)
(1209, 17)


In [5]:
lemmatizer = Lemmatizer()
tokenizer = Tokenizer()
stop_factory = StopWordRemoverFactory()

data_stopword = stop_factory.get_stop_words()

In [6]:
def preprocess(s):
    s = re.sub(r'(\\x[0-9a-fA-Z]{2})', '', s)
  #Remove URL
    s = re.sub(r'https?://\S+|www\.\S+', " ", s)
  #Remove Mentions
    s = re.sub(r'@\w+',' ',s)
  #Remove Punctuation
    s= re.sub(r'[^\w\s\d]',' ',s)
  #Remove Digits
    s = re.sub(r'\d+', ' ', s)
  #Remove HTML tags
    s = re.sub('r<.*?>',' ', s)
  #Remove Hastags
    s = re.sub(r'#\w+', ' ', s)
    s = re.sub(r'[^a-zA-Z]', ' ', s) # remove symbol dan angka
    hasil=[]
    word_token = tokenizer.tokenize(s) #tokenisasi
    for word in word_token:
        word = word.strip().lower()   #case folding
        if word in dict_slang:
            word = dict_slang[word]
        if len(word) > 3:
          #word = lemmatizer.lemmatize(word) #lemmatization
          hasil.append(word)
        else:
            continue
    result_sentence = " ".join(hasil).strip() #penggabungan kata hasil pre prosesing
    #print(result_sentence)
    return result_sentence

In [7]:
data_raw_bungs = bungs
data_raw_catatanali07 = catatanali07

#Melakukan Proses Pre Prosesing pada Kolom text Dataset 
result_preprocess_bungs = []
for i, row in tqdm(data_raw_bungs.iterrows(), total=data_raw_bungs.shape[0]):
  result_preprocess_bungs.append(preprocess(row['Tweet Text']))

#Melakukan Proses Pre Prosesing pada Kolom text Dataset 
result_preprocess_catatanali07 = []
for i, row in tqdm(data_raw_catatanali07.iterrows(), total=data_raw_catatanali07.shape[0]):
  result_preprocess_catatanali07.append(preprocess(row['Tweet Text']))

100%|██████████| 67950/67950 [00:23<00:00, 2836.19it/s]
100%|██████████| 1209/1209 [00:00<00:00, 3099.86it/s]


In [8]:
data_raw_bungs['text_clean'] = result_preprocess_bungs
data_raw_catatanali07['text_clean'] = result_preprocess_catatanali07

print("Bungs : ", data_raw_bungs.shape)
print("Catatanali07 : ", data_raw_catatanali07.shape)
data_raw_bungs = data_raw_bungs[(data_raw_bungs['text_clean'].str.split(" ").str.len() > 2)]
data_raw_catatanali07 = data_raw_catatanali07[(data_raw_catatanali07['text_clean'].str.split(" ").str.len() > 2)]

print("Bungs Preprocessed : ", data_raw_bungs.shape)
print("Catatanali07 Preprocessed : ", data_raw_catatanali07.shape)

data_raw_bungs.to_csv("bungs_preprocessed.csv")
data_raw_catatanali07.to_csv("catatanali07_preprocessed.csv")

Bungs :  (67950, 18)
Catatanali07 :  (1209, 18)
Bungs Preprocessed :  (48569, 18)
Catatanali07 Preprocessed :  (734, 18)
