In [1]:
import numpy as np
import pandas as pd
import re
import string
from stop_words import get_stop_words
stop_words = get_stop_words('indonesian')

In [2]:
df = pd.read_csv("dataset.csv")
df.head()
df = df.dropna()

In [3]:
key_norm = pd.read_csv('key_norm.csv')

In [4]:
def preprocess_text(s):
    s = s.strip()
    s = str.lower(s)

    # URL separator
    s = s.replace('.', ' ')
    
    # menghapus simbol
    s = s.translate(str.maketrans('', '', string.punctuation)) 
    
    # mengganti angka dengan string kosong
    s = ' '.join(re.sub(r"[0-9]+", '', word) for word in s.split())

    # mengubah setiap kata menjadi bentuk kata bakunya
    s = ' '.join([key_norm[key_norm['singkat'] == word]['hasil'].values[0] if (key_norm["singkat"]==word).any() else word for word in s.split()])
    s = str.lower(s)
    
    # menghapus pengulangan kata
    s = s.replace('-', ' ')
    
    # menghilangkan kata dengan panjang 1
    s = ' '.join(word for word in s.split() if len(word) >= 2)
    
    # menghilangkan stopwords
    s = ' '.join(word for word in s.split() if word not in stop_words)
    
    return s

In [5]:
import time

t1 = time.time()
df.loc[:,"Teks"] = df.Teks.apply(lambda s: preprocess_text(s))
print(df.shape)
df = df.drop(df[df.Teks == ""].index)
print(df.shape)
df = df[['Teks', 'label']]
print('cleaning dataset elapsed time:', time.time() - t1)
df.head()

(7273, 2)
(7256, 2)
cleaning dataset elapsed time: 77.33943581581116


Unnamed: 0,Teks,label
0,dek baikan,0
1,ikuti seminar inspiratif cara mudah sukses bis...,1
2,terpilih pemenang cek rupiah juta kuota flash ...,1
3,punya masalah keuangan cukup jaminkan bpkb mob...,1
4,diskn tgel jkerbet bit lysngroy pasaran terbai...,1


In [6]:
df['label'] = df['label'].astype('int64')
df.dtypes

Teks     object
label     int64
dtype: object

### Save clean_dataset_without_stemming

In [7]:
df.to_csv("clean_dataset.csv", index=False)

### Stemming

In [8]:
import time
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
t1 = time.time()
stemmer = StemmerFactory().create_stemmer()
df['Teks'] = df.apply(lambda row: stemmer.stem(row['Teks']), axis=1)
print('Stemming is done in', time.time() - t1)
print()

Stemming is done in 1188.726644039154



### Save clean_dataset_with_stemming

In [9]:
df.to_csv("clean_dataset_with_stemming.csv", index=False)

In [10]:
df2 = pd.read_csv('clean_dataset.csv')

In [11]:
df2.dtypes

Teks     object
label     int64
dtype: object

In [12]:
import time
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
t1 = time.time()
stemmer = StemmerFactory().create_stemmer()
stemmer.stem('1390129 safsafasf     afoasfjpasojf poafjaspo jasopfj aspo  \n safoasfasok fsaofjsaofjasofja aspjaopjasfopasjfpoasj pojop asjfpo ajsfopa jsapof jsapofja spofsajf posajfpo asjfpoas jfopsa fjsapof japsofj aspofjaso pfjaspofjs aposajpo saj sjaposj poasjf poasjfsopa jaspo jpoj opj apoj poaj aposjpo ajpo jaopj opaj poaop ajpo ajopaj poaj poaj poajop aso')

'1390129 safsafasf afoasfjpasojf poafjaspo jasopfj aspo safoasfasok fsaofjsaofjasofja aspjaopjasfopasjfpoasj pojop asjfpo ajsfopa jsapof jsapofja spofsajf posajfpo asjfpoas jfopsa fjsapof japsofj aspofjaso pfjaspofjs aposajpo saj sjaposj poasjf poasjfsopa jaspo jpoj opj apoj poaj aposjpo ajpo jaopj opaj poaop ajpo ajopaj poaj poaj poajop aso'