In [1]:
import numpy as np
import pandas as pd
import re
from stop_words import get_stop_words
# Stop-word collection for 'indonesian' from the stop_words package;
# preprocess_text reads this module-level name when filtering tokens.
stop_words = get_stop_words('indonesian')

In [2]:
# Load the raw dataset and discard every row that has a missing value.
df = pd.read_csv("dataset.csv")
df = df.dropna()
# Keep the preview as the cell's last expression: an expression in the
# middle of a cell is evaluated and silently thrown away by Jupyter.
df.head()

In [3]:
# Raw-string patterns: a plain '\w' literal is an invalid escape sequence
# that newer Python versions warn about.
_WORD_RE = re.compile(r"\w+")
_DIGITS_RE = re.compile(r"[0-9]+")


def preprocess_text(s, stopwords=None):
    """Normalise one raw message into a space-separated string of tokens.

    The text is lower-cased and split into runs of word characters
    (letters, digits, underscore), which discards punctuation. For each
    token, ASCII digits are stripped; the token is then dropped if it is
    shorter than two characters or is a stop word.

    Parameters
    ----------
    s : str
        Raw text to clean.
    stopwords : iterable of str, optional
        Words to remove. When omitted, the module-level ``stop_words``
        collection is used.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; an empty string
        when no token survives.

    Raises
    ------
    TypeError
        If ``s`` is not a string.
    """
    if not isinstance(s, str):
        raise TypeError(
            "preprocess_text expects a str, got %s" % type(s).__name__
        )

    if stopwords is None:
        stopwords = stop_words
    # Hash-based membership instead of scanning a list for every token.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = frozenset(stopwords)

    kept = []
    for token in _WORD_RE.findall(s.lower()):
        word = _DIGITS_RE.sub('', token)
        # Length is checked after digit removal, so "b2" -> "b" is dropped.
        if len(word) >= 2 and word not in stopwords:
            kept.append(word)
    return ' '.join(kept)

In [4]:
# Clean every message, then discard the rows whose text ended up empty.
# The shape is printed before and after so the number of removed rows shows.
df.loc[:, "Teks"] = df["Teks"].apply(preprocess_text)
print(df.shape)

emptied_labels = df.index[df["Teks"] == ""]
df = df.drop(emptied_labels)
print(df.shape)

df = df[['Teks', 'label']]
df.head()

(4605, 2)
(4601, 2)


Unnamed: 0,Teks,label
0,gimana dek dah baikan blm,0.0
1,ikuti seminar inspiratif cara mudah sukses bis...,1.0
2,terpilih sbgi pemenang cek rp jt kuota flash i...,1.0
3,punya masalah keuangan cukup jaminkan bpkb mob...,1.0
4,diskn tgel jkerbet bit ly sngroy pasaran terba...,1.0


In [5]:
# Store the label column as int64 and show the resulting dtypes.
df = df.astype({'label': 'int64'})
df.dtypes

Teks     object
label     int64
dtype: object

### Save the cleaned dataset without stemming (`clean_dataset.csv`)

In [6]:
# Write the current frame to CSV; index=False leaves the row index out of the file.
df.to_csv("clean_dataset.csv", index=False)

### Stemming

In [7]:
import time
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Stem the text of every row and report how long it took.
# perf_counter is a monotonic clock, so the elapsed value cannot be skewed
# by system clock adjustments during this long-running cell.
t1 = time.perf_counter()
stemmer = StemmerFactory().create_stemmer()
# Only the 'Teks' column is needed, so map over that Series directly
# instead of building a full row object for every record.
df['Teks'] = df['Teks'].map(stemmer.stem)
print('Stemming is done in', time.perf_counter() - t1)

Stemming is done in 671.5675990581512



### Save the cleaned dataset with stemming (`clean_dataset_with_stemming.csv`)

In [8]:
# Write the frame (stemmed by the preceding cell) to its own CSV, without the row index.
df.to_csv("clean_dataset_with_stemming.csv", index=False)

In [9]:
# Read clean_dataset.csv (written earlier in this notebook) back into a separate frame.
df2 = pd.read_csv('clean_dataset.csv')

In [10]:
# Display the column dtypes of the reloaded frame.
df2.dtypes

Teks     object
label     int64
dtype: object

In [11]:
import time
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Run the stemmer on a nonsense string containing digits, repeated spaces
# and a newline, and display what it returns.
# Note: t1 is assigned here but never read within this cell.
t1 = time.time()
factory = StemmerFactory()
stemmer = factory.create_stemmer()
noisy_sample = '1390129 safsafasf     afoasfjpasojf poafjaspo jasopfj aspo  \n safoasfasok fsaofjsaofjasofja aspjaopjasfopasjfpoasj pojop asjfpo ajsfopa jsapof jsapofja spofsajf posajfpo asjfpoas jfopsa fjsapof japsofj aspofjaso pfjaspofjs aposajpo saj sjaposj poasjf poasjfsopa jaspo jpoj opj apoj poaj aposjpo ajpo jaopj opaj poaop ajpo ajopaj poaj poaj poajop aso'
stemmer.stem(noisy_sample)

'1390129 safsafasf afoasfjpasojf poafjaspo jasopfj aspo safoasfasok fsaofjsaofjasofja aspjaopjasfopasjfpoasj pojop asjfpo ajsfopa jsapof jsapofja spofsajf posajfpo asjfpoas jfopsa fjsapof japsofj aspofjaso pfjaspofjs aposajpo saj sjaposj poasjf poasjfsopa jaspo jpoj opj apoj poaj aposjpo ajpo jaopj opaj poaop ajpo ajopaj poaj poaj poajop aso'