In [1]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

  from .autonotebook import tqdm as notebook_tqdm


# Dataset

In [6]:
data1 = pd.read_excel('dataset\SentimentIbu_kota_pindah_new.xlsx')
data2 = pd.read_excel('dataset\SentimentIPrmindahan_bu_Kota_Jakarta.xlsx') 

data = pd.concat([data1,data2])
data = data['text']
data.head()

0    @detikcom Maksudnya secara gak langsung mau bi...
1    @san_ozil saya turut mendukung proses pemindah...
2    Kolaborasi menjadi salah satu kunci bagi pemer...
3    Sch! GUYS AKU KETERIMA DI ILKOM UNPAD 😭😭😭😭😭\n\...
4    @abangbelneg ayo kawal proses pemindahan Ibuko...
Name: text, dtype: object

In [7]:
data = data.astype('string')


In [8]:
data.info()

<class 'pandas.core.series.Series'>
Index: 5883 entries, 0 to 1494
Series name: text
Non-Null Count  Dtype 
--------------  ----- 
5883 non-null   string
dtypes: string(1)
memory usage: 91.9 KB


In [10]:
# remove duplicate 
data = data.drop_duplicates()

In [11]:
data.info()

<class 'pandas.core.series.Series'>
Index: 2099 entries, 0 to 7
Series name: text
Non-Null Count  Dtype 
--------------  ----- 
2099 non-null   string
dtypes: string(1)
memory usage: 97.3 KB


# preprocessing

### cleaning + casefolding

In [25]:
# casefolding + cleaning 
# function 
def clean_tweet(teks):
    return ''.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+\/\/\S+)","",teks))
def remove_number(text):
    return  re.sub(r"\d+", "", text)

#use function
clean = []
for i in data:
    clean.append(clean_tweet(i))

In [26]:
clean_dat = pd.DataFrame(clean, columns=['teks'])
clean_dat['teks'] = clean_dat['teks'].str.lower()
clean_dat['teks'] = clean_dat['teks'].apply(remove_number)

### tokenizing

In [37]:
#function
def word_tokenize_wrapper(text):
    return word_tokenize(text)

In [41]:
clean_dat['teks'] = clean_dat['teks'].apply(word_tokenize_wrapper)



In [42]:
clean_dat.head()

Unnamed: 0,teks
0,"[maksudnya, secara, gak, langsung, mau, bikin,..."
1,"[ozil, saya, turut, mendukung, proses, peminda..."
2,"[kolaborasi, menjadi, salah, satu, kunci, bagi..."
3,"[sch, guys, aku, keterima, di, ilkom, unpad, t..."
4,"[ayo, kawal, proses, pemindahan, ibukota, ibuk..."


### stopword removal

In [2]:
list_stopword = stopwords.words('indonesian')
sastrawi_stopwords = StopWordRemoverFactory().get_stop_words()

# bila ada kata yang mau di masukan stopword secara manual 
# list_stopword.extend(['...'])    

txt_stopword = pd.read_csv("id.stopwords.02.01.2016.txt", names= ["stopwords"], header = None)

# menambah stopword berdasarkan kamus 
list_stopword.extend(txt_stopword["stopwords"][0].split(' '))
list_stopword.extend(sastrawi_stopwords)

In [51]:
list_stopword = set(list_stopword)

# function
def stopwords_removal(words):
    return [word for word in words if word not in list_stopword]

clean_dat['teks'] = clean_dat['teks'].apply(stopwords_removal)

In [52]:
clean_dat.head()

Unnamed: 0,teks
0,"[maksudnya, gak, langsung, bikin, pembenaran, ..."
1,"[ozil, mendukung, proses, pemindahan, ibukota,..."
2,"[kolaborasi, salah, kunci, pemerataan, investa..."
3,"[sch, guys, keterima, ilkom, unpad, trivia, st..."
4,"[ayo, kawal, proses, pemindahan, ibukota, ibuk..."


### normalisasi

In [64]:
norm_dat = pd.read_csv('kamus_normalisasi.csv',delimiter=',')
norm_dat

Unnamed: 0,slang,formal,In-dictionary,context,category1,category2,category3
0,woww,wow,1,wow,elongasi,0,0
1,aminn,amin,1,Selamat ulang tahun kakak tulus semoga panjang...,elongasi,0,0
2,met,selamat,1,Met hari netaas kak!? Wish you all the best @t...,abreviasi,0,0
3,netaas,menetas,1,Met hari netaas kak!? Wish you all the best @t...,afiksasi,elongasi,0
4,keberpa,keberapa,0,Birthday yg keberpa kak?,abreviasi,0,0
...,...,...,...,...,...,...,...
15001,gataunya,enggak taunya,0,Ini kaya nenek2 ya beb gataunya agnezz @yugime...,akronim,0,0
15002,gtau,enggak tau,0,Stidaknya mrka may berkarya Dan berusaha yg tr...,akronim,abreviasi,0
15003,gatau,enggak tau,0,Ih gatau malu,akronim,0,0
15004,fans2,fan-fan,0,Jkt48 adalah tempat di mana sesama fans saling...,reduplikasi,naturalisasi,0


In [65]:
norm_dat = norm_dat.drop(['context','In-dictionary','category1','category2','category3'],axis=1)

In [66]:
normalizad_word_dict = {}

for index, row in norm_dat.iterrows():
    if row[0] not in normalizad_word_dict:
        normalizad_word_dict[row[0]] = row[1] 

  if row[0] not in normalizad_word_dict:
  normalizad_word_dict[row[0]] = row[1]


In [73]:
# function
def normalized_term(document):
    return [normalizad_word_dict[term] if term in normalizad_word_dict else term for term in document]

clean_dat['teks'] = clean_dat['teks'].apply(normalized_term)

### Stemming

In [85]:
clean_dat

Unnamed: 0,teks
0,"[maksudnya, enggak, langsung, bikin, pembenara..."
1,"[ozil, mendukung, proses, pemindahan, ibukota,..."
2,"[kolaborasi, salah, kunci, pemerataan, investa..."
3,"[sch, guys, keterima, ilkom, unpad, trivia, st..."
4,"[ayo, kawal, proses, pemindahan, ibukota, ibuk..."
...,...
7977,"[asn, pindah, kota, negara, nusantara, menpan,..."
7978,"[mbaknya, memang, nikah, umur, nikah, nikahnya..."
7979,"[yang, atasin, banjir, pas, presiden, tuh, sap..."
7980,"[mendukung, proses, pemindahan, ibukota, negar..."


In [70]:
# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stem test
sentence = 'Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan'
output   = stemmer.stem(sentence)

print(output)

ekonomi indonesia sedang dalam tumbuh yang bangga


In [86]:
import swifter
# funciton 
def stemmed_wrapper(term):
    return stemmer.stem(term)

term_dict = {}

for document in clean_dat['teks']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))
print("------------------------")

for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term,":" ,term_dict[term])

# fucntion for dataframe 
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

clean_dat['teks'] = clean_dat['teks'].swifter.apply(get_stemmed_term)

7279
------------------------
maksudnya : maksud
enggak : enggak
langsung : langsung
bikin : bikin
pembenaran : benar
pemindahan : pindah
ibukota : ibukota
ikn : ikn
dengan : dengan
halus : halus
acara : acara
macet : macet
dll : dll
basi : basi
tong : tong
kalo : kalo
sidah : sidah
presiden : presiden
gampang : gampang
atasi : atas
tahun : tahun
lho : lho
lagu : lagu
kau : kau
putar : putar
ozil : ozil
mendukung : dukung
proses : proses
negara : negara
pindahibukota : pindahibukota
kolaborasi : kolaborasi
salah : salah
kunci : kunci
pemerataan : perata
investasi : investasi
indonesia : indonesia
upaya : upaya
pemerintah : perintah
mewujudkan : wujud
citacita : citacita
menggenjot : genjot
investor : investor
daerah : daerah
sch : sch
guys : guys
keterima : terima
ilkom : ilkom
unpad : unpad
trivia : trivia
stasiun : stasiun
kak : kak
bandung : bandung
dirancang : rancang
persiapan : siap
jakarta : jakarta
httpstcomtekdspb : httpstcomtekdspb
ayo : ayo
kawal : kawal
ibukotanegara : ibuk

Pandas Apply: 100%|██████████| 7982/7982 [00:00<00:00, 310191.18it/s]


In [87]:
clean_dat

Unnamed: 0,teks
0,"[maksud, enggak, langsung, bikin, benar, pinda..."
1,"[ozil, dukung, proses, pindah, ibukota, negara..."
2,"[kolaborasi, salah, kunci, perata, investasi, ..."
3,"[sch, guys, terima, ilkom, unpad, trivia, stas..."
4,"[ayo, kawal, proses, pindah, ibukota, ibukotan..."
...,...
7977,"[asn, pindah, kota, negara, nusantara, menpan,..."
7978,"[mbak, memang, nikah, umur, nikah, nikah, yaud..."
7979,"[yang, atasin, banjir, pas, presiden, tuh, sap..."
7980,"[dukung, proses, pindah, ibukota, negara, pind..."
