# Data Preprocessing

In [13]:
import pandas as pd
import nltk
import re
import swifter

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [28]:
target_dataset = 'tweets.csv'
raw_data_path = f'../dataset/raw/raw_{target_dataset}'
normalization_list_path = '../dataset/wordlist/normalization_list.csv'
output_path = f'../dataset/processed/processed_{target_dataset}'

In [15]:
df = pd.read_csv(raw_data_path, delimiter='`')

print(df.shape)

(7502, 27)


In [16]:
df = df[['content']]

print(df.shape)
df.head()

(7502, 1)


Unnamed: 0,content
0,@soeyoto1 @msaid_didu Bukti kalau Pemerintahan...
1,"@KompasTV Mentri esdm, trus patokanmu apa? Kan..."
2,@hnurwahid Ngaku masih subsidi tapi ada swasta...
3,@Tan_Mar3M Kalau Vivo bisaberani bertahan deng...
4,Menteri BUMN Erick Thohir menegaskan persiapan...


# Section 1: Data cleaning

In [17]:
# Functions to clean data

def case_folding(text):
	return text.lower()

def clean_tweet(tweet):
	# Remove @mentions
	tweet = re.sub(r'@\S+', ' ', tweet)
	# Remove URLs
	tweet = re.sub(r'https?://[A-Za-z0-9./]+', ' ', tweet)
	# Remove RT
	tweet = re.sub(r'RT : ', ' ', tweet)
	# Remove punctuation
	tweet = re.sub(r'[^\w\s]', ' ', tweet)
	# Remove numbers
	tweet = re.sub(r'[0-9]', ' ', tweet)
	# Remove whitespace
	tweet = re.sub(r'\s+', ' ', tweet)
	# Remove leading and trailing whitespace
	tweet = tweet.strip()
	# Remove non-ASCII characters
	tweet = tweet.encode('ascii', 'ignore').decode('ascii')
	# Keep tweet with more than 2 characters
	tweet = ' '.join([w for w in tweet.split() if len(w) > 2])
	return tweet

def tokenize(tweet):
	return nltk.word_tokenize(tweet)

normalization_list = pd.read_csv(normalization_list_path, delimiter=',')
list_normalize_targets = list(normalization_list['target'])
list_normalize_replacements = list(normalization_list['replacement'])
print(f"Normalization list: {len(list_normalize_targets)}")

def normalize(tokens):
	return [list_normalize_replacements[list_normalize_targets.index(token)] if token in list_normalize_targets else token for token in tokens]

indonesian_stopwords_list = nltk.corpus.stopwords.words('indonesian')
english_stopwords_list = nltk.corpus.stopwords.words('english')
custom_stopwords_list = ['mun','aya','ooo','nre','dur','tir','sih','aji','akun','akuuu','akuuuu','al','ala','alah',
												'bruh','gokkkssss','kop','p','y','my','nih','in','r','hnw','lt','wkwkwk','sulawesi','alsut',
												'alvien','andesten','anonymous','anonim','nya','blt','barcodenya','tyyy','this','lyfe','wkwkw',
												'petronas','mas','bang','kak','eeeaaaa','sep','cokkk','wkwkwwk','and','indramayu','hayoo',
												'hihi','rdp','neg','cc','ber','ybs','juta','ribu','dir','triliun','any','anything','anyway'
												'kom','miliar','tralalatrililiun','piis','thd','trilyunan','aok','aos','aowkwk','apwkwowkwokw',
												'uud','non','lho','lha','lalo','kakak','dik','adik','adk','kitaji','halo','hai','²','½kali','aaaahhh',
												'aaahh','aaahhh','aah','aatu','abad','abah','abdur','abrik','abrol','abt','adick','adi','adik','adol','aff','aga',
												'aha','ahahah','ahahahaa','ahshshskaks','akn','akp','akr']

list_stopwords = indonesian_stopwords_list + english_stopwords_list + custom_stopwords_list
print(f"Indonesian stopwords: {len(indonesian_stopwords_list)}")
print(f"English stopwords: {len(english_stopwords_list)}")
print(f"Custom stopwords: {len(custom_stopwords_list)}")
print(f"Total stopwords: {len(list_stopwords)}")

def remove_stopwords(tokens):
	return [token for token in tokens if token not in list_stopwords]

Normalization list: 1034
Indonesian stopwords: 758
English stopwords: 179
Custom stopwords: 110
Total stopwords: 1047


## Section 1.1: Case folding, data cleaning and tokenization data

In [18]:
df['tokens'] = df['content'].swifter.apply(case_folding)
df['tokens'] = df['tokens'].swifter.apply(clean_tweet)
df['tokens'] = df['tokens'].swifter.apply(tokenize)

print(df.shape)
df.head()

Pandas Apply:   0%|          | 0/7502 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/7502 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/7502 [00:00<?, ?it/s]

(7502, 2)


Unnamed: 0,content,tokens
0,@soeyoto1 @msaid_didu Bukti kalau Pemerintahan...,"[bukti, kalau, pemerintahan, pak, tidak, pro, ..."
1,"@KompasTV Mentri esdm, trus patokanmu apa? Kan...","[mentri, esdm, trus, patokanmu, apa, kantong, ..."
2,@hnurwahid Ngaku masih subsidi tapi ada swasta...,"[ngaku, masih, subsidi, tapi, ada, swasta, jua..."
3,@Tan_Mar3M Kalau Vivo bisaberani bertahan deng...,"[kalau, vivo, bisaberani, bertahan, dengan, ha..."
4,Menteri BUMN Erick Thohir menegaskan persiapan...,"[menteri, bumn, erick, thohir, menegaskan, per..."


## Section 1.2: Normalization and Stopwords removal

In [19]:
df['tokens'] = df['tokens'].swifter.apply(normalize)
df['tokens'] = df['tokens'].swifter.apply(remove_stopwords)

print(df.shape)
df.head()

Pandas Apply:   0%|          | 0/7502 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/7502 [00:00<?, ?it/s]

(7502, 2)


Unnamed: 0,content,tokens
0,@soeyoto1 @msaid_didu Bukti kalau Pemerintahan...,"[bukti, pemerintahan, pro, rakyat, pro, oligar..."
1,"@KompasTV Mentri esdm, trus patokanmu apa? Kan...","[menteri, esdm, patokanmu, kantong, pertamina,..."
2,@hnurwahid Ngaku masih subsidi tapi ada swasta...,"[mengaku, subsidi, swasta, jual, murah, subsid..."
3,@Tan_Mar3M Kalau Vivo bisaberani bertahan deng...,"[vivo, berani, bertahan, harga, nasib, pertamina]"
4,Menteri BUMN Erick Thohir menegaskan persiapan...,"[menteri, bumn, erick, thohir, persiapan, indo..."


## Section 1.3: Remove Duplicate

In [20]:
print(f"Before: {df.shape}")

# Remove duplicate data
df.drop_duplicates(subset='tokens', keep="first", inplace=True)

print(f"After: {df.shape}")

Before: (7502, 2)
After: (7104, 2)


# Section 1: Data Stemming

In [21]:
# Functions to stem data
# It will take more times to process (~20 minutes)

stem_factory = StemmerFactory()
stemmer = stem_factory.create_stemmer()

terms_dict = {}
for tokens in df['tokens']:
	for token in tokens:
		if token not in terms_dict:
			terms_dict[token] = ''

print(f"Unique terms: {len(terms_dict)}")

for i, term in enumerate(terms_dict):
	terms_dict[term] = stemmer.stem(f'{term}')
	if i % 1000 == 0:
		print(f"On processing... {i} terms have been stemmed")

Unique terms: 11991
On processing... 0 terms have been stemmed
On processing... 1000 terms have been stemmed
On processing... 2000 terms have been stemmed
On processing... 3000 terms have been stemmed
On processing... 4000 terms have been stemmed
On processing... 5000 terms have been stemmed
On processing... 6000 terms have been stemmed
On processing... 7000 terms have been stemmed
On processing... 8000 terms have been stemmed
On processing... 9000 terms have been stemmed
On processing... 10000 terms have been stemmed
On processing... 11000 terms have been stemmed


In [23]:
def apply_stemmed_tokens(tokens):
	return [terms_dict[token] for token in tokens]

df['tokens_ready'] = df['tokens'].swifter.apply(apply_stemmed_tokens)

print(df.shape)
df.head()

Pandas Apply:   0%|          | 0/7104 [00:00<?, ?it/s]

(7104, 3)


Unnamed: 0,content,tokens,tokens_ready
0,@soeyoto1 @msaid_didu Bukti kalau Pemerintahan...,"[bukti, pemerintahan, pro, rakyat, pro, oligar...","[bukti, perintah, pro, rakyat, pro, oligarki, ..."
1,"@KompasTV Mentri esdm, trus patokanmu apa? Kan...","[menteri, esdm, patokanmu, kantong, pertamina,...","[menteri, esdm, patok, kantong, pertamina, jeb..."
2,@hnurwahid Ngaku masih subsidi tapi ada swasta...,"[mengaku, subsidi, swasta, jual, murah, subsid...","[aku, subsidi, swasta, jual, murah, subsidi, p..."
3,@Tan_Mar3M Kalau Vivo bisaberani bertahan deng...,"[vivo, berani, bertahan, harga, nasib, pertamina]","[vivo, berani, tahan, harga, nasib, pertamina]"
4,Menteri BUMN Erick Thohir menegaskan persiapan...,"[menteri, bumn, erick, thohir, persiapan, indo...","[menteri, bumn, erick, thohir, siap, indonesia..."


In [27]:
df.to_csv(output_path, index=False, header=True, sep=';')