# Text Preprocessing

## Import library yang dibutuhkan

In [None]:
print("Detecting environment: ", end=' ')
try:
    import google.colab
    IN_COLAB = True
    print("Running the code in Google Colab. Installing and downloading dependencies.\nPlease wait...")
    import nltk
    !wget https://raw.githubusercontent.com/taudata-indonesia/eLearning/master/lib/taudataNlpTm.py
    !mkdir data
    !wget -P data/ https://raw.githubusercontent.com/taudata-indonesia/eLearning/master/data/slang.txt
    !wget -P data/ https://raw.githubusercontent.com/taudata-indonesia/eLearning/master/data/stopwords_id.txt
    !wget -P data/ https://raw.githubusercontent.com/taudata-indonesia/eLearning/master/data/stopwords_en.txt
    !wget -P data/ https://raw.githubusercontent.com/taudata-indonesia/eLearning/master/data/corpus_sederhana.txt
    !pip install unidecode textblob sastrawi
    nltk.download('popular')
except:
    IN_COLAB = False
    print("Running the code locally.")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
import string
from unidecode import unidecode
from html import unescape
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Download the NLTK 'popular' resource collection. Later cells read
# nltk.corpus.stopwords and call word_tokenize, which presumably rely on
# data from this collection -- TODO confirm the exact resources needed.
nltk.download('popular')

## Load stopwords

In [None]:
# Loading stopwords: several sources are merged into one set (`stops`).
from nltk.corpus import stopwords
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def _read_comma_separated_words(path):
    """Return the comma-separated entries on the FIRST line of *path*.

    All whitespace inside each entry is removed. Only the first line is
    read, so the file is expected to hold the whole list on one line.
    """
    # Same decoding policy as the other word-list files in this notebook.
    with open(path, 'r', encoding="utf-8", errors='replace') as word_file:
        first_line = word_file.readline()
    return [re.sub(r'[\s]+', '', entry) for entry in first_line.split(',')]


factory = StopWordRemoverFactory()

# Source 1 and 2: the stopword lists bundled with NLTK and Sastrawi.
NLTK_StopWords = stopwords.words('indonesian')
Sastrawi_StopWords_id = factory.get_stop_words()

# Source 3: one stopword per line; normalised to stripped lower case.
with open('data/stopwords_id.txt', "r", encoding="utf-8", errors='replace') as stopword_file:
    id_stop = [line.strip().lower() for line in stopword_file]

# Source 4: project-specific additions, plus the words that must NOT be
# treated as stopwords (both stored as a single comma-separated line).
more_stopwords = _read_comma_separated_words('more_stopwords.txt')
exclude_stopwords = _read_comma_separated_words('exclude_stopwords.txt')

# Preview the first N entries and report size and type of every source.
N = 10
print(NLTK_StopWords[:N])
print(Sastrawi_StopWords_id[:N])
print(id_stop[:N])
print(len(Sastrawi_StopWords_id), len(id_stop), len(NLTK_StopWords), len(more_stopwords))
print(type(Sastrawi_StopWords_id), type(id_stop), type(NLTK_StopWords), type(more_stopwords))

# Final stopword set: union of all sources minus the excluded words.
stops = set(Sastrawi_StopWords_id + id_stop + NLTK_StopWords + more_stopwords)
# Earlier hard-coded exclusion list, kept for reference:
#exclude = ["tidak", 'benar', 'betul', 'baik', 'belum', 'boleh', 'dekat', 'guna', 'mampu', 'masalah', 'nggak', 'pasti', 'penting', 'percuma', 'rasa', 'satu', 'tegas', 'tunjuk', 'yakin', 'usah']
exclude = exclude_stopwords
stops.difference_update(exclude)
print(len(stops))

## Load slang words

In [None]:
# Source 1: colloquial Indonesian lexicon CSV on GitHub; only its first two
# columns are kept.
# NOTE(review): the code below assumes these columns are named 'slang' and
# 'formal' -- confirm against the CSV header.
indo_slang_words = pd.read_csv("https://raw.githubusercontent.com/nasalsabila/kamus-alay/master/colloquial-indonesian-lexicon.csv")
indo_slang_words = indo_slang_words.iloc[:,:2]
print('jumlah slang words: ', len(indo_slang_words))
# Source 2: tau-data slang file, one "slang:formal" pair per line.
with open('data/slang.txt',"r",encoding="utf-8", errors='replace') as slang_file:
    slang_lines = [line.strip() for line in slang_file]
# Split on the FIRST ':' only and skip lines without one, so blank lines or
# values that themselves contain ':' no longer break the unpacking.
slangS = [
    [part.strip() for part in line.split(":", 1)]
    for line in slang_lines
    if ":" in line
]
# reshape keeps the array two-dimensional even when no pair was read.
slangS = np.array(slangS, dtype=str).reshape(-1, 2)
slang = slangS[:,0]
formal = slangS[:,1]
more_slang_words = pd.DataFrame({'slang': slang, 'formal': formal})
print('jumlah slang words: ', len(more_slang_words))
# Combine both sources; on a duplicated slang word the GitHub entry wins
# because it comes first in the concatenation.
all_slang_words = pd.concat([indo_slang_words, more_slang_words])
all_slang_words.drop_duplicates(subset='slang', keep='first', inplace=True)
print('jumlah slang words: ', len(all_slang_words))
# Transposing puts each slang word in its own column, so the single
# remaining row converts to one {slang: formal} dictionary.
slang_words = all_slang_words.set_index('slang').T.to_dict(orient='records')
dict_slang_words = slang_words[0]
# Spot check: the label mentions "nggak" but the key looked up is 'ga';
# this raises KeyError if 'ga' is not in the lexicon.
print('kata normal dari kata nggak yaitu: ', dict_slang_words['ga'])

## Fungsi text preprocessing

In [None]:
# Emoji: a single character class built from the code-point ranges below;
# the trailing "+" makes a run of consecutive matches one match.
# NOTE(review): the last range spans U+24C2..U+1F251, which also covers many
# non-emoji characters (for example the CJK ideographs) -- confirm intended.
emoji_pattern = re.compile(
    "[" + "".join((
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
    )) + "]+",
    flags=re.UNICODE,
)

# Compiled once instead of on every call.
_HASHTAG_RE = re.compile(r"#(\w+)")
# A word inside a tag: one word character followed by anything that is not
# an upper-case ASCII letter (splits CamelCase at each capital).
_HASHTAG_WORD_RE = re.compile(r'[\w][^A-Z]*')


def hashtags(text):
    """Replace every ``#Tag`` in *text* with the tag's words.

    The leading ``#`` is dropped and the tag is split before each
    upper-case ASCII letter, e.g. ``#HelloWorld`` becomes ``Hello World``.

    Each match is rewritten in place by the regex substitution, so a tag
    that is a prefix of a longer tag (``#Foo`` and ``#FooBar``) can no
    longer corrupt the longer one, which a plain ``str.replace`` did.

    Args:
        text: The string to process.

    Returns:
        The text with all hashtags expanded into plain words.
    """
    def _split_tag(match):
        return ' '.join(_HASHTAG_WORD_RE.findall(match.group(1)))

    return _HASHTAG_RE.sub(_split_tag, text)

# Any character (newlines included) followed by one or more copies of itself.
_REPEATED_CHAR_RE = re.compile(r'(.)\1+', flags=re.DOTALL)


def removeConsecutiveDuplicates(text):
    """Collapse every run of identical consecutive characters to one.

    ``"halooo  dunia!!"`` becomes ``"halo dunia!"``. This applies to all
    characters, so legitimate double letters are collapsed as well.

    Implemented as a single regex pass: the previous recursive version
    used one stack frame per character and raised RecursionError on long
    input, besides copying the remaining string at every step.

    Args:
        text: The string to process.

    Returns:
        The string with each run of repeated characters reduced to one.
    """
    return _REPEATED_CHAR_RE.sub(r'\1', text)

def normalization(text):
    """Replace slang tokens in *text* with their formal form.

    The text is split into tokens with TextBlob's ``words`` property; each
    token that is a key of ``dict_slang_words`` is swapped for the mapped
    value, other tokens are kept, and the result is joined with single
    spaces.
    """
    tokens = TextBlob(text).words
    formal_tokens = [dict_slang_words.get(token, token) for token in tokens]
    return ' '.join(formal_tokens)

def datePreproc(date):
    """Strip the time part from an ISO-style timestamp.

    The value is converted with ``str()`` and every ``T`` together with
    the word characters, ``:`` and ``+`` that follow it is removed, so
    ``"2021-03-04T10:20:30+00:00"`` becomes ``"2021-03-04"``. Text without
    a ``T`` is returned unchanged.

    The pattern is a raw string: the former non-raw literal relied on the
    invalid escape ``\\w``, which newer Python versions warn about.

    Args:
        date: Any value; it is stringified before processing.

    Returns:
        The stringified value with the ``T...`` portion removed.
    """
    return re.sub(r'(T[\w:+]*)', '', str(date))

# Compiled pattern for http:// and https:// URLs: the scheme followed by one
# or more letters, digits, characters from the two punctuation classes, or
# %XX hexadecimal escapes.
urlPattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

def caseFolding(text):
    """Return *text* converted to lower case."""
    return text.lower()

def cleansing1(text):
    """First cleaning pass: strip tweet-specific markup from *text*.

    In order: hashtags, newlines, non-ASCII characters (transliterated by
    ``unidecode``), HTML entities, emoji, e-mail addresses, @mentions and
    URLs are removed or replaced, then whitespace runs are collapsed to a
    single space.

    All patterns are raw strings (the former literals relied on invalid
    escapes such as ``\\w``), and URL removal reuses the module-level
    ``urlPattern`` instead of repeating the same expression inline.

    Args:
        text: The raw tweet text.

    Returns:
        The cleaned text; letter case is left untouched.
    """
    # Remove hashtags entirely (the hashtags() word-splitting helper is not applied here)
    text = re.sub(r'#(\w+)', ' ', text)
    # Replace line breaks with spaces
    text = text.replace('\n', ' ')
    # Closest ASCII representation
    text = unidecode(text)
    # Decode HTML entities
    text = unescape(text)
    # Remove emoji
    text = emoji_pattern.sub(r'', text)
    # Remove e-mail addresses (before @mentions, which would otherwise split them)
    text = re.sub(r'[\w._%+-]+@[\w\.-]+\.[a-zA-Z]{2,4}', ' ', text)
    # Remove @username mentions
    text = re.sub(r'@[\w]*', ' ', text)
    # Remove website URLs
    text = urlPattern.sub(' ', text)
    # Collapse runs of whitespace
    text = re.sub(r'[\s]+', ' ', text)
    return text

def cleansing2(text):
    """Second cleaning pass: keep only letters separated by single spaces.

    Punctuation, underscores and digit runs are replaced by spaces,
    repeated consecutive characters are collapsed by
    ``removeConsecutiveDuplicates``, and whitespace runs are reduced to
    one space.

    The patterns are raw strings; the former non-raw literals relied on
    the invalid escapes ``\\d`` and ``\\s``.

    Args:
        text: Text that has already been through the first cleaning pass.

    Returns:
        The cleaned text.
    """
    # Remove punctuation (anything that is neither word character nor whitespace) and underscores
    text = re.sub(r'[^\w\s]|_', ' ', text)
    # Remove numbers
    text = re.sub(r'[\d]+', ' ', text)
    # Collapse repeated characters, see https://www.geeksforgeeks.org/remove-consecutive-duplicates-string/
    text = removeConsecutiveDuplicates(text)
    # Collapse runs of whitespace
    text = re.sub(r'[\s]+', ' ', text)
    return text

def stopwordRemover(word_tokens):
    """Join the tokens that are not stopwords into one string.

    Args:
        word_tokens: An already tokenised sentence (for example the output
            of ``nltk.tokenize.word_tokenize``).

    Returns:
        The tokens that are not in the module-level ``stops`` set, joined
        with single spaces.
    """
    return " ".join(token for token in word_tokens if token not in stops)

# Shared Sastrawi stemmer, created lazily on the first preprocess() call.
_STEMMER = None


def _get_stemmer():
    """Return the shared stemmer, building it on first use."""
    global _STEMMER
    if _STEMMER is None:
        _STEMMER = StemmerFactory().create_stemmer()
    return _STEMMER


def preprocess(text):
    """Run the full cleaning pipeline on one tweet.

    Steps, in order: stringify, ``cleansing1``, ``caseFolding``,
    ``normalization`` (slang to formal), ``cleansing2``, tokenisation with
    ``word_tokenize``, ``stopwordRemover`` and finally Sastrawi stemming.

    The stemmer is created once and reused; it used to be rebuilt through
    ``StemmerFactory`` on every call, i.e. once per tweet.

    Args:
        text: The raw tweet; any value is accepted and stringified.

    Returns:
        The cleaned, stopword-free and stemmed text.
    """
    stemmer = _get_stemmer()
    raw_text = cleansing1(str(text))
    lower_text = caseFolding(raw_text)
    formal_text = normalization(lower_text)
    clean_text = cleansing2(formal_text)
    words = word_tokenize(clean_text)
    without_stopwords = stopwordRemover(words)
    return stemmer.stem(without_stopwords)

## Load dataset

In [None]:
# Load the candidate dataset; lines pandas cannot parse are skipped
# (on_bad_lines='skip').
# NOTE(review): '/content/...' is an absolute path, presumably the Colab
# working directory -- adjust when running elsewhere.
data_kandidat = pd.read_csv('/content/new_prabowo.csv', on_bad_lines='skip')
# Drop rows in which every column is missing.
data_kandidat.dropna(axis=0, how='all', inplace=True)
print(data_kandidat.shape)
data_kandidat.head()

## Seleksi data

In [None]:
# Keep one row per username (the last occurrence), then drop any rows that
# are still identical in every column, again keeping the last one.
data = (
    data_kandidat
    .drop_duplicates(subset=['username'], keep='last')
    .drop_duplicates(keep='last')
)
print(data.shape)
data.head()

## Preprocessing tanggal

In [None]:
# Reduce every timestamp to its date part.
#
# `tanggal_general` is built from `data` (the de-duplicated selection), not
# from `data_kandidat`: it is later combined row by row with the tweets
# taken from `data`, so both must have the same rows in the same order.
# Building it from the full table misaligned dates with tweets and made the
# later `data_clean['cleaned'] = tweet_clean` assignment fail with a length
# mismatch whenever duplicates had been removed.
tanggal = list(data['date'])
tanggal_general = [str(datePreproc(date)) for date in tanggal]

# The full candidate table keeps its date column normalised as before.
data_kandidat['date'] = [str(datePreproc(date)) for date in data_kandidat['date']]

## Preprocessing tweet

In [None]:
# Run the preprocessing pipeline on every tweet of the selection and print
# a progress line after each one is done.
tweets = list(data['content'])
total = len(tweets)
tweet_clean = []
for position, raw_tweet in enumerate(tweets, start=1):
    cleaned_tweet = preprocess(raw_tweet)
    print('loading for stemming tweet ke-{} dari {}'.format(position, total))
    tweet_clean.append(str(cleaned_tweet))

In [None]:
# Renumber the rows 0..n-1 and discard the old index. The columns of `data`
# are later assigned into a frame that has a default index, and such
# assignments are matched on index labels.
data.reset_index(drop=True, inplace=True)

## Membuat dataframe tweet yang telah siap dianalisis

In [None]:
from datetime import datetime
def parser_date(date):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime`` (time 00:00:00).

    Raises:
        ValueError: If *date* does not match the ``%Y-%m-%d`` format.
    """
    return datetime.strptime(date, "%Y-%m-%d")


# Assemble the analysis-ready table: the cleaned dates first, then the
# original columns copied from `data`, then the preprocessed tweet text.
data_clean = pd.DataFrame({'date': tanggal_general})
for column in ('url', 'username', 'content'):
    # Series assignment is matched on index labels, not on position.
    data_clean[column] = data[column]
data_clean['cleaned'] = tweet_clean
# Turn the date strings into datetime objects.
data_clean['date'] = data_clean['date'].apply(parser_date)
data_clean

In [None]:
# Drop rows whose cleaned text is missing, then print quick sanity checks:
# any duplicated rows, missing values per column, a preview and the shape.
# NOTE(review): 'cleaned' is filled with str(...) values, so a tweet that
# became empty is '' rather than NaN and is not removed here -- confirm
# whether that is intended.
data_clean = data_clean.dropna(subset=['cleaned'])
summaries = (
    data_clean.duplicated().any(),
    data_clean.isnull().any(),
    data_clean.head(),
    data_clean.shape,
)
for summary in summaries:
    print(summary)

## Menyimpan file

In [None]:
# Write the cleaned table to a UTF-8 CSV file, without the index column.
data_clean.to_csv('tweet_clean_prabowo.csv', encoding='utf8', index=False)