In [None]:
pip install ekphrasis

In [None]:
pip install sastrawi

In [None]:
import re
import string
import pandas as pd
from copy import deepcopy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [None]:
# Load the tweet spreadsheet into a DataFrame.
# NOTE(review): absolute path on a Windows D: drive; the notebook cannot be
# re-run on another machine unless the file is placed at this exact location.
df = pd.read_excel('D:\\tweets_dataframe_cleaned.xlsx')

In [None]:
df

In [None]:
df.head()

In [None]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

# Shared ekphrasis pre-processor; bersih_data() calls its pre_process_doc()
# on every tweet and joins the returned tokens.
# NOTE(review): segmenter/corrector use the "twitter" word statistics, which are
# presumably English, while the later Sastrawi steps suggest Indonesian text;
# confirm that this is intended.
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['email', 'percent', 'money', 'phone', 'user', 'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag"},
    # fix HTML tokens
    fix_html=True,
    # corpus from which the word statistics are going to be used for word segmentation
    segmenter="twitter",
    # corpus from which the word statistics are going to be used for spell correction
    corrector="twitter",
    # perform word segmentation on hashtags
    unpack_hashtags=True,
    # Unpack contractions (can't -> can not)
    unpack_contractions=True,
    # spell correction for elongated words
    spell_correct_elong=False,
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    # list of dictionaries for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)


In [None]:
def bersih_data(text):
    """Run ``text`` through the module-level ekphrasis ``text_processor``.

    Returns the tokens produced by ``pre_process_doc`` joined with single spaces.
    """
    tokens = text_processor.pre_process_doc(text)
    return " ".join(tokens)

def non_ascii(text):
    """Return ``text`` with every non-ASCII character replaced by ``?``.

    The string is encoded to ASCII with the ``replace`` error handler and then
    decoded again, so the result contains ASCII characters only.
    """
    ascii_bytes = text.encode('ascii', errors='replace')
    return ascii_bytes.decode('ascii')

def remove_space_zulfikar(text):
    """Collapse every whitespace run to one space and trim both ends."""
    words = text.split()
    return " ".join(words)

def remove_emoji_zulfikar(text):
    r"""Strip escaped emoji bytes and hashtags, then normalise whitespace.

    Two kinds of tokens are replaced by a space:

    * literal hex escapes such as ``\xf0`` (a backslash, ``x`` and two hex
      digits), which is how emoji bytes appear in the raw text;
    * hashtags: ``#`` followed by ASCII letters or digits.

    The previous pattern ``[x#][A-Za-z0-9]+`` matched a bare ``x`` anywhere, so
    it also cut off the rest of ordinary words containing that letter
    (``"taxi"`` became ``"ta"``). Requiring the backslash and exactly two hex
    digits limits the match to real escape sequences.

    Returns the remaining words joined by single spaces.
    """
    without_codes = re.sub(r"\\x[0-9A-Fa-f]{2}|#[A-Za-z0-9]+", " ", text)
    return ' '.join(without_codes.split())

def remove_tab(text):
    r"""Remove literal escape sequences left in the text.

    The two-character sequences ``\t``, ``\n`` and ``\u`` (a real backslash
    followed by the letter) are each replaced by a space, in that order; any
    backslash still present afterwards is deleted.
    """
    for escaped in ('\\t', '\\n', '\\u'):
        text = text.replace(escaped, " ")
    return text.replace('\\', "")

def remove_tab2(text):
    """Replace every run of whitespace (spaces, tabs, newlines) with one space.

    Leading and trailing whitespace is reduced to a single space, not removed.
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    return re.sub(r'\s+', ' ', text)

def remove_rt(text):
    """Replace the stand-alone retweet marker ``RT`` with a space.

    Only ``RT`` as a whole word is replaced. The earlier plain substring
    replacement also hit the letters "RT" inside upper-case words (for example
    ``"PARTAI"`` became ``"PA AI"``), which matters because this step runs
    before the text is lower-cased.
    """
    return re.sub(r"\bRT\b", " ", text)

def remove_mention(text):
    """Remove @mentions, #hashtags and full URLs, then normalise whitespace.

    A mention or hashtag is ``@`` or ``#`` followed by ASCII letters/digits; a
    URL is ``<scheme>://`` followed by non-space characters. Each match is
    replaced by a space and the remaining words are joined by single spaces.
    """
    # Raw string: "\w", "\/" and "\S" are invalid escapes in a normal literal.
    pattern = r"([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"
    return ' '.join(re.sub(pattern, " ", text).split())

def remove_incomplete_url(text):
    """Replace each ``http://`` and ``https://`` prefix with a space.

    Only the scheme prefix is replaced; the rest of the address stays in place.
    """
    for scheme in ("http://", "https://"):
        text = text.replace(scheme, " ")
    return text

def remove_single_char(text):
    """Delete ASCII letters that stand alone as a one-character word.

    Surrounding whitespace is left untouched, so gaps may remain.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

def remove_excessive_dot(text):
    """Replace every run of two or more dots with a single space.

    A single dot is kept. Matching the whole run fixes the pairwise
    replacement used before, which left one dot behind for odd-length runs
    (``"..."`` became ``" ."``).
    """
    return re.sub(r"\.{2,}", " ", text)

def change_stripe(text):
    """Return ``text`` with every hyphen replaced by a space."""
    pieces = text.split('-')
    return " ".join(pieces)

def lower(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def remove_whitespace_LT(text):
    """Trim whitespace from the left (leading) and right (trailing) ends."""
    without_leading = text.lstrip()
    return without_leading.rstrip()

def remove_whitespace_multiple(text):
    """Collapse each run of whitespace characters into a single space."""
    # Raw string: "\s" in a normal literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    return re.sub(r'\s+', ' ', text)

def remove_punctuation(text):
    """Delete every ASCII punctuation character except the underscore.

    The set of characters comes from ``string.punctuation``. It is escaped
    before being placed inside the ``[...]`` character class; without escaping,
    the ``\\]`` pair in ``string.punctuation`` is read as an escaped bracket and
    the backslash itself is never removed.
    """
    # Keep underscores; every other punctuation mark is dropped.
    to_remove = string.punctuation.replace("_", "")
    pattern = "[{}]".format(re.escape(to_remove))
    return re.sub(pattern, "", text)


In [None]:
def remove_number_eks(text):
    """Replace each ``<number>`` placeholder in ``text`` with a space."""
    placeholder = '<number>'
    return text.replace(placeholder, " ")

def remove_angka(text):
    """Delete every run of digits from ``text`` (nothing is inserted)."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

def remove_URL_eks(text):
    """Replace the stand-alone placeholder ``URL`` / ``url`` with a space.

    Only whole-word occurrences are replaced. The plain substring replacement
    used before also removed the letters "url" from inside longer words.
    Mixed-case spellings such as ``Url`` are left alone, as before.
    """
    return re.sub(r"\b(?:URL|url)\b", " ", text)

def space_punctuation(text):
    """Insert a space on each side of ``. , ! ? ( )`` where one is missing.

    A space is added before such a character when the preceding character is
    not a space, and after it when the following character is not a space
    (this includes the end of the string).
    """
    boundary = re.compile('(?<! )(?=[.,!?()])|(?<=[.,!?()])(?! )')
    return boundary.sub(' ', text)


In [None]:
print(df.columns)


In [None]:
# Rename the 'tweets' column to 'Tweet'; the cleaning loop further down reads df['Tweet'].
df.rename(columns={'tweets': 'Tweet'}, inplace=True)


In [None]:
print(df.columns)


In [None]:
# Cleaning steps applied to every tweet, in this exact order. The order matters:
# for instance remove_rt must see the original upper-case "RT" before lower()
# runs, and bersih_data (ekphrasis) produces the "<number>" placeholders that
# remove_number_eks strips afterwards.
# NOTE(review): remove_mention is defined above but is not part of this sequence.
CLEANING_STEPS = (
    remove_rt,
    lower,
    change_stripe,
    remove_emoji_zulfikar,
    remove_tab,
    remove_tab2,
    non_ascii,
    remove_incomplete_url,
    remove_excessive_dot,
    remove_whitespace_LT,
    remove_whitespace_multiple,
    remove_single_char,
    space_punctuation,
    remove_punctuation,
    remove_space_zulfikar,
    bersih_data,
    remove_number_eks,
    remove_angka,
    remove_URL_eks,
)

# One cleaned string per row of df['Tweet'], in row order.
final_string = []
for text in df['Tweet'].values:
    proc = text
    for step in CLEANING_STEPS:
        proc = step(proc)
    final_string.append(proc)


In [None]:
df["step01"] = final_string

In [None]:
df.head(10)

In [None]:
# Select the rows whose cleaned text contains at least one space.
# Despite the name, these are the rows that df_new is built from.
has_space = df['step01'].str.contains(" ")
df_delete = df[has_space]

In [None]:
# Keep the rows selected in df_delete that have no missing value in any column.
# This yields the same rows as masking df with df.isin(df_delete) and dropping
# NaNs, but without the mask round-trip that upcasts integer columns to float.
# .copy() gives df_new its own data so later column assignments are unambiguous.
df_new = df_delete.dropna().copy()


In [None]:
df_new.info()

In [None]:
import nltk
from nltk.tokenize import word_tokenize
# Download the Punkt tokenizer data used by word_tokenize.
# NOTE(review): newer NLTK releases may require the 'punkt_tab' resource instead;
# confirm against the installed NLTK version.
nltk.download('punkt')

In [None]:
def word_tokenize_wrappper(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` and return the result."""
    tokens = word_tokenize(text)
    return tokens

In [None]:
# Tokenize the cleaned text of the rows that were kept. Reading from df_new
# (instead of df) avoids tokenizing rows that were already filtered out and
# does not rely on index alignment to discard them again.
df_new['tokens'] = df_new['step01'].apply(word_tokenize_wrappper)

In [None]:
df_new.head(10)

In [None]:
# Normalization dictionary: first CSV column -> second CSV column.
normalized_word = pd.read_csv('kamus_pilpres_normalisasi.csv', sep=",")
normalized_word_dict = {}

# Columns are addressed by position with .iloc; integer lookups such as row[0]
# on a label-indexed row are deprecated in recent pandas versions.
# setdefault keeps the first mapping when a term appears more than once.
for raw_term, replacement in zip(normalized_word.iloc[:, 0], normalized_word.iloc[:, 1]):
    normalized_word_dict.setdefault(raw_term, replacement)

def normalized_term(document):
    """Map each term of ``document`` through ``normalized_word_dict``.

    Terms without an entry in the dictionary are returned unchanged. The
    result is a new list with the same length and order as ``document``.
    """
    normalized = []
    for term in document:
        normalized.append(normalized_word_dict.get(term, term))
    return normalized

In [None]:
df_new['final_tokens'] = df_new['tokens'].apply(normalized_term)

In [None]:
# Join each normalized token list back into one space-separated string.
final_string_tokens = [
    ' '.join(tokens) for tokens in df_new['final_tokens'].values
]

In [None]:
df_new['step02'] = final_string_tokens

In [None]:
# NOTE(review): repeats the 'step02' assignment made in the previous cell;
# one of the two cells can be deleted.
df_new["step02"] = final_string_tokens

In [None]:
df_new.head(10)

In [None]:
# Save the normalized frame as a comma-separated file in the working directory.
df_new.to_csv('clean_dataset_01.csv', sep=",")
import os
# Ask the Windows shell ("start") to open the exported file in its default
# application. NOTE(review): Windows-only; on other systems the command fails.
os.system('start "csv" "clean_dataset_01.csv"')


## Stemming


In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Build the Sastrawi stemmer that the stemming loop applies to each 'step02' sentence.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary
from Sastrawi.StopWordRemover.StopWordRemover import StopWordRemover
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

factory = StopWordRemoverFactory()
# Extra informal particles to drop in addition to Sastrawi's built-in list.
stop_words = ['sih', 'nya', 'rt', 'loh', 'lah', 'dd', 'mah', 'nye', 'eh', 'ehh', 'ah', 'yang']
data = factory.get_stop_words()+stop_words
# factory.create_stop_word_remover() only uses the built-in list, so the merged
# list in `data` was never applied. Build the remover from `data` explicitly so
# the custom stop words take effect.
stopwords_sastrawi = StopWordRemover(ArrayDictionary(data))


In [None]:
df_new['step02'] = df_new['step02'].apply(str)

In [None]:
df_new.head()

In [None]:
# For every normalized sentence: remove stop words first, then stem the rest.
# Each result is a single string, in the same order as df_new["step02"].
final_string = [
    stemmer.stem(stopwords_sastrawi.remove(sentence))
    for sentence in df_new["step02"].values
]

In [None]:
# Store the stemmed, stop-word-free sentences as the 'ProcessedText' column.
df_new['ProcessedText'] = final_string

In [None]:
df_new.head()

In [None]:
# Save the fully processed frame.
# NOTE(review): this file uses ';' as separator whereas clean_dataset_01.csv
# was written with ','; confirm the difference is intended.
df_new.to_csv('clean_dataset_02.csv',sep=";")