<a href="https://colab.research.google.com/github/nauraift/TextPreprocessing/blob/main/Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install Sastrawi

In [None]:
pip install swifter

In [None]:
import nltk
# Resources used later in the notebook: the stopword corpus and the Punkt
# tokenizer data that word_tokenize depends on.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load Punkt from the 'punkt_tab' package instead of the
# pickled 'punkt' one; fetch both so word_tokenize works on old and new
# versions (an unknown package id only reports an error, it does not raise).
nltk.download('punkt_tab')

# **PREPROCESSING**

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset is read from a path under it.
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import tweepy as tweet
import string
import re 
import nltk
import matplotlib.pyplot as plt

# Load the tweet dataset from the mounted Google Drive folder.
TWEET_DATA = pd.read_csv("/content/drive/MyDrive/Skripsi/data_penelitian/copy_of_data.csv")

# The cleansing functions call str methods on every value, so guarantee the
# column holds only strings: missing tweets (parsed as NaN) become ''.
# (The previous line here assigned the column to itself, which did nothing.)
TWEET_DATA['tweet'] = TWEET_DATA['tweet'].fillna('').astype(str)

print(TWEET_DATA['tweet'].head(10))

### **Cleansing**

In [None]:
def remove_tweet_special(text):
    r"""Strip Twitter-specific noise from one raw tweet.

    Steps, in order:
      1. Replace the literal two-character sequences ``\t`` and ``\n``, the
         sequence ``u\`` and any remaining backslash with a space.
      2. Replace every non-ASCII character (emoji, CJK text, ...) with ``?``.
      3. Replace mentions, hashtags and ``scheme://...`` links with a space,
         then collapse the text to single-space-separated words.
      4. Replace leftover bare ``http://`` / ``https://`` prefixes with a space.

    Args:
        text: Raw tweet text (``str``).

    Returns:
        The cleaned tweet as a ``str``.
    """
    # Escaped tab / newline markers, "u\" and stray backslashes -> space.
    for literal in ('\\t', '\\n', 'u\\', '\\'):
        text = text.replace(literal, " ")
    # Non-ASCII characters are substituted with '?', not dropped.
    text = text.encode('ascii', 'replace').decode('ascii')
    # Handles and hashtags may contain underscores (e.g. @nama_user), so match
    # \w instead of only letters and digits; otherwise "_user" is left behind.
    # Raw string avoids invalid-escape warnings for \w and \S.
    text = re.sub(r"([@#]\w+)|(\w+://\S+)", " ", text)
    text = ' '.join(text.split())
    # A URL cut off right after its scheme gives \S+ nothing to match above.
    return text.replace("http://", " ").replace("https://", " ")
                
# Run the tweet-specific cleanup on every row, overwriting the 'tweet' column.
TWEET_DATA['tweet'] = TWEET_DATA['tweet'].apply(remove_tweet_special)

# remove numbers
def remove_number(text):
    """Return ``text`` with every run of digits deleted."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

# Strip digits from every row, overwriting the 'tweet' column.
TWEET_DATA['tweet'] = TWEET_DATA['tweet'].apply(remove_number)

# remove punctuation marks
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A code point mapped to None is dropped by str.translate.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)

# Delete punctuation characters in every row, overwriting the 'tweet' column.
TWEET_DATA['tweet'] = TWEET_DATA['tweet'].apply(remove_punctuation)

# remove leading & trailing whitespace
def remove_whitespace_LT(text):
    """Return ``text`` without leading or trailing whitespace."""
    without_leading = text.lstrip()
    return without_leading.rstrip()

# Trim both ends of every row, overwriting the 'tweet' column.
TWEET_DATA['tweet'] = TWEET_DATA['tweet'].apply(remove_whitespace_LT)

# collapse multiple whitespace into a single space
def remove_whitespace_multiple(text):
    """Replace every run of whitespace in ``text`` with one space.

    Args:
        text: Input string.

    Returns:
        The string with each whitespace run (spaces, tabs, newlines) replaced
        by a single ' '. Leading/trailing runs also become one space each.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence,
    # which newer Python versions warn about.
    return re.sub(r'\s+', ' ', text)

# Collapse whitespace runs in every row, overwriting the 'tweet' column.
TWEET_DATA['tweet'] = TWEET_DATA['tweet'].apply(remove_whitespace_multiple)

# remove single characters
def remove_singl_char(text):
    """Delete every ASCII letter that stands alone between word boundaries.

    Only the letter itself is removed; the whitespace around it is kept.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

# Dropping single letters keeps the spaces that surrounded them, which brings
# back double and trailing spaces; normalise whitespace once more so the
# stored cleansing result is actually clean.
TWEET_DATA['cleansing'] = (
    TWEET_DATA['tweet']
    .apply(remove_singl_char)
    .apply(remove_whitespace_multiple)
    .apply(remove_whitespace_LT)
)

# Preview the first rows of the 'cleansing' column.
print('Cleansing Result : \n') 
print(TWEET_DATA['cleansing'].head())

### **Case Folding**

In [None]:
# Lower-case the cleansed text into a new 'casefold' column.
TWEET_DATA['casefold'] = TWEET_DATA['cleansing'].str.lower()

# Preview the first five rows of the result.
print('Case Folding Result : \n')
print(TWEET_DATA['casefold'].head(5))

### **Tokenizing**

In [None]:
from nltk.tokenize import word_tokenize

# NLTK word tokenizer
def word_tokenize_wrapper(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` using its default arguments."""
    tokens = word_tokenize(text)
    return tokens

# Tokenize the lower-cased text of every row into a new 'tokens' column.
TWEET_DATA['tokens'] = TWEET_DATA['casefold'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n') 
# Last expression of the cell: displays the first rows of the token column.
TWEET_DATA['tokens'].head()


### **Document Frequency**

In [None]:
def calc_DF(tfDict):
    """Count, for every term, the number of documents that contain it.

    Args:
        tfDict: Iterable of documents. Each document is an iterable of terms,
            e.g. a list of tokens or a dict keyed by term.

    Returns:
        dict mapping each term to its document frequency, with keys in the
        order the terms were first seen.
    """
    count_DF = {}
    for document in tfDict:
        # De-duplicate (keeping order) so a term repeated inside one document
        # is counted once for that document rather than once per occurrence.
        for term in dict.fromkeys(document):
            count_DF[term] = count_DF.get(term, 0) + 1
    return count_DF

# Run calc_DF over the token lists; `df` is a plain dict (term -> count) here.
df = calc_DF(TWEET_DATA["tokens"])
df


In [None]:
# Build the table straight from calc_DF rather than from the `df` name, which
# this cell rebinds from a dict to a DataFrame: reading `df` here made the
# cell fail when it was executed a second time.
df = pd.DataFrame(
    list(calc_DF(TWEET_DATA["tokens"]).items()),
    columns=['word', 'frekuensi'],
)
# Ascending by count; a stable sort keeps tied words in first-seen order.
df = df.sort_values('frekuensi', kind='mergesort', ignore_index=True)

df.to_csv("documentfrequency.csv")
# Last expression of the cell so the table is actually displayed.
df

### **Stopword Removal**

In [None]:
from nltk.corpus import stopwords

# Get the Indonesian stopword list from the NLTK stopwords corpus

list_stopwords = stopwords.words('indonesian')
print(list_stopwords)

In [None]:
# Extra stopwords from a local text file, read as a single headerless column
# (presumably one word per line -- confirm against the file).
txt_stopword = pd.read_csv(
    "stopwordlist_bismillah.txt",
    names=["stopwordlist_bismillah"],
    header=None,
)

# Combine with set operations instead of list.extend(): once this cell has run,
# `list_stopwords` is a set, and set has no extend(), so the cell used to
# crash when executed again. dropna() keeps NaN placeholders out of the set.
list_stopwords = (
    set(list_stopwords)
    | {"gue", "loe"}
    | set(txt_stopword["stopwordlist_bismillah"].dropna())
)

list_stopwords


In [None]:
# remove stopwords from a token list
def stopwords_removal(text):
    """Return the items of ``text`` that are not in ``list_stopwords``, in order.

    ``text`` is iterated item by item; the notebook passes a list of tokens.
    """
    kept = []
    for token in text:
        if token in list_stopwords:
            continue
        kept.append(token)
    return kept

# Filter stopwords out of every token list into a new 'stopword' column.
TWEET_DATA['stopword'] = TWEET_DATA['tokens'].apply(stopwords_removal) 

# Last expression of the cell: displays the filtered column.
TWEET_DATA['stopword']

### **Stemming**

In [None]:
import swifter
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Create a single Sastrawi stemmer instance; stemmed_wrapper calls it.
factory = StemmerFactory()
stemmer = factory.create_stemmer()
# stem a single term
def stemmed_wrapper(term):
    """Return ``stemmer.stem(term)`` using the module-level ``stemmer``."""
    stem = stemmer.stem
    return stem(term)

# Collect every distinct term once (first-seen order) with a ' ' placeholder,
# so each term is stemmed a single time however often it occurs.
term_dict = dict.fromkeys(
    (term for document in TWEET_DATA['stopword'] for term in document),
    ' ',
)
print(len(term_dict))
print("------------------------")

# Swap each placeholder for the term's stem, printing every pair as it goes.
for term in term_dict:
    stemmed = stemmed_wrapper(term)
    term_dict[term] = stemmed
    print(term, ":", stemmed)
print(term_dict)
print("------------------------")

# map every term of a document to its stem
def get_stemmed_term(document):
    """Look up each term of ``document`` in ``term_dict`` and return the stems as a list.

    Raises ``KeyError`` for a term that is not a key of ``term_dict``.
    """
    return list(map(term_dict.__getitem__, document))

# Replace every term by its stem via swifter's apply accessor, into 'stemmed'.
TWEET_DATA['stemmed'] = TWEET_DATA['stopword'].swifter.apply(get_stemmed_term)
# Last expression of the cell: displays the stemmed column.
TWEET_DATA['stemmed']