In [19]:
pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [50]:
import pandas as pd

# Read the CSV file with a different encoding
df = pd.read_csv('re_dataset.csv', encoding='ISO-8859-1')

# Save the DataFrame with UTF-8 encoding
df.to_csv('re_dataset_utf8.csv', index=False, encoding='utf-8')


In [51]:
from datasets import load_dataset

# Define the path to the UTF-8 encoded CSV file
path = 're_dataset_utf8.csv'

# Load the dataset from the UTF-8 encoded CSV file
dataset = load_dataset('csv', data_files={'train': path})

# Display the dataset
print(dataset['train'].to_pandas())


Generating train split: 0 examples [00:00, ? examples/s]

                                                   Tweet  HS  Abusive  \
0      - disaat semua cowok berusaha melacak perhatia...   1        1   
1      RT USER: USER siapa yang telat ngasih tau elu?...   0        1   
2      41. Kadang aku berfikir, kenapa aku tetap perc...   0        0   
3      USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...   0        0   
4      USER USER Kaum cebong kapir udah keliatan dong...   1        1   
...                                                  ...  ..      ...   
13164  USER jangan asal ngomong ndasmu. congor lu yg ...   1        1   
13165                       USER Kasur mana enak kunyuk'   0        1   
13166  USER Hati hati bisu :( .g\n\nlagi bosan huft \...   0        0   
13167  USER USER USER USER Bom yang real mudah terdet...   0        0   
13168  USER Mana situ ngasih(": itu cuma foto ya kuti...   1        1   

       HS_Individual  HS_Group  HS_Religion  HS_Race  HS_Physical  HS_Gender  \
0                  1         0            0

#### Case Folding

In [53]:
# ------ Case Folding --------
# Perform case folding by converting all text columns to lowercase
text_cols = df.select_dtypes(include=['object']).columns
df[text_cols] = df[text_cols].apply(lambda x: x.str.lower())

print(df.head())

                                               Tweet  HS  Abusive  \
0  - disaat semua cowok berusaha melacak perhatia...   1        1   
1  rt user: user siapa yang telat ngasih tau elu?...   0        1   
2  41. kadang aku berfikir, kenapa aku tetap perc...   0        0   
3  user user aku itu aku\n\nku tau matamu sipit t...   0        0   
4  user user kaum cebong kapir udah keliatan dong...   1        1   

   HS_Individual  HS_Group  HS_Religion  HS_Race  HS_Physical  HS_Gender  \
0              1         0            0        0            0          0   
1              0         0            0        0            0          0   
2              0         0            0        0            0          0   
3              0         0            0        0            0          0   
4              0         1            1        0            0          0   

   HS_Other  HS_Weak  HS_Moderate  HS_Strong  
0         1        1            0          0  
1         0        0            0 

#### Tokenizing

In [54]:
import string 
import re #regex library

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

# ------ Tokenizing ---------

# Define preprocessing functions
def remove_tweet_special(text):
    text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    text = text.encode('ascii', 'replace').decode('ascii')
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    return text.replace("http://", " ").replace("https://", " ")

def remove_number(text):
    return  re.sub(r"\d+", "", text)

def remove_punctuation(text):
    return text.translate(str.maketrans("","",string.punctuation))

def remove_whitespace_LT(text):
    return text.strip()

def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

def remove_singl_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

def word_tokenize_wrapper(text):
    return word_tokenize(text)

# Assuming the text column is the first column in your dataframe
text_column = df.columns[0]

# Apply preprocessing functions
df[text_column] = df[text_column].apply(remove_tweet_special)
df[text_column] = df[text_column].apply(remove_number)
df[text_column] = df[text_column].apply(remove_punctuation)
df[text_column] = df[text_column].apply(remove_whitespace_LT)
df[text_column] = df[text_column].apply(remove_whitespace_multiple)
df[text_column] = df[text_column].apply(remove_singl_char)
df['tokens'] = df[text_column].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n') 
print(df['tokens'].head())

Tokenizing Result : 

0    [disaat, semua, cowok, berusaha, melacak, perh...
1    [rt, user, user, siapa, yang, telat, ngasih, t...
2    [kadang, aku, berfikir, kenapa, aku, tetap, pe...
3    [user, user, aku, itu, aku, ku, tau, matamu, s...
4    [user, user, kaum, cebong, kapir, udah, keliat...
Name: tokens, dtype: object


In [55]:
# NLTK calc frequency distribution
def freqDist_wrapper(text):
    return FreqDist(text)

df['tokens_fdist'] = df['tokens'].apply(freqDist_wrapper)

print('Frequency Tokens : \n') 
print(df['tokens_fdist'].head().apply(lambda x : x.most_common()))

Frequency Tokens : 

0    [(cowok, 2), (perhatian, 2), (gue, 2), (elo, 2...
1    [(user, 2), (siapa, 2), (rt, 1), (yang, 1), (t...
2    [(aku, 6), (ketika, 3), (kadang, 2), (tuhan, 2...
3    [(aku, 3), (user, 2), (itu, 2), (ku, 1), (tau,...
4    [(user, 2), (kaum, 1), (cebong, 1), (kapir, 1)...
Name: tokens_fdist, dtype: object


#### Stopwords removal

In [56]:
from nltk.corpus import stopwords

# ----------------------- get stopword from NLTK stopword -------------------------------
# get stopword indonesia
list_stopwords = stopwords.words('indonesian')


# ---------------------------- manualy add stopword  ------------------------------------
# append additional stopword
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah'])

# ----------------------- add stopword from txt file ------------------------------------
# read txt stopword using pandas
# txt_stopword = pd.read_csv("stopwords.txt", names= ["stopwords"], header = None)

# # convert stopword string to list & append additional stopword
# list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))

# ---------------------------------------------------------------------------------------

# convert list to dictionary
list_stopwords = set(list_stopwords)


#remove stopword pada list token
def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]

df['tokens_fdist'] = df['tokens'].apply(stopwords_removal) 


print(df['tokens_fdist'].head())

0    [disaat, cowok, berusaha, melacak, perhatian, ...
1    [user, user, telat, ngasih, eluedan, sarap, gu...
2    [kadang, berfikir, percaya, tuhan, jatuh, berk...
3              [user, user, ku, matamu, sipit, diliat]
4    [user, user, kaum, cebong, kapir, udah, keliat...
Name: tokens_fdist, dtype: object


#### Normalization

In [57]:
normalizad_word = pd.read_csv("normalisasi.csv")

normalizad_word_dict = {}

for index, row in normalizad_word.iterrows():
    if row[0] not in normalizad_word_dict:
        normalizad_word_dict[row[0]] = row[1] 

def normalized_term(document):
    return [normalizad_word_dict[term] if term in normalizad_word_dict else term for term in document]

df['normalized'] = df['tokens'].apply(normalized_term)

print(df['normalized'].head(10))

0    [disaat, semua, cowok, berusaha, melacak, perh...
1    [rt, user, user, siapa, yang, telat, ngasih, t...
2    [kadang, saya, berpikir , kenapa, saya, tetap,...
3    [user, user, saya, itu, saya, ku, tahu, matamu...
4    [user, user, kaum, cebong, kapir, sudah, kelia...
5    [user, ya, bani, taplak, dkk, xfxfxxxfxfxxxfxfxx]
6    [deklarasi, pilkada, aman, dan, anti, hoax, wa...
7    [saya, baru, saja, kelar, rewatch, aldnoah, ze...
8    [nah, admin, belanja, satu, lagi, port, terbai...
9            [user, enak, Sedang, kalau, smbil, ngewe]
Name: normalized, dtype: object


  if row[0] not in normalizad_word_dict:
  normalizad_word_dict[row[0]] = row[1]


#### Stemmer

In [59]:
pip install Sastrawi swifter

Defaulting to user installation because normal site-packages is not writeable
Collecting Sastrawi
  Obtaining dependency information for Sastrawi from https://files.pythonhosted.org/packages/6f/4b/bab676953da3103003730b8fcdfadbdd20f333d4add10af949dd5c51e6ed/Sastrawi-1.0.1-py2.py3-none-any.whl.metadata
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Collecting swifter
  Downloading swifter-1.4.0.tar.gz (1.2 MB)
     ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
     - -------------------------------------- 0.0/1.2 MB 1.4 MB/s eta 0:00:01
     --- ------------------------------------ 0.1/1.2 MB 2.2 MB/s eta 0:00:01
     ------------- -------------------------- 0.4/1.2 MB 3.7 MB/s eta 0:00:01
     ----------------------- ---------------- 0.7/1.2 MB 4.5 MB/s eta 0:00:01
     --------------------------------- ------ 1.0/1.2 MB 4.8 MB/s eta 0:00:01
     ---------------------------------------  1.2/1.2 MB 5.0 MB/s eta 0:00:01
     -------------------

In [60]:
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter


# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    return stemmer.stem(term)

term_dict = {}

for document in df['normalized']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))
print("------------------------")

for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term,":" ,term_dict[term])
    
print(term_dict)
print("------------------------")


# apply stemmed term to dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

df['tokens_stemmed'] = df['normalized'].swifter.apply(get_stemmed_term)
print(df['tokens_stemmed'])

28246
------------------------
disaat : saat
semua : semua
cowok : cowok
berusaha : usaha
melacak : lacak
perhatian : perhati
saya : saya
loe : loe
lantas : lantas
remehkan : remeh
yang : yang
kasih : kasih
khusus : khusus
ke : ke
elo : elo
basic : basic
bego : bego
rt : rt
user : user
siapa : siapa
telat : telat
ngasih : ngasih
tahu : tahu
eluedan : eluedan
sarap : sarap
bergaul : gaul
dengan : dengan
cigax : cigax
jifla : jifla
calis : cal
sama : sama
noh : noh
licew : licew
juga : juga
kadang : kadang
berpikir  : pikir
kenapa : kenapa
tetap : tetap
percaya : percaya
pada : pada
tuhan : tuhan
padahal : padahal
selalu : selalu
jatuh : jatuh
berkalikali : berkalikali
merasa : rasa
itu : itu
ninggalkan : ninggalkan
sendirian : sendiri
ketika : ketika
orangtuaku : orangtuaku
berencana : rencana
berpisah : pisah
kakakku : kakak
lebih : lebih
memilih : pilih
jadi : jadi
kristen : kristen
anak : anak
ter : ter
ku : ku
matamu : mata
sipit : sipit
tetapi : tetapi
diliat : liat
dari : dari
man

Pandas Apply:   0%|          | 0/13169 [00:00<?, ?it/s]

0        [saat, semua, cowok, usaha, lacak, perhati, sa...
1        [rt, user, user, siapa, yang, telat, ngasih, t...
2        [kadang, saya, pikir, kenapa, saya, tetap, per...
3        [user, user, saya, itu, saya, ku, tahu, mata, ...
4        [user, user, kaum, cebong, kapir, sudah, liat,...
                               ...                        
13164    [user, jangan, asal, ngomong, ndasmu, congor, ...
13165                    [user, kasur, mana, enak, kunyuk]
13166    [user, hati, hati, bisu, lagi, bosan, huft, xf...
13167    [user, user, user, user, bom, yang, real, muda...
13168    [user, mana, situ, ngasih, itu, cuma, foto, ya...
Name: tokens_stemmed, Length: 13169, dtype: object


In [61]:
df.to_csv("Text_Preprocessing_id_multilabel.csv")