In [1]:
# import library
import numpy as np 
import pandas as pd
import re

from functools import partial

In [2]:
# Load the dataset into a DataFrame.
# NOTE(review): the preview output contains garbled characters, so the file
# may actually be UTF-8 rather than ISO-8859-1 — confirm before changing.
df = pd.read_csv("dataset.csv", encoding="ISO-8859-1")

# Load the slang dictionary (the words that will be normalised).
# Every non-blank line is split at its first tab: the part before the tab is
# the slang word, the part after it is the replacement (empty if no tab).
with open('slang.txt') as file:
    slang_map = {
        slang.strip(): standard.strip()
        for slang, standard in (
            line.partition('\t')[::2] for line in file if line.strip()
        )
    }

df.head()

Unnamed: 0,created_at,screen_name,text,Label
0,5/1/2021 16:48,theoholan,itu mah konflik rebutan tanah,0
1,5/2/2021 1:38,knb_rmdhna,Karena ga ada urgensinya. Ga ada ancaman inva...,0
2,5/2/2021 5:51,_afaz,ini valid ð¯ kemarin saya smpat nonton sala...,0
3,5/2/2021 7:19,PolitikBRIN,Analisis yang dirumuskan dari Klaster Politik...,0
4,5/3/2021 14:12,KoloElang,Bisa tidak? Palestina taat secara kaffah dala...,0


In [3]:
# Discard the created_at and screen_name columns
df = df.drop(columns=['created_at', 'screen_name'])

In [4]:
# Pre-processing of the dataset
# Case folding plus removal of a leading RT / CC marker
def casefoldingRemoveRt(text):
    """Lower-case ``text`` and strip a leading ``RT`` / ``cc`` marker.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The lower-cased text. An upper-case ``RT`` token at the very start
        is removed before case folding, and a ``cc`` token at the very start
        is removed after case folding. Whitespace following the marker is
        left in place.
    """
    # Remove the retweet marker. The whole token "RT" must match: a character
    # class such as [RT]+ would also eat the first letters of ordinary words
    # that merely start with R or T.
    text = re.sub(r'^RT\b', '', text)
    # Case folding (convert to lower case)
    text = text.lower()
    # Remove the carbon-copy marker; again match the token "cc" as a whole so
    # that words starting with the letter c are left intact.
    text = re.sub(r'^cc\b', '', text)
    return text

# Function that removes noise from the text
def removeNoise(text):
    """Strip escape sequences, non-ASCII characters, URLs, hashtags and mentions.

    Args:
        text: The text to clean, as a ``str``.

    Returns:
        The text with every matched span deleted. Surrounding whitespace is
        not collapsed here.
    """
    # Remove literal "\uXXXX" escape sequences. Exactly four hex digits are
    # consumed so that letters a-f directly following an escape survive.
    text = re.sub(r'\\u[0-9A-Fa-f]{4}', '', text)
    # Remove every non-ASCII character (this is what drops emoji)
    text = re.sub(r'[^\x00-\x7f]', '', text)
    # Remove URLs (raw string: "\." and "\S" are regex escapes, not string escapes)
    text = re.sub(r'(?:www\.|https?://)\S+', '', text)
    # Remove hashtags (usually used to mark a topic)
    text = re.sub(r'#\S+', '', text)
    # Remove "@user" mentions
    text = re.sub(r'@\S+', '', text)
    return text

# Replace slang words with their standard form.
# Empty keys are skipped: an empty alternative would let the pattern match at
# every word boundary. Longest words come first so that a longer slang word
# wins over a shorter one that is its prefix.
slang_words = sorted((word for word in slang_map if word), key=len, reverse=True)
if slang_words:
    regex = re.compile(r"\b({})\b".format("|".join(map(re.escape, slang_words))))
else:
    # No usable slang entries: "(?!)" never matches, so text passes through unchanged.
    regex = re.compile(r"(?!)")
# replaceSlang(text) returns text with every matched slang word substituted
# by its entry in slang_map.
replaceSlang = partial(regex.sub, lambda m: slang_map[m.group(1)])

def removePunctuation(text):
    """Reduce ``text`` to space-separated ASCII words of two or more letters.

    Args:
        text: The text to clean, as a ``str``.

    Returns:
        The text with digits and apostrophes deleted, every other run of
        non-letter characters turned into a single space, one-letter tokens
        dropped, and no leading, trailing or repeated whitespace.
    """
    # Remove integers/numbers
    text = ''.join(ch for ch in text if not ch.isdigit())
    # Apostrophes are deleted outright so the surrounding letters stay joined
    text = text.replace("'", '')
    # Replace punctuation (any run of non-letters) with one space
    text = re.sub(r"[^A-Za-z]+", " ", text)
    # Drop one-letter tokens and normalise whitespace in a single pass.
    # Filtering tokens catches every one-letter word, including a lone letter
    # and the last letter of a run such as "x y z".
    return ' '.join(word for word in text.split() if len(word) > 1)

In [5]:
# Run the cleaning steps on the text column, in order:
# case folding -> noise removal -> slang replacement -> punctuation removal
df['text'] = (
    df['text']
    .apply(casefoldingRemoveRt)
    .apply(removeNoise)
    .apply(replaceSlang)
    .apply(removePunctuation)
)

In [6]:
# Show the first five rows of df
df.head(n=5)

Unnamed: 0,text,Label
0,itu mah konflik rebutan tanah,0
1,karena tidak ada urgensinya tidak ada ancaman ...,0
2,ini sah kemarin saya smpat nonton salah satu w...,0
3,analisis yang dirumuskan dari klaster politik ...,0
4,bisa tidak palestina taat secara kaffah dalam ...,0


In [8]:
# Remove rows without text: empty strings are first turned into NaN
# (in every column) so that dropna() on 'text' catches them too.
nan_value = float('NaN')
df.replace(to_replace='', value=nan_value, inplace=True)
df.dropna(inplace=True, subset=['text'])

In [11]:
# Remove rows whose text duplicates an earlier row (the first occurrence is kept)
df.drop_duplicates(subset=['text'], keep='first', inplace=True)

In [12]:
# Write df to a UTF-8 CSV file without the index column
df.to_csv('clean1.csv', index=False, encoding='utf-8')