In [3]:
import re
import nltk
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Load the Indonesian stopwords
stopwords = nltk.corpus.stopwords.words('indonesian')

# Load the Porter stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Define a function to preprocess the text
def preprocess_text(text):
    # Convert the text to lowercase
    text = text.lower()
    
     # Remove URLs
    text = re.sub(r'http\S+', '', str(text))
    
    # Remove usernames
    text = re.sub(r'@\w+', '', str(text))
    
    # Remove hashtags
    text = re.sub(r'#\w+', '', str(text))
    
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', str(text))
    
    # Remove emoticons
    text = re.sub(r':\)|;\)|:-\)|\(-:|:\'\(|:\(|:-\(|:O|8\)|:\*|>:O|:\-\/|\^\^|:D|:P|:S|:\|', '', str(text))
    
    #Remove number
    text = re.sub(r'\d+', '', str(text))
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', str(text)).strip()
    
    # Remove stopwords
    text = [word for word in text.split() if word not in stopwords]

    # Stem the words
    text = [stemmer.stem(word) for word in text]
    
    # Detokenize the tokens to form text
    text = ' '.join(text)
    
    return text

### Test data cleaning

In [4]:
# Load the CSV dataset
df = pd.read_csv('test.csv')

# Create a new column for the preprocessed text
df['preprocessed_text'] = ''

# Loop through the rows of the dataset
for i in range(len(df)):
  # Preprocess the text in the current row
  df['preprocessed_text'][i] = preprocess_text(df['text'][i])

df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['preprocessed_text'][i] = preprocess_text(df['text'][i])


Unnamed: 0,text,label,preprocessed_text
0,titisan air mata kini kekeringan,0,titis air mata kering
1,tni polri serta satpol pp dki menyambangi loka...,0,tni polri satpol pp dki sambang lokasi pagi pa...
2,"Gempa Mag:5.2, 31-Jan-21 16:28:55 WIB, Lok:5....",1,gempa mag jan wib lok ls bt km baratlaut kepar...
3,berbagai klaster penyebaran covid 19 kita moni...,0,klaster sebar covid monitor ketat kerja migran...
4,saya tetap optimistis perekonomian indonesia a...,0,optimistis ekonomi indonesia kuartal iii kerja...
...,...,...,...
1810,Volume kubah lava di sektor barat daya sebesar...,1,volume kubah lava sektor barat daya m laju tum...
1811,kelar wfh udah minta ganti tuh laptop,0,kelar wfh udah ganti tuh laptop
1812,waktu gempa di bali baru baru ini sebelumnya a...,0,gempa bal jg ngerasa kepala tuh muncul gempa d...
1813,Angin Puting Beliung Rusak Puluhan Rumah di Ma...,1,angin puting beliung rusak puluh rumah makassar


In [5]:
df = df.drop_duplicates()
df

Unnamed: 0,text,label,preprocessed_text
0,titisan air mata kini kekeringan,0,titis air mata kering
1,tni polri serta satpol pp dki menyambangi loka...,0,tni polri satpol pp dki sambang lokasi pagi pa...
2,"Gempa Mag:5.2, 31-Jan-21 16:28:55 WIB, Lok:5....",1,gempa mag jan wib lok ls bt km baratlaut kepar...
3,berbagai klaster penyebaran covid 19 kita moni...,0,klaster sebar covid monitor ketat kerja migran...
4,saya tetap optimistis perekonomian indonesia a...,0,optimistis ekonomi indonesia kuartal iii kerja...
...,...,...,...
1810,Volume kubah lava di sektor barat daya sebesar...,1,volume kubah lava sektor barat daya m laju tum...
1811,kelar wfh udah minta ganti tuh laptop,0,kelar wfh udah ganti tuh laptop
1812,waktu gempa di bali baru baru ini sebelumnya a...,0,gempa bal jg ngerasa kepala tuh muncul gempa d...
1813,Angin Puting Beliung Rusak Puluhan Rumah di Ma...,1,angin puting beliung rusak puluh rumah makassar


In [6]:
# Save the updated dataset
df.to_csv('preprocessed_test.csv', index=False)

### Train data cleaning

In [7]:
# Load the CSV dataset
df = pd.read_csv('train.csv')

# Create a new column for the preprocessed text
df['preprocessed_text'] = ''

# Loop through the rows of the dataset
for i in range(len(df)):
  # Preprocess the text in the current row
  df['preprocessed_text'][i] = preprocess_text(df['text'][i])

df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['preprocessed_text'][i] = preprocess_text(df['text'][i])


Unnamed: 0,text,label,preprocessed_text
0,"Gempa Mag:4.8, 05/01/2021 18:56:21 (Pusat gem...",1,gempa mag pusat gempa laut km barat daya kabku...
1,idha nama pemilik perusahaan yg bikin banjir a...,0,idha nama milik usaha yg bikin banjir video se...
2,web kritik film danieldokter tariz aryaapepe r...,0,web kritik film danieldokter tariz aryaapepe r...
3,penjelasan fpi ganti nama jadi front persatuan...,0,jelas fpi ganti nama front satu islam
4,"Gempa Mag:5.0, 26-Oct-20 15:26:37 WIB, Lok:0....",1,gempa mag oct wib lok lu bt km tenggara buolsu...
...,...,...,...
4229,"Gempa Mag:2.3, 04-Mei-21 00:57:53 WIB, Lok:3....",1,gempa mag mei wib lok ls bt pusat gempa darat ...
4230,pohon itu adalah penyeimbang alam jika pohon i...,0,pohon imbang alam pohon tebang banjir longsor ...
4231,"10. Tanggal 26 Januari 2021 pukul 23:14 WIB, a...",1,tanggal januari wib awanpanas gugur catat seis...
4232,"Gempa Mag:4.3, 27-Apr-21 17:28:58 WIB, Lok:1....",1,gempa mag apr wib lok ls bt pusat gempa laut k...


In [8]:
# Save the updated dataset
df.to_csv('preprocessed_train.csv', index=False)

### Weaklabelled data cleaning

In [9]:
# Load the CSV dataset
df = pd.read_csv('autolabelled_dataset_v.csv')

# Create a new column for the preprocessed text
df['preprocessed_text'] = ''

# Loop through the rows of the dataset
for i in range(len(df)):
  # Preprocess the text in the current row
  df['preprocessed_text'][i] = preprocess_text(df['text'][i])

df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['preprocessed_text'][i] = preprocess_text(df['text'][i])


Unnamed: 0,text,label,preprocessed_text
0,Bener2 ini jumat berkah dari malem ketemu male...,0,bener jumat berkah malem ketemu malem ujansemo...
1,"Bencana geologi : gempabumi, tsunami, letusan ...",0,bencana geologi gempabumi tsunami letus gunung...
2,Liverpool Tsunami cidera ini namanya,0,liverpool tsunami cidera nama
3,Kemaren pas nerjang banjir kira kira 3/4 ban m...,0,kemaren pas nerjang banjir ban motor liatin or...
4,Gempa...?,1,gempa
...,...,...,...
26210,pemda depok pas bikin berita angin puting beli...,1,pemda depok pas bikin berita angin puting beli...
26211,puting beliung apa pitung beliung?,0,puting beliung pitung beliung
26212,alias bgsttt ati lu dari apaan mba😭 kuat bgt t...,0,alias bgsttt ati lu mba kuat bgt tahan api air...
26213,Non stop gempa dari selasa. Semalem ga bisa td...,0,non stop gempa selasa semalem ga tdur nyenyak ...


In [10]:
# Save the updated dataset
df.to_csv('preprocessed_autolabelled_dataset.csv', index=False)

## Re run and check

In [11]:
adf = pd.read_csv('preprocessed_train.csv')
bdf = pd.read_csv('preprocessed_test.csv')
cdf = pd.read_csv('preprocessed_autolabelled_dataset.csv')

In [12]:
adf.head()

Unnamed: 0,text,label,preprocessed_text
0,"Gempa Mag:4.8, 05/01/2021 18:56:21 (Pusat gem...",1,gempa mag pusat gempa laut km barat daya kabku...
1,idha nama pemilik perusahaan yg bikin banjir a...,0,idha nama milik usaha yg bikin banjir video se...
2,web kritik film danieldokter tariz aryaapepe r...,0,web kritik film danieldokter tariz aryaapepe r...
3,penjelasan fpi ganti nama jadi front persatuan...,0,jelas fpi ganti nama front satu islam
4,"Gempa Mag:5.0, 26-Oct-20 15:26:37 WIB, Lok:0....",1,gempa mag oct wib lok lu bt km tenggara buolsu...


In [13]:
adf.isnull().sum()

text                 0
label                0
preprocessed_text    3
dtype: int64

In [14]:
bdf.isnull().sum()

text                 0
label                0
preprocessed_text    3
dtype: int64

In [15]:
cdf.isnull().sum()

text                 0
label                0
preprocessed_text    4
dtype: int64

In [16]:
adf = adf.dropna()

In [17]:
bdf = bdf.dropna()

In [18]:
cdf = cdf.dropna()

In [19]:
adf = adf.drop('text', axis=1)
adf = adf.rename(columns={'preprocessed_text': 'text'})
adf

Unnamed: 0,label,text
0,1,gempa mag pusat gempa laut km barat daya kabku...
1,0,idha nama milik usaha yg bikin banjir video se...
2,0,web kritik film danieldokter tariz aryaapepe r...
3,0,jelas fpi ganti nama front satu islam
4,1,gempa mag oct wib lok lu bt km tenggara buolsu...
...,...,...
4229,1,gempa mag mei wib lok ls bt pusat gempa darat ...
4230,0,pohon imbang alam pohon tebang banjir longsor ...
4231,1,tanggal januari wib awanpanas gugur catat seis...
4232,1,gempa mag apr wib lok ls bt pusat gempa laut k...


In [20]:
bdf = bdf.drop('text', axis=1)
bdf = bdf.rename(columns={'preprocessed_text': 'text'})
bdf

Unnamed: 0,label,text
0,0,titis air mata kering
1,0,tni polri satpol pp dki sambang lokasi pagi pa...
2,1,gempa mag jan wib lok ls bt km baratlaut kepar...
3,0,klaster sebar covid monitor ketat kerja migran...
4,0,optimistis ekonomi indonesia kuartal iii kerja...
...,...,...
1801,1,volume kubah lava sektor barat daya m laju tum...
1802,0,kelar wfh udah ganti tuh laptop
1803,0,gempa bal jg ngerasa kepala tuh muncul gempa d...
1804,1,angin puting beliung rusak puluh rumah makassar


In [21]:
cdf = cdf.drop('text', axis=1)
cdf = cdf.rename(columns={'preprocessed_text': 'text'})
cdf

Unnamed: 0,label,text
0,0,bener jumat berkah malem ketemu malem ujansemo...
1,0,bencana geologi gempabumi tsunami letus gunung...
2,0,liverpool tsunami cidera nama
3,0,kemaren pas nerjang banjir ban motor liatin or...
4,1,gempa
...,...,...
26210,1,pemda depok pas bikin berita angin puting beli...
26211,0,puting beliung pitung beliung
26212,0,alias bgsttt ati lu mba kuat bgt tahan api air...
26213,0,non stop gempa selasa semalem ga tdur nyenyak ...


In [22]:
adf.to_csv('preprocessed_train.csv', index=False)
bdf.to_csv('preprocessed_test.csv', index=False)
cdf.to_csv('preprocessed_autolabelled_dataset.csv', index=False)