# Data Preprocessing

- Notebook ini melakukan preprocessing korpus hasil scraping situs Halodoc, termasuk penghapusan stopwords bahasa Indonesia dan stemming dengan Sastrawi.
- Hasil preprocessing digunakan untuk fine-tuning model IndoBERT.

## Import and Initializing

In [20]:
# import
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Load the Indonesian stopword list as a set.
# On a fresh environment the NLTK 'stopwords' corpus is not installed and
# stopwords.words() raises LookupError; download it once and retry so the
# notebook survives "Restart Kernel -> Run All".
try:
    stop_words = set(stopwords.words('indonesian'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('indonesian'))

# Build the Sastrawi stemmer through its factory.
factory = StemmerFactory()
stemmer = factory.create_stemmer()



In [21]:
# Path of the scraped Halodoc articles, relative to the notebook.
csv_path = 'scraping/halodoc1-1000.csv'
df = pd.read_csv(csv_path)

# Report the (rows, columns) size of the loaded frame.
print(df.shape)

# Preview the first rows as the cell's rich output.
df.head()

(9986, 5)


Unnamed: 0,url,title,title_length,content,content_count
0,https://www.halodoc.com/ketahui-segala-hal-men...,Ketahui Segala Hal Mengenai COVID-19,38,“Infeksi COVID-19 adalah salah satu penyakit y...,6532
1,https://www.halodoc.com/bagaimana-cara-merawat...,Bagaimana Cara Merawat Karies Gigi?,37,“Ada banyak pilihan perawatan medis dan rumaha...,5993
2,https://www.halodoc.com/cegah-burnout-ini-tand...,"Cegah Burnout, Ini Tanda Pekerja Butuh Piknik...",61,“Burnout merupakan kondisi ketika seseorang m...,4982
3,https://www.halodoc.com/kenalan-dengan-terapi-...,Kenalan dengan Terapi ABA untuk Anak Autis,44,“Salah satu cara untuk mengurangi gejala auti...,4855
4,https://www.halodoc.com/10-herbal-untuk-mereda...,10 Herbal untuk Meredakan Sakit Perut Bagian ...,52,“Ada banyak herbal yang dapat membantu mereda...,3638


In [22]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

url              0
title            0
title_length     0
content          0
content_count    0
dtype: int64

In [23]:
def count_words(text):
    """Return the number of whitespace-separated words in ``text``."""
    return len(text.split())


# Overwrite 'title_length' and 'content_count' with the word counts of
# 'title' and 'content' respectively.
df['title_length'] = df['title'].apply(count_words)
df['content_count'] = df['content'].apply(count_words)

df.head()

Unnamed: 0,url,title,title_length,content,content_count
0,https://www.halodoc.com/ketahui-segala-hal-men...,Ketahui Segala Hal Mengenai COVID-19,5,“Infeksi COVID-19 adalah salah satu penyakit y...,872
1,https://www.halodoc.com/bagaimana-cara-merawat...,Bagaimana Cara Merawat Karies Gigi?,5,“Ada banyak pilihan perawatan medis dan rumaha...,808
2,https://www.halodoc.com/cegah-burnout-ini-tand...,"Cegah Burnout, Ini Tanda Pekerja Butuh Piknik...",9,“Burnout merupakan kondisi ketika seseorang m...,689
3,https://www.halodoc.com/kenalan-dengan-terapi-...,Kenalan dengan Terapi ABA untuk Anak Autis,7,“Salah satu cara untuk mengurangi gejala auti...,649
4,https://www.halodoc.com/10-herbal-untuk-mereda...,10 Herbal untuk Meredakan Sakit Perut Bagian ...,8,“Ada banyak herbal yang dapat membantu mereda...,512


## Data Preprocessing

In [24]:
# Count the duplicated rows and remember the size before removing them.
duplicate_count = df.duplicated().sum()
shape_before = df.shape
print(duplicate_count)
print(f"Ukuran dataset sebelum preprocessing: {shape_before}")

# Remove the duplicated rows from df itself, then report the new size.
df.drop_duplicates(inplace=True)
shape_after = df.shape
print(f"Ukuran dataset setelah preprocessing: {shape_after}")

5
Ukuran dataset sebelum preprocessing: (9986, 5)
Ukuran dataset setelah preprocessing: (9981, 5)


In [25]:
# # save to csv
# df.to_csv('halodoc1-1000_preprocessed.csv', index=False)

### Remove unnecessary string in 'content'

In [26]:
# tbd
# tbd
# tbd
# tbd
# tbd
# tbd
# tbd
# tbd
# tbd
# tbd
# tbd
# tbd
# tbd
# tbd

### Stopwords Removal, Stemming, etc. (probably won't be used)

In [27]:
# # remove numbers that are not attached to words
# def remove_numbers(text):
#     text = re.sub(r'(?<!\S)\d+(?!\S)', '', text)
#     return text

# df['title'] = df['title'].apply(remove_numbers)
# df['content'] = df['content'].apply(remove_numbers)
# print('hapus angka beridiri sendiri selesai')

# df.head()

In [28]:
# # remove punctuation
# def remove_punctuation(text):
#     text = re.sub(r'[^\w\s]', '', text)
#     return text

# df['title'] = df['title'].apply(remove_punctuation)
# df['content'] = df['content'].apply(remove_punctuation)
# print('hapus tanda baca selesai')

# df.head()

# # remove numbers
# def remove_numbers(text):
#     text = re.sub(r'\d+', '', text)
#     return text

# df['title'] = df['title'].apply(remove_numbers)
# df['content'] = df['content'].apply(remove_numbers)
# print('hapus angka selesai')

# df.head()

# # stopword removal + lowercasing
# def remove_stopwords(text):
#     text = [word.lower() for word in text.split() if word.lower() not in stop_words]
#     return " ".join(text)

# df['title'] = df['title'].apply(remove_stopwords)
# df['content'] = df['content'].apply(remove_stopwords)
# print('stopword removal dan lowercase selesai')

# df.head()

# # stemming
# def stemming(text):
#     text = [stemmer.stem(word) for word in text.split()]
#     return " ".join(text)

# df['title'] = df['title'].apply(stemming)
# df['content'] = df['content'].apply(stemming)
# print('stemming selesai')

# df.head()

In [29]:
# # Use Sentence-BERT to get sentence embeddings (feature extraction)

# # import library
# from sentence_transformers import SentenceTransformer

# # load model SBERT (pretrained)
# model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# # get sentence embeddings
# title_embeddings = model.encode(df['title'].values)
# content_embeddings = model.encode(df['content'].values)

# # check shape
# print(title_embeddings.shape)
# print(content_embeddings.shape)

# # check embeddings
# print(title_embeddings)
# print(content_embeddings)

# # check shape



In [30]:
# # save embeddings
# np.save('title_embeddings.npy', title_embeddings)
# np.save('content_embeddings.npy', content_embeddings)

# # load embeddings
# title_embeddings = np.load('title_embeddings.npy')
# content_embeddings = np.load('content_embeddings.npy')

In [31]:
# # use content and title embeddings as dataset for transfer learning indoBERT (information retrieval)

# # create new dataframe
# df_new = pd.DataFrame()

# # add title and content embeddings to df_new
# df_new['title_embeddings'] = title_embeddings.tolist()
# df_new['content_embeddings'] = content_embeddings.tolist()

# # add title and content to df_new
# df_new['title'] = df['title']
# df_new['content'] = df['content']

# # add title_length and content_length to df_new
# df_new['title_length'] = df['title_length']
# df_new['content_length'] = df['content_count']

# # check df_new
# df_new.head()

In [32]:
# # fine-tuning indoBERT (information retrieval)

# # import library
# import torch
# from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# # check device
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(device)

In [33]:
# # load dataset
# title_embeddings = torch.tensor(np.array(df_new['title_embeddings'].values.tolist()))
# content_embeddings = torch.tensor(np.array(df_new['content_embeddings'].values.tolist()))
# title_length = torch.tensor(df_new['title_length'].values)
# content_length = torch.tensor(df_new['content_length'].values)

# # create TensorDataset
# dataset = TensorDataset(title_embeddings, content_embeddings, title_length, content_length)
# dataset.tensors


In [34]:
# # create DataLoader
# dataloader = DataLoader(dataset, batch_size=32)

# # check dataloader
# next(iter(dataloader))

# # import library for information retrieval
# from transformers import BertForSequenceClassification, AdamW, BertConfig
# from transformers import get_linear_schedule_with_warmup

# # load model
# model = BertForSequenceClassification.from_pretrained(
#     'indobenchmark/indobert-base-p1',
#     num_labels = 2,
#     output_attentions = False,
#     output_hidden_states = False
# )

# # check model
# model


In [35]:
# # check model parameters
# params = list(model.named_parameters())
# print(f'Model BERT memiliki sebanyak {len(params)} parameter yang berbeda.\n')

# print('==== Embedding Layer ====\n')
# for p in params[0:5]:
#     print(f'{p[0]} memiliki ukuran {tuple(p[1].size())} dan requires_grad={p[1].requires_grad}')

# print('\n==== First Transformer ====\n')
# for p in params[5:21]: 
#     print(f'{p[0]} memiliki ukuran {tuple(p[1].size())} dan requires_grad={p[1].requires_grad}')

# print('\n==== Output Layer ====\n')
# for p in params[-4:]:
#     print(f'{p[0]} memiliki ukuran {tuple(p[1].size())} dan requires_grad={p[1].requires_grad}')

## Tokenization

In [36]:
# # tokenization df to list
# title = df['title'].tolist()
# content = df['content'].tolist()

# (title[:5], content[:5])

In [37]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [38]:
# Load pre-trained model & tokenizer
# Both objects are created from the same checkpoint identifier. The first run
# downloads the vocabulary, config and weights (see the progress output of
# this cell).
model_name = 'indobenchmark/indobert-base-p2'
# `tokenizer` is used by the tokenization cell further down.
tokenizer = BertTokenizer.from_pretrained(model_name)
# NOTE(review): `model` is not used in the cells visible here; presumably it
# is intended for embedding extraction later -- confirm.
model = BertModel.from_pretrained(model_name)

Downloading (…)solve/main/vocab.txt: 100%|██████████| 229k/229k [00:00<00:00, 4.58MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 113kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 2.00/2.00 [00:00<?, ?B/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.53k/1.53k [00:00<00:00, 813kB/s]
Downloading pytorch_model.bin: 100%|██████████| 498M/498M [01:23<00:00, 5.99MB/s] 


In [39]:
# Tokenize 'content'
# Every document is tokenized, truncated so that it fits in
# max_sequence_length once the [CLS] and [SEP] markers are added, and appended
# to one flat token list. Document boundaries in `content_tokens` are therefore
# only recoverable through the [CLS]/[SEP] markers.
content_tokens = []
max_sequence_length = 512
# Two positions are reserved for the [CLS] and [SEP] markers.
max_body_tokens = max_sequence_length - 2

for sentence in df['content']:
    # Slicing is a no-op for documents that are already short enough.
    tokens = ['[CLS]', *tokenizer.tokenize(sentence)[:max_body_tokens], '[SEP]']
    content_tokens.extend(tokens)

# save content_tokens, one token per line
# The encoding is set explicitly: without it the platform default is used
# (not UTF-8 on every system), which can fail on non-ASCII tokens and makes
# the file differ between machines.
with open('content_tokens.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{token}\n" for token in content_tokens)