In [19]:
import numpy as np
import pandas as pd

In [20]:
# Relative directory prefix used for the input CSV (label.csv) and for the
# train/val/test CSVs written at the end of the notebook.
data_path = 'data/'

### Text preprocessing

In [31]:
from pythainlp.util import normalize
import re
# Matches http(s):// URLs and bare "www." URLs, up to the next whitespace.
urlR = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
# Matches a whole hashtag: '#', then a Thai or ASCII letter, then any further
# Thai characters, ASCII letters, digits or underscores. The previous pattern
# matched only '#' plus ONE character, so "#tag" was turned into "ag".
hashtagR = r"#[\u0E00-\u0E7Fa-zA-Z][\u0E00-\u0E7Fa-zA-Z0-9_]*"
# Characters deleted from the text. Raw string so the backslash is a literal
# backslash instead of the invalid escape sequence "\^" (same resulting value).
symbols = r"#$%'*./:;<=>@\^_`|~"
def text_preprocessing(text):
    """Clean one raw text string before it is used for modelling.

    Steps, in order:
      1. replace every URL with a single space,
      2. remove every hashtag (the '#' and the tag text that follows it),
      3. delete every character listed in ``symbols``,
      4. pass the result through pythainlp's ``normalize``,
      5. strip leading and trailing whitespace.

    Args:
        text: the raw input string.

    Returns:
        The cleaned string. Whitespace inside the text is not collapsed, so a
        removed URL or hashtag can leave consecutive spaces behind.
    """
    text = re.sub(urlR, ' ', text)
    text = re.sub(hashtagR, '', text)
    # Pure deletion table: nothing is remapped, only `symbols` are dropped.
    text = text.translate(str.maketrans('', '', symbols))
    text = normalize(text)
    return text.strip()

In [33]:
from sklearn.model_selection import train_test_split

# Read the labelled corpus and keep only the first row for each distinct text.
deduplicated = pd.read_csv(data_path + 'label.csv').drop_duplicates(subset=['text'])
# Discard every row whose label is 2, then show how many rows each remaining
# label has.
rows_with_label_2 = deduplicated.index[deduplicated['label'] == 2]
data = deduplicated.drop(index=rows_with_label_2)
data['label'].value_counts()

1.0    12449
0.0     7923
Name: label, dtype: int64

In [34]:
# Run every text through text_preprocessing and store the cleaned strings back
# in the 'text' column.
cleaned_text = data['text'].apply(text_preprocessing)
data['text'] = cleaned_text

### Sample data

In [38]:
# Label 1 rows are treated as polite, label 0 rows as impolite.
polite_data = data[data['label']== 1]
impolite_data = data[data['label']== 0]

# Downsample the polite rows to the size of the impolite set so both classes
# contribute the same number of rows. The seed is fixed (same value as the
# random_state used for the train/val/test split) so the sampled subset, and
# therefore the exported CSVs, are identical on every run.
sample_polite = polite_data.sample(n=impolite_data.shape[0], random_state=42)

In [39]:
# Sanity check: the sampled polite set should have as many rows as the
# impolite set.
impolite_data.shape, sample_polite.shape

((7923, 2), (7923, 2))

### Split data

In [40]:
# Stack all impolite rows on top of the sampled polite rows.
# Note: this rebinds `data`, replacing the full corpus loaded earlier.
balanced_parts = [impolite_data, sample_polite]
data = pd.concat(balanced_parts)

In [41]:
from sklearn.model_selection import train_test_split

# 60/20/20 split, stratified on the label at both stages: first hold out 40%
# of the rows, then cut that hold-out in half to obtain validation and test.
train, holdout = train_test_split(data, test_size=0.4, random_state=42, stratify=data['label'])
val, test = train_test_split(holdout, test_size=0.5, random_state=42, stratify=holdout['label'])

In [42]:
# Rows per label in the training split.
train.label.value_counts()

1.0    4754
0.0    4753
Name: label, dtype: int64

In [43]:
# Rows per label in the validation split.
val.label.value_counts()

0.0    1585
1.0    1584
Name: label, dtype: int64

In [44]:
# Rows per label in the test split.
test.label.value_counts()

1.0    1585
0.0    1585
Name: label, dtype: int64

In [45]:
# Write each split to its own CSV under data_path, without the index column.
for split_filename, split_frame in (('train.csv', train), ('val.csv', val), ('test.csv', test)):
    split_frame.to_csv(data_path + split_filename, index=False)