In [2]:
import numpy as np
import pandas as pd

In [3]:
# Directory prefix for every CSV this notebook reads or writes.
# Paths are built by plain string concatenation, so the trailing slash is required.
data_path = 'data/'

### text preprocessing

In [4]:
from pythainlp.util import normalize
import re
# Matches "http://" / "https://" URLs and bare "www." URLs; every alternative
# ends with `[^\s]{2,}`, so a match extends up to the next whitespace character.
urlR = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
# Matches a whole hashtag: '#' followed by one or more Thai (U+0E00-U+0E7F) or
# ASCII letters. The '+' quantifier is essential: without it only '#' plus the
# first letter is matched, so deleting matches would turn "#hello" into "ello".
hashtagR = r"#[\u0E00-\u0E7Fa-zA-Z]+"

def text_preprocessing(text):
    """Clean one raw text string.

    Steps, in order:
      1. every match of ``urlR`` is replaced by the placeholder ``'<unk>'``;
      2. every match of ``hashtagR`` is deleted;
      3. the result is passed through pythainlp's ``normalize``;
      4. leading and trailing whitespace is stripped.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string.
    """
    without_urls = re.sub(urlR, '<unk>', text)
    without_hashtags = re.sub(hashtagR, '', without_urls)
    return normalize(without_hashtags).strip()

In [5]:
from sklearn.model_selection import train_test_split

# Load the labelled corpus, keep only the first row for each distinct text and
# discard every row whose label is 2. The trailing .copy() makes `data` an
# independent frame so later column assignments operate on it directly.
data = (
    pd.read_csv(data_path + 'label.csv')
    .drop_duplicates(subset=['text'])
    .loc[lambda frame: frame['label'] != 2]
    .copy()
)
# Class balance of the remaining rows (displayed as the cell output).
data['label'].value_counts()

1.0    12449
0.0     7923
Name: label, dtype: int64

In [6]:
# Run text_preprocessing over every entry of the 'text' column.
data['text'] = data['text'].map(text_preprocessing)

### Sample 4000 rows from each class

In [7]:
# Split the corpus by label: 1 -> polite, 0 -> impolite.
polite_data = data[data['label'] == 1]
impolite_data = data[data['label'] == 0]

# Draw 4000 rows per class. A fixed random_state makes the draw reproducible,
# so re-running the notebook regenerates the same sample files instead of
# silently replacing them with a different random subset. 42 matches the seed
# already used for train_test_split further down in this notebook.
sample_polite = polite_data.sample(n=4000, random_state=42)
sample_impolite = impolite_data.sample(n=4000, random_state=42)

# Persist the samples; the row index is not written to the files.
sample_polite.to_csv(data_path + 'sample_polite.csv', index=False)
sample_impolite.to_csv(data_path + 'sample_impolite.csv', index=False)

In [8]:
# Rows of each class that were NOT drawn into the 4000-row sample.
# Each frame must be filtered against the sample taken from that same class:
# filtering polite_data with sample_impolite.index removes nothing (the two
# classes share no index labels) and would leave the sampled polite rows in
# the "non-sample" set.
non_sample_impolite = impolite_data[~impolite_data.index.isin(sample_impolite.index)]
non_sample_polite = polite_data[~polite_data.index.isin(sample_polite.index)]

In [9]:
# Save the left-over (non-sampled) rows of each class, without the row index.
for frame, filename in (
    (non_sample_polite, 'non_sample_polite.csv'),
    (non_sample_impolite, 'non_sample_impolite.csv'),
):
    frame.to_csv(data_path + filename, index=False)

### Load the sampled data

In [10]:
# Re-read the two saved 4000-row samples from disk (polite first, then impolite).
polite_data, impolite_data = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('sample_polite.csv', 'sample_impolite.csv')
)

# Stack both classes into one frame; each part keeps its own row index.
data = pd.concat([polite_data, impolite_data])

In [11]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the label column, with a fixed seed so the split
# is the same on every run.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data['label'],
)

In [12]:
# Write both splits under data_path, without the row index.
for split_frame, relative_path in zip(
    (train, test),
    ('classification_data/train.csv', 'classification_data/test.csv'),
):
    split_frame.to_csv(data_path + relative_path, index=False)