In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from glob import glob
import numpy as np

In [4]:
# glob returns matches in arbitrary (filesystem-dependent) order; sort them so the
# row order of the combined data is the same on every machine and every run.
files = sorted(glob('*bert-xlnet.csv'))
# One DataFrame per matched CSV file, in the same order as `files`.
dfs = [pd.read_csv(f) for f in files]

In [8]:
# Stack the per-file frames into a single frame and show its (rows, columns) shape.
# NOTE(review): ignore_index is not set, so each file's own row labels are kept and
# may repeat across files; prefer positional access over label-based access on `df`.
df = pd.concat(dfs)
df.shape

(208816, 3)

In [9]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,text,label,prob
0,@LionelModric10 Sabah sabah yorma donkey,Neutral,0.999414
1,Continue to reach out . . SEKUT in aja laah Na...,Neutral,0.994295
2,ada suprise untuk #danishnaufal_14 tq pada pem...,Positive,0.999538
3,aku kerja keras gila jimat jimat nak beli apa ...,Positive,0.999405
4,@farhanalv ajak makan ah ke penang bistro wkwkw,Neutral,0.999462


In [10]:
import re
from unidecode import unidecode
from malaya.text.rules import normalized_chars

def make_cleaning(s, c_dict):
    """
    Remap the characters of `s` through the translation table `c_dict`.

    `c_dict` is handed straight to `str.translate`, so it is expected to map
    Unicode code points to replacement strings, code points, or None (delete).
    The translated string is returned; `s` itself is not modified.
    """
    return s.translate(c_dict)

def transformer_textcleaning(string, space_after_punct=False):
    """
    Clean raw text before it is tokenized for a transformer model.

    Steps, in order:
      1. pass the text through `unidecode`;
      2. translate every whitespace-separated token with `normalized_chars`
         (this also collapses all whitespace runs to single spaces);
      3. replace the literal token "(dot)" with ".";
      4. if the first "<a ...>" tag contains "href", strip that tag's
         attribute text from the string (the bare "<a>" is left in place);
      5. replace URLs of the form scheme://host/path with a space;
      6. collapse repeated spaces and drop every token that starts with "@";
      7. if `space_after_punct` is true, surround each punctuation character
         with spaces and collapse repeated whitespace.

    Parameters
    ----------
    string : str
        Raw input text.
    space_after_punct : bool, default False
        Whether to pad punctuation characters with spaces (step 7).

    Returns
    -------
    str
        The cleaned text; may be empty if nothing survives cleaning.
    """
    string = unidecode(string)
    string = ' '.join(
        make_cleaning(token, normalized_chars) for token in string.split()
    )
    string = re.sub(r'\(dot\)', '.', string)

    # Use ONE pattern both to test for and to extract the anchor attributes, and
    # remove them as literal text. Feeding the attributes to re.sub as a pattern
    # breaks on URL characters such as '?', '+' or '(' (no match, or re.error).
    anchor_attrs = re.findall(r'<a (.*?)>', string)
    if anchor_attrs and 'href' in anchor_attrs[0]:
        string = string.replace(' ' + anchor_attrs[0], '')

    string = re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', string
    )
    tokens = re.sub(r'[ ]+', ' ', string).strip().split()
    # Tokens produced by split() are never empty, so indexing [0] is safe.
    string = ' '.join(token for token in tokens if token[0] != '@')

    if space_after_punct:
        # A module-level PUNCTUATION, if one has been defined, is used verbatim as
        # the body of a regex character class. Otherwise fall back to the standard
        # library's ASCII punctuation, escaped so ']', '\\', '^' and '-' are literal.
        punct_class = globals().get('PUNCTUATION')
        if punct_class is None:
            # Imports the stdlib module named "string"; unrelated to the
            # `string` argument of this function.
            from string import punctuation
            punct_class = re.escape(punctuation)
        string = re.sub(f'([{punct_class}])', r' \1 ', string)
        string = re.sub(r'\s{2,}', ' ', string)
    return string

# Smoke test: a plain word with nothing to strip should come back unchanged.
transformer_textcleaning('hello')

'hello'

In [12]:
from tqdm import tqdm

# Label name -> integer class id.
mapping = {'Negative': 0, 'Neutral': 1, 'Positive': 2}

texts, labels = [], []
# Walk the text/label columns together in row order. A row is kept only when its
# cleaned text is non-empty, so `texts` and `labels` always stay aligned.
for raw_text, raw_label in tqdm(zip(df['text'], df['label']), total=len(df)):
    cleaned = transformer_textcleaning(raw_text)
    if not cleaned:
        continue
    texts.append(cleaned)
    labels.append(mapping[raw_label])

100%|██████████| 208816/208816 [00:12<00:00, 17171.95it/s]


In [13]:
# Number of rows kept after cleaning (rows whose cleaned text was empty were skipped).
len(texts)

208815

In [14]:
# Spot-check the first ten (cleaned text, label id) pairs.
list(zip(texts[:10], labels[:10]))

[('Sabah sabah yorma donkey', 1),
 ('Continue to reach out . . SEKUT in aja laah Nantikan persekutuan duniawi bersama Salam jempol kejepit',
  1),
 ('ada suprise untuk #danishnaufal_14 tq pada pemperi alhamdulillah rezeki dikelilingi org yg baik2 .. x kenal mak',
  2),
 ('aku kerja keras gila jimat jimat nak beli apa apa and finally ada duit untuk buat bilik sendiri',
  2),
 ('ajak makan ah ke penang bistro wkwkw', 1),
 ('Aku 100% faham apa yg dirasai. Jangan give up bro, sangka baik dengan ujian yang Allah bagi ni. Niat',
  2),
 ('Selain emas yang terdapat di Kabupaten Bombana, Sulawesi Tenggara memiliki enam wilayah krja migas. Arsjad Rasjid.',
  1),
 ('punch card pun nak tangkap gambar ke', 1),
 ('Baru tengok semalam. Best!!!!', 2),
 ('Iya kak', 1)]

In [19]:
# Hold out 20% of the samples for testing. The seed is fixed so that the split,
# and the JSON files written from it, are identical on every run; without it
# each execution shuffles differently and results cannot be reproduced.
train_X, test_X, train_Y, test_Y = train_test_split(
    texts, labels, test_size = 0.2, random_state = 42
)
len(train_X), len(test_X)

(167052, 41763)

In [20]:
# Distinct label ids in the training split and the number of samples for each.
np.unique(train_Y, return_counts = True)

(array([0, 1, 2]), array([61906, 67629, 37517]))

In [21]:
# Distinct label ids in the test split and the number of samples for each.
np.unique(test_Y, return_counts = True)

(array([0, 1, 2]), array([15361, 17002,  9400]))

In [23]:
import json

# Output filename -> JSON payload for each split; written in this order.
splits = {
    'train-set.json': {'train_X': train_X, 'train_Y': train_Y},
    'test-set.json': {'test_X': test_X, 'test_Y': test_Y},
}
for path, payload in splits.items():
    # Each file is opened in write mode, so an existing file is overwritten.
    with open(path, 'w') as fopen:
        json.dump(payload, fopen)