In [None]:
import pandas as pd
import os
import json
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from utils import *

In [None]:
# Which text variant this run exports; the value also names the output sub-directory.
config_data_type = 'text_comments' # text_comments, text_only, comments_only
base_dir = './data/WEIBO-SEG/' + config_data_type
# makedirs (not mkdir) so the missing parent './data/WEIBO-SEG' is created as well;
# exist_ok=True keeps the cell idempotent when it is re-run.
os.makedirs(base_dir, exist_ok=True)

In [None]:
# Read the raw CSV into memory and preview its first rows.
raw_data_path = './data/weibo_raw_data.csv'
raw_data = pd.read_csv(raw_data_path)
raw_data.head()

In [None]:
# Keep only the configured text column plus the label, expose the text as 'text',
# and replace the raw labels with integer class ids.
# The column follows `config_data_type` so the exported text always matches the
# output directory chosen in the config cell (previously 'text_comments' was
# hard-coded here regardless of the config).
# NOTE(review): assumes raw_data has a column named after every config option
# (text_comments / text_only / comments_only) — confirm against the CSV.
data = (
    raw_data[[config_data_type, 'label']]
    .rename(columns={config_data_type: 'text'})
    .assign(label=lambda frame: LabelEncoder().fit_transform(frame['label']))
)

In [None]:
# Step 1: hold out 10% of all rows.
train_pool, tmp_val = train_test_split(data, test_size=0.1, random_state=35)
# Step 2: hold out a further 30% of the remaining rows.
# `train` is bound to the complementary 70% so that no row ends up in both the
# training and the test set. (Previously `train` kept all of the step-1 remainder,
# which still contained every `tmp_test` row.) The seeds are unchanged, so
# `tmp_val` and `tmp_test` contain exactly the same rows as before.
train, tmp_test = train_test_split(train_pool, test_size=0.3, random_state=35)

# The test set is the union of both held-out parts.
test = pd.concat([tmp_val, tmp_test])

In [None]:
# Renumber both splits from 0; these positions are what the later cells record
# as the "index before segmentation" of every text.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

print(train.shape[0], test.shape[0])

In [None]:
# Per-document labels as plain Python lists, in row order.
train_label = train['label'].tolist()
test_label = test['label'].tolist()

In [None]:
# Write the per-document labels to disk, one label per line
# (train file first, then test file).
for file_name, labels in (('/train_label.txt', train_label),
                          ('/test_label.txt', test_label)):
    with open(base_dir + file_name, 'w', encoding='utf-8') as file:
        file.writelines(str(label) + '\n' for label in labels)

In [None]:
# Attach a 'text_split' column holding the result of get_limited_split for each text.
# `assign` returns a new frame, so the frames bound before this cell are not mutated.
# NOTE(review): get_limited_split is presumably provided by `from utils import *`;
# its splitting rule is not visible here. Later cells iterate each result as a
# collection of string segments.
train = train.assign(text_split=train['text'].apply(get_limited_split))

test = test.assign(text_split=test['text'].apply(get_limited_split))

In [None]:
# Flatten the training split: one entry per segment, with the parent document's
# label and its row index repeated for every segment.
train_l = []  # Segmented Text
label_l = []  # Label of Each Text
index_l = []  # The Index of Each Text Before Segmentation
for doc_index, doc_label, segments in zip(train.index, train['label'], train['text_split']):
    for segment in segments:
        train_l.append(segment)
        label_l.append(doc_label)
        index_l.append(doc_index)
len(train_l), len(label_l), len(index_l)

In [None]:
# Flatten the test split the same way: one entry per segment, carrying the parent
# document's label and row index.
test_l = []        # segmented text
test_label_l = []  # label of the document each segment came from
test_index_l = []  # row index of that document before segmentation
for doc_index, doc_label, segments in zip(test.index, test['label'], test['text_split']):
    for segment in segments:
        test_l.append(segment)
        test_label_l.append(doc_label)
        test_index_l.append(doc_index)
len(test_l), len(test_label_l), len(test_index_l)

In [None]:
# Save, for every segment, the row index of the document it was cut from
# (one index per line), so segments can be mapped back to their document.
with open(base_dir + '/train_index_list.txt', 'w', encoding='utf-8') as file:
    file.write(''.join(f'{doc_index}\n' for doc_index in index_l))

with open(base_dir + '/test_index_list.txt', 'w', encoding='utf-8') as file:
    file.write(''.join(f'{doc_index}\n' for doc_index in test_index_l))

In [None]:
# Segment-level training frame: column 0 is the segment text, column 1 its label.
train_df = pd.DataFrame(dict(text=train_l, label=label_l))
train_df.head()

In [None]:
# Segment-level test frame: column 0 is the segment text, column 1 its label.
test_df = pd.DataFrame(dict(text=test_l, label=test_label_l))
test_df.head()

In [None]:
# Export the training segments as "<label> <text>" lines.
# Newlines, tabs, carriage returns and the literal '[SEP]' marker are each replaced
# by a single space so every record stays on one line.
with open(base_dir + '/train.txt', 'w', encoding='utf-8') as file:
    for label, text in zip(train_df['label'], train_df['text']):
        for token in ('\n', '[SEP]', '\t', '\r'):
            text = text.replace(token, ' ')
        file.write(f'{label} {text}\n')

In [None]:
# Export the test segments as "<label> <text>" lines.
# Newlines, tabs, carriage returns and the literal '[SEP]' marker are each replaced
# by a single space so every record stays on one line.
with open(base_dir + '/test.txt', 'w', encoding='utf-8') as file:
    for label, text in zip(test_df['label'], test_df['text']):
        for token in ('\n', '[SEP]', '\t', '\r'):
            text = text.replace(token, ' ')
        file.write(f'{label} {text}\n')