In [18]:
import ast
import json
import re
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw short-story dataset and shuffle its rows.
# A fixed random_state makes the shuffle repeatable; without it the seeded
# train/test split further down would pick different stories on every run.
df = (
    pd.read_csv('data/cerpen.csv')
    .sample(frac=1, random_state=322)
    .reset_index(drop=True)
)

In [12]:
def data_to_examples(df):
    """Turn a story DataFrame into tokenised texts and their category lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``text`` column and a ``categories`` column. Each
        ``categories`` cell is the string form of a list, e.g.
        ``"['Cerpen Cinta', 'Cerpen Remaja']"``.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        Two 1-D object arrays with one entry per row of ``df``: the first
        holds each text as a list of lowercase ``a-z`` tokens, the second
        holds the parsed category list.

    Raises
    ------
    ValueError
        If a ``categories`` cell can be parsed neither as a Python literal
        nor, after the quote replacement, as JSON.
    """
    def as_object_array(items):
        # np.array() on lists of unequal length raises ValueError on
        # NumPy >= 1.24, so fill a 1-D object array element by element.
        out = np.empty(len(items), dtype=object)
        for i, item in enumerate(items):
            out[i] = item
        return out

    X = []
    y = []
    for _, row in df.iterrows():
        raw_categories = str(row['categories'])
        try:
            # literal_eval reads the list repr directly, so labels that
            # contain an apostrophe are parsed correctly.
            categories = ast.literal_eval(raw_categories)
        except (ValueError, TypeError, SyntaxError):
            # Fall back to the quote-swapping JSON parse for JSON-style cells.
            categories = json.loads(raw_categories.replace("\'", '\"'))

        text = str(row['text']).lower()
        # Collapse every run of characters outside a-z into a single space.
        tokens = re.sub('[^a-z]+', ' ', text).split()

        X.append(tokens)
        y.append(categories)
    return as_object_array(X), as_object_array(y)

In [26]:
# Tokenise every story (X) and parse its category list (Y).
X, Y = data_to_examples(df)

In [27]:
# Count how many stories carry each category label.
# Iterate over `Y`, the labels returned by data_to_examples(df); the
# lowercase name `y` is never defined at notebook level, so using it fails
# on a fresh kernel.
cat_freq = defaultdict(int)
for categories in Y:
    for cat in categories:
        cat_freq[cat] += 1

In [28]:
# Rank (category, count) pairs from most to least frequent and show the
# first eleven.
freq = sorted(cat_freq.items(), key=lambda pair: pair[1], reverse=True)
freq[:11]

[('Cerpen Cinta', 3934),
 ('Cerpen Remaja', 3213),
 ('Cerpen Persahabatan', 2903),
 ('Cerpen Keluarga', 2707),
 ('Cerpen Cinta Sedih', 2151),
 ('Cerpen Anak', 2143),
 ('Cerpen Sedih', 2110),
 ('Cerpen Fantasi (Fiksi)', 1377),
 ('Cerpen Patah Hati', 1304),
 ('Cerpen Kehidupan', 1257),
 ('Cerpen Penyesalan', 1097)]

In [38]:
# Keep only the names of the ten most frequent categories.
category_to_use = {entry[0] for entry in freq[:10]}

In [46]:
def _parse_category_list(raw):
    """Parse the string form of a category list into a Python list.

    literal_eval reads the list repr directly, so labels containing an
    apostrophe are handled; the quote-swapping JSON parse is kept as a
    fallback for JSON-style cells.
    """
    raw = str(raw)
    try:
        return ast.literal_eval(raw)
    except (ValueError, TypeError, SyntaxError):
        return json.loads(raw.replace("\'", '\"'))


# Add a column holding, per story, only the categories in `category_to_use`.
# The column is written once, so it never holds the unfiltered lists.
# The earlier mid-cell `df.head()` displayed nothing and is left out.
df['categories_cut'] = df['categories'].apply(
    lambda raw: [cat for cat in _parse_category_list(raw) if cat in category_to_use]
)

In [51]:
# Keep the stories that still have at least one category after filtering and
# drop the raw `categories` column. Checking the list length of the one
# column avoids converting the whole frame (story texts included) to strings.
has_category = df['categories_cut'].map(len) > 0
new_df = df[has_category].drop(columns='categories').copy()

In [53]:
# Number of stories that remain after the category filter.
print(new_df.shape[0])

16766

In [54]:
new_df.head()

Unnamed: 0,title,source,authors,text,categories_cut
0,"Walaupun 1000 Tahun, Tak Masalah",http://cerpenmu.com/cerpen-horor-hantu/walaupu...,['DRE'],Ini merupakan kisah seorang wanita. Wanita yan...,[Cerpen Fantasi (Fiksi)]
1,Glasses Make Me Random,http://cerpenmu.com/cerpen-lucu-humor/glasses-...,['Rizki Annisa'],Galau! Gila! Kemana kacamata gue. Masa iya mej...,[Cerpen Remaja]
2,Cinta Setengah Sadar,http://cerpenmu.com/cerpen-cinta-romantis/cint...,['Fadhila Nur Indah Sari'],Aku termenung menyaksikan jutaan rintik hujan ...,[Cerpen Remaja]
3,Cinta di Malam Tahun Baru,http://cerpenmu.com/cerpen-cinta/cinta-di-mala...,['Aelyta'],“Tahun baru dimana rin?” Tiba tiba suara terde...,[Cerpen Cinta]
7,"Sorry, My BFF",http://cerpenmu.com/cerpen-sedih/sorry-my-bff....,['Callula Zia Aqila'],“Daniyyah!! Ayo siap siap berangkat sekolah na...,"[Cerpen Anak, Cerpen Persahabatan, Cerpen Sedih]"


In [70]:
# Hold out 30% of the filtered stories as a test set; the fixed random_state
# keeps the split repeatable. `train_test_split` is imported with the other
# dependencies in the import cell at the top of the notebook.
train_df, test_df = train_test_split(new_df, test_size=0.3, random_state=322)

In [71]:
# Renumber both splits from 0. drop=True discards the old index directly
# instead of materialising it as an 'index' column and dropping it again,
# and matches how the shuffled frame was re-indexed after loading.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [74]:
# Persist both splits as CSV files. No `index=` argument is passed, so
# pandas' default applies and the row index is written as the first column.
train_df.to_csv('data/cerpen-training.csv')
test_df.to_csv('data/cerpen-test.csv')

Unnamed: 0,title,source,authors,categories,text
0,Adios,http://cerpenmu.com/cerpen-perpisahan/adios.html,['Salman Reza Al-Fachrezy'],"['Cerpen Pengalaman Pribadi', 'Cerpen Perpisah...","Lady, begitulah nama dia. \r\nDia adalah anak ..."
1,My Love Story,http://cerpenmu.com/cerpen-cinta-segitiga/my-l...,['Sherly Milenia Islamiati'],['Cerpen Cinta Segitiga'],“anggap saja malam ini kita pacaran” kata kata...
2,Menunggu,http://cerpenmu.com/cerpen-cinta-dalam-hati-te...,['Yulia Nurhasanah'],['Cerpen Cinta Dalam Hati (Terpendam)'],"Mengaguminya dalam diam, setiap detik hanya bi..."
3,Zea dan Persahabatan,http://cerpenmu.com/cerpen-remaja/zea-dan-pers...,['Na'],['Cerpen Remaja'],Pagi ini Zidan bermaksud menyapa Zea. Sekaligu...
4,Hadiah Untuk Rysta,http://cerpenmu.com/cerpen-persahabatan/hadiah...,['Deshinta Maharani'],"['Cerpen Anak', 'Cerpen Persahabatan']","Hai, namaku Shofieya Rika Alyanabila, temanku ..."
