In [33]:
import pandas as pd

df = pd.read_csv('./dataset/data.csv')
df['text'] = df['title'] + df['summary']

df.drop(columns=['index', 'title', 'summary'], inplace=True)


In [34]:
df.head()

Unnamed: 0,genre,text
0,fantasy,Drowned Wednesday Drowned Wednesday is the fir...
1,fantasy,"The Lost Hero As the book opens, Jason awakens..."
2,fantasy,The Eyes of the Overworld Cugel is easily pers...
3,fantasy,Magic's Promise The book opens with Herald-Mag...
4,fantasy,Taran Wanderer Taran and Gurgi have returned t...


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4657 entries, 0 to 4656
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   genre   4657 non-null   object
 1   text    4657 non-null   object
dtypes: object(2)
memory usage: 72.9+ KB


In [36]:
df.shape

(4657, 2)

In [37]:
classes = df['genre'].unique()
classes, classes.shape

(array(['fantasy', 'science', 'crime', 'history', 'horror', 'thriller',
        'psychology', 'romance', 'sports', 'travel'], dtype=object),
 (10,))

In [38]:
classes_count_dct = {}

for index, row in df.iterrows():
    if row['genre'] in classes_count_dct:
        classes_count_dct[row['genre']] += 1
    else:
        classes_count_dct[row['genre']] = 1

print(classes_count_dct)

{'fantasy': 876, 'science': 647, 'crime': 500, 'history': 600, 'horror': 600, 'thriller': 1023, 'psychology': 100, 'romance': 111, 'sports': 100, 'travel': 100}


In [39]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

ps = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [ps.stem(word) for word in tokens]
    return ' '.join(tokens)

df['text'] = df['text'].apply(preprocess_text)

In [40]:
import random
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet

def max_dct_count(dct):
    return max(dct.values())

def calc_alphas(classes, max_count, count_dct):
    return {cls: 1 - count_dct[cls]/max_count for cls in classes}

def join_lists(lst):
    final = []
    for l in lst:
        final.extend(l)
    return final

def word_replacement(genre, text):
    final = ''
    for word in text.split():
        synonyms = join_lists(wordnet.synonyms(word))
        
        choice = word

        if synonyms and len(word) >= 3:
            synonyms = synonyms[0]
            threshold = 0.7 # 70% of words will be changed
            choice = random.choice(synonyms).lower() if random.uniform(0, 1) < threshold else choice

        final += choice + ' '

    return {'genre': genre, 'text': final.strip()} 

def data_aug(df, count_dct, classes):
    max_count = max_dct_count(count_dct)
    alphas_dct = calc_alphas(classes, max_count, count_dct)
    total_added = 0

    for index, row in df.iterrows():
        cls = row['genre']

        while alphas_dct[cls] > random.uniform(0, 1) and count_dct[cls] < max_count:
            df = df._append(word_replacement(cls, row['text']), ignore_index=True)
            total_added += 1
            count_dct[cls] += 1

        if index % 10 == 0:
            print(f'{index} samples augmented, {total_added} new samples added')

    print('done')
    return df

[nltk_data] Downloading package wordnet to /home/tom/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [41]:
df = data_aug(df, classes_count_dct, classes)

0 samples augmented, 1 new samples added
10 samples augmented, 4 new samples added
20 samples augmented, 7 new samples added
30 samples augmented, 7 new samples added
40 samples augmented, 8 new samples added
50 samples augmented, 9 new samples added
60 samples augmented, 10 new samples added
70 samples augmented, 11 new samples added
80 samples augmented, 13 new samples added
90 samples augmented, 13 new samples added
100 samples augmented, 13 new samples added
110 samples augmented, 14 new samples added
120 samples augmented, 16 new samples added
130 samples augmented, 17 new samples added
140 samples augmented, 19 new samples added
150 samples augmented, 21 new samples added
160 samples augmented, 23 new samples added
170 samples augmented, 25 new samples added
180 samples augmented, 26 new samples added
190 samples augmented, 31 new samples added
200 samples augmented, 34 new samples added
210 samples augmented, 35 new samples added
220 samples augmented, 38 new samples added
230 s

In [42]:
classes_count_dct = {}

for index, row in df.iterrows():
    if row['genre'] in classes_count_dct:
        classes_count_dct[row['genre']] += 1
    else:
        classes_count_dct[row['genre']] = 1

print(classes_count_dct)

{'fantasy': 1023, 'science': 1023, 'crime': 1023, 'history': 991, 'horror': 996, 'thriller': 1023, 'psychology': 1023, 'romance': 1023, 'sports': 1023, 'travel': 971}


In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(df['text']).toarray()

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

lb = LabelEncoder()
y = lb.fit_transform(df['genre'])

X = X.astype('float32')
y = y.astype('float32')

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2, shuffle=True)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8095, 10000), (2024, 10000), (8095,), (2024,))

In [45]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

early_stopping = EarlyStopping(monitor='val_accuracy', mode='auto', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', patience=3, verbose=1, factor=0.5, min_lr=0.0000000001)

In [46]:
epochs = 50
batch_size = 128

In [47]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization

model = Sequential()
model.add(Dense(32, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(Dense(classes.shape[0], activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 32)                320032    
                                                                 
 dropout_2 (Dropout)         (None, 32)                0         
                                                                 
 batch_normalization_2 (Bat  (None, 32)                128       
 chNormalization)                                                
                                                                 
 dense_5 (Dense)             (None, 10)                330       
                                                                 
Total params: 320490 (1.22 MB)
Trainable params: 320426 (1.22 MB)
Non-trainable params: 64 (256.00 Byte)
_________________________________________________________________


In [48]:
output = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                   epochs=epochs, batch_size=batch_size,
                   callbacks=[early_stopping, reduce_lr])

path = './model.h5'
model.save(path)

eval = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)

print(eval)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 10: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 16: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 17/50
Epoch 18/50

  saving_api.save_model(


[0.384943425655365, 0.8972331881523132]
