In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm

In [3]:
stop_words = set(stopwords.words('english'))

# Function to remove stop words
def remove_stop_words(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

def pandas_df(csv):
    df = pd.read_csv(r"../../Resources/Cleaned/"+csv+".csv")
    return df

def test(data, label):

    # Preprocess the text
    new_paragraphs = data["Text"].apply(remove_stop_words).values


    # Tokenize and pad sequences
    new_sequences = tokenizer.texts_to_sequences(new_paragraphs)
    new_data = pad_sequences(new_sequences, maxlen=maxlen)

    # Predict
    predictions = model.predict(new_data)
    predicted_classes = [1 if prob > 0.5 else 0 for prob in predictions.ravel()]

    true_labels = [label] * len(predicted_classes)

    # Classification Report
    report = classification_report(true_labels, predicted_classes, target_names=['Class 0', 'Class 1'])
    print(report)

In [4]:
de_df = pandas_df("dying_earth_corpus")
not_de_df = pandas_df("not_dying_earth_corpus")
km_df = pandas_df("killing_machine_paragraphs")
android_df = pandas_df("android_paragraphs")
stardust_df = pandas_df("stardust_paragraphs")

In [5]:
km_df["Is_Dying_Earth"] = 0
android_df["Is_Dying_Earth"] = 0
stardust_df["Is_Dying_Earth"] = 0

In [6]:
corpus = pd.concat([de_df, not_de_df], axis=0, ignore_index=True)
corpus = corpus.sample(frac=1, random_state=1).reset_index(drop=True)

In [7]:
corpus

Unnamed: 0,Title,Text,Is_Dying_Earth
0,Cugel's Saga,"Some days later, while strolling the esplanade...",1
1,Tom Sawyer,Then they waited in silence for what seemed a ...,0
2,Cosmos,"If the world is to be understood, if we are to...",0
3,Into Thin Air,"But Frank, the gentlemanly, quiet-spoken publi...",0
4,Rhialto the Marvellous,"""None whatever.""",1
...,...,...,...
8710,Rhialto the Marvellous,"Rhialto gazed in all directions. The music, or...",1
8711,1984,His earlier thought returned to him: probably ...,0
8712,The Dying Earth,"""Quick,"" said Guyal to Shierl. She sprang to t...",1
8713,Cosmos,"For Saturn as for Jupiter, the magnetic field ...",0


In [8]:
average_length = corpus['Text'].apply(len).mean()
average_length

438.29833620195063

In [9]:
# Applying stop word removal to each text in the corpus
paragraphs = corpus["Text"].apply(remove_stop_words).values

# Tokenization
max_words = 25000
tokenizer = Tokenizer(num_words=max_words)

tokenizer.fit_on_texts(tqdm(paragraphs, desc="Tokenizing"))
sequences = tokenizer.texts_to_sequences(tqdm(paragraphs, desc="Converting to Sequences"))

# Padding sequences
maxlen = 438
data = pad_sequences(sequences, maxlen=maxlen)

# Labels
labels = corpus["Is_Dying_Earth"].values

early_stopping = EarlyStopping(monitor='val_loss', patience=2)

# Model definition
model = Sequential()
model.add(Embedding(max_words, 128, input_length=maxlen))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

Tokenizing: 100%|██████████| 8715/8715 [00:00<00:00, 17558.16it/s]
Converting to Sequences: 100%|██████████| 8715/8715 [00:00<00:00, 23635.04it/s]


In [15]:
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2)

history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2,
                    callbacks=[early_stopping])

# Evaluate
test_loss, test_acc = model.evaluate(x_test, y_test)
print('Test accuracy:', test_acc)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Test accuracy: 0.9730349779129028


In [16]:
y_pred = model.predict(x_test)
y_pred = [1 if prob > 0.5 else 0 for prob in y_pred.ravel()]

# Generating the classification report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       806
           1       0.98      0.97      0.97       937

    accuracy                           0.97      1743
   macro avg       0.97      0.97      0.97      1743
weighted avg       0.97      0.97      0.97      1743



In [17]:
test(android_df, 0)

              precision    recall  f1-score   support

     Class 0       1.00      0.77      0.87       976
     Class 1       0.00      0.00      0.00         0

    accuracy                           0.77       976
   macro avg       0.50      0.39      0.44       976
weighted avg       1.00      0.77      0.87       976



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
test(stardust_df, 0)

              precision    recall  f1-score   support

     Class 0       1.00      0.81      0.90      1055
     Class 1       0.00      0.00      0.00         0

    accuracy                           0.81      1055
   macro avg       0.50      0.41      0.45      1055
weighted avg       1.00      0.81      0.90      1055



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
test(km_df, 0)

              precision    recall  f1-score   support

     Class 0       1.00      0.39      0.56       634
     Class 1       0.00      0.00      0.00         0

    accuracy                           0.39       634
   macro avg       0.50      0.20      0.28       634
weighted avg       1.00      0.39      0.56       634



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
model.save('is_dying_earth_model.keras')

In [None]:
import pickle

with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)