In [9]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm

In [10]:
stop_words = set(stopwords.words('english'))

# Function to remove stop words
def remove_stop_words(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

def pandas_df(csv):
    df = pd.read_csv(r"../../Resources/Cleaned/"+csv+".csv")
    return df

def test(data, label):

    # Preprocess the text
    new_paragraphs = data["Text"].apply(remove_stop_words).values


    # Tokenize and pad sequences
    new_sequences = tokenizer.texts_to_sequences(new_paragraphs)
    new_data = pad_sequences(new_sequences, maxlen=maxlen)

    # Predict
    predictions = model.predict(new_data)
    predicted_classes = [1 if prob > 0.5 else 0 for prob in predictions.ravel()]

    true_labels = [label] * len(predicted_classes)

    # Classification Report
    report = classification_report(true_labels, predicted_classes, target_names=['Class 0', 'Class 1'])
    print(report)

In [11]:
de_df = pandas_df("dying_earth_corpus")
not_de_df = pandas_df("not_dying_earth_corpus")
km_df = pandas_df("killing_machine_paragraphs")
android_df = pandas_df("android_paragraphs")
stardust_df = pandas_df("stardust_paragraphs")

In [12]:
km_df["Is_Dying_Earth"] = 0
android_df["Is_Dying_Earth"] = 0
stardust_df["Is_Dying_Earth"] = 0

In [13]:
corpus = pd.concat([de_df, not_de_df], axis=0, ignore_index=True)
corpus = corpus.sample(frac=1, random_state=1).reset_index(drop=True)

In [14]:
corpus

Unnamed: 0,Title,Text,Is_Dying_Earth
0,Cugel's Saga,"Some days later, while strolling the esplanade...",1
1,Tom Sawyer,Then they waited in silence for what seemed a ...,0
2,Cosmos,"If the world is to be understood, if we are to...",0
3,Into Thin Air,"But Frank, the gentlemanly, quiet-spoken publi...",0
4,Rhialto the Marvellous,"""None whatever.""",1
...,...,...,...
8710,Rhialto the Marvellous,"Rhialto gazed in all directions. The music, or...",1
8711,1984,His earlier thought returned to him: probably ...,0
8712,The Dying Earth,"""Quick,"" said Guyal to Shierl. She sprang to t...",1
8713,Cosmos,"For Saturn as for Jupiter, the magnetic field ...",0


In [15]:
average_length = corpus['Text'].apply(len).mean()
average_length

438.29833620195063

In [16]:
# Applying stop word removal to each text in the corpus
paragraphs = corpus["Text"].apply(remove_stop_words).values

# Tokenization
max_words = 25000
tokenizer = Tokenizer(num_words=max_words)

tokenizer.fit_on_texts(tqdm(paragraphs, desc="Tokenizing"))
sequences = tokenizer.texts_to_sequences(tqdm(paragraphs, desc="Converting to Sequences"))

# Padding sequences
maxlen = 438
data = pad_sequences(sequences, maxlen=maxlen)

# Labels
labels = corpus["Is_Dying_Earth"].values

early_stopping = EarlyStopping(monitor='val_loss', patience=2)

# Model definition
model = Sequential()
model.add(Embedding(max_words, 128, input_length=maxlen))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

Tokenizing: 100%|██████████| 8715/8715 [00:00<00:00, 36660.85it/s]
Converting to Sequences: 100%|██████████| 8715/8715 [00:00<00:00, 46690.59it/s]


In [17]:
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2)

history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2,
                    callbacks=[early_stopping])

# Evaluate
test_loss, test_acc = model.evaluate(x_test, y_test)
print('Test accuracy:', test_acc)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Test accuracy: 0.9713138341903687


In [25]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 438, 128)          3200000   
                                                                 
 lstm (LSTM)                 (None, 32)                20608     
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 3220641 (12.29 MB)
Trainable params: 3220641 (12.29 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
y_pred = model.predict(x_test)
y_pred = [1 if prob > 0.5 else 0 for prob in y_pred.ravel()]

# Generating the classification report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.98      0.96      0.97       815
           1       0.97      0.98      0.97       928

    accuracy                           0.97      1743
   macro avg       0.97      0.97      0.97      1743
weighted avg       0.97      0.97      0.97      1743



In [19]:
test(android_df, 0)

              precision    recall  f1-score   support

     Class 0       1.00      0.98      0.99       121
     Class 1       0.00      0.00      0.00         0

    accuracy                           0.98       121
   macro avg       0.50      0.49      0.50       121
weighted avg       1.00      0.98      0.99       121



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
test(stardust_df, 0)

              precision    recall  f1-score   support

     Class 0       1.00      0.94      0.97       100
     Class 1       0.00      0.00      0.00         0

    accuracy                           0.94       100
   macro avg       0.50      0.47      0.48       100
weighted avg       1.00      0.94      0.97       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
test(km_df, 0)

              precision    recall  f1-score   support

     Class 0       1.00      0.31      0.47       634
     Class 1       0.00      0.00      0.00         0

    accuracy                           0.31       634
   macro avg       0.50      0.16      0.24       634
weighted avg       1.00      0.31      0.47       634



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
model.save('is_dying_earth_model.keras')

In [28]:
import pickle

with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [30]:
from keras.models import load_model

pred_model = load_model(r"C:\Users\londo\01\001\Repos\Sfere\Models\Keras\is_dying_earth_model.keras")


In [31]:
pred_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 438, 128)          3200000   
                                                                 
 lstm (LSTM)                 (None, 32)                20608     
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 3220641 (12.29 MB)
Trainable params: 3220641 (12.29 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
