In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm

In [None]:
# English stop words collected into a set so per-token membership checks stay cheap
stop_words = {word for word in stopwords.words('english')}

def remove_stop_words(text):
    """Return ``text`` with English stop words dropped.

    The text is split with NLTK's ``word_tokenize``; a token is discarded when
    its lower-cased form is found in the notebook-level ``stop_words`` set.
    Surviving tokens keep their original casing and are re-joined with single
    spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def pandas_df(csv):
    """Load ``<csv>.csv`` from the cleaned-resources folder into a DataFrame.

    ``csv`` is the file name without its extension; the folder is resolved
    relative to the current working directory.
    """
    relative_path = r"../../Resources/Cleaned/" + csv + ".csv"
    return pd.read_csv(relative_path)

def test(data, label):
    """Score the classifier on a corpus whose rows all share one known label.

    Relies on the notebook-level ``tokenizer``, ``maxlen`` and ``model``
    objects, so it must be called after the tokenization and training cells
    have run.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``"Text"`` column holding the paragraphs to classify.
    label : int
        Ground-truth class (0 or 1) assigned to every row of ``data``.

    Returns
    -------
    None
        The classification report is printed, not returned.
    """
    # Same preprocessing as the training corpus: strip stop words first
    new_paragraphs = data["Text"].apply(remove_stop_words).values

    # Encode with the already-fitted tokenizer and pad to the training length
    new_sequences = tokenizer.texts_to_sequences(new_paragraphs)
    new_data = pad_sequences(new_sequences, maxlen=maxlen)

    # Sigmoid outputs -> hard 0/1 decisions at the 0.5 threshold
    predictions = model.predict(new_data)
    predicted_classes = [1 if prob > 0.5 else 0 for prob in predictions.ravel()]

    true_labels = [label] * len(predicted_classes)

    # Every true label is the same class, so whenever the model also predicts a
    # single class only one label is present. Without an explicit `labels` list
    # sklearn would then infer one class and raise ValueError because
    # `target_names` has two entries. Pinning labels=[0, 1] keeps the report
    # two-row in all cases; zero_division=0 reports 0.0 (without a warning) for
    # the metrics of the class that has no samples.
    report = classification_report(
        true_labels,
        predicted_classes,
        labels=[0, 1],
        target_names=['Class 0', 'Class 1'],
        zero_division=0,
    )
    print(report)

In [None]:
# Read every cleaned CSV up front: the two corpora that are merged for training,
# then the three paragraph sets that are later passed to test(...).
de_df, not_de_df, km_df, android_df, stardust_df = map(
    pandas_df,
    (
        "dying_earth_corpus",
        "not_dying_earth_corpus",
        "killing_machine_paragraphs",
        "android_paragraphs",
        "stardust_paragraphs",
    ),
)

In [None]:
# Mark every row of the three evaluation-only frames as class 0 (not Dying Earth)
for held_out_df in (km_df, android_df, stardust_df):
    held_out_df["Is_Dying_Earth"] = 0

In [None]:
# Stack both corpora into one frame, then shuffle the rows with a fixed seed
# and renumber the index so the order is reproducible.
corpus = (
    pd.concat([de_df, not_de_df], axis=0, ignore_index=True)
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

In [None]:
# Display the combined, shuffled corpus for a quick visual check
corpus

In [None]:
# Mean length of the "Text" entries, as measured by len() on each value
average_length = corpus['Text'].map(len).mean()
average_length

In [None]:
# Applying stop word removal to each text in the corpus
paragraphs = corpus["Text"].apply(remove_stop_words).values

# Tokenization
# max_words caps the tokenizer vocabulary and is also the Embedding input size below.
# NOTE(review): the tokenizer is fitted on the whole corpus, i.e. before the
# train/test split in the next cell, so its vocabulary also covers held-out text.
max_words = 25000
tokenizer = Tokenizer(num_words=max_words)

tokenizer.fit_on_texts(tqdm(paragraphs, desc="Tokenizing"))
sequences = tokenizer.texts_to_sequences(tqdm(paragraphs, desc="Converting to Sequences"))

# Padding sequences
# maxlen is the fixed sequence length handed to pad_sequences and the Embedding layer.
# NOTE(review): 438 is a magic number — presumably taken from corpus statistics;
# confirm it is a token count, since average_length is computed with len() on raw text.
maxlen = 438
data = pad_sequences(sequences, maxlen=maxlen)

# Labels
labels = corpus["Is_Dying_Earth"].values

# Stop once val_loss has not improved for 2 epochs and roll the model back to the
# best epoch seen. Without restore_best_weights the model would keep the weights
# of the last epoch, i.e. the ones trained past the best validation loss.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Model definition: token embedding -> single LSTM layer -> sigmoid probability
model = Sequential()
model.add(Embedding(max_words, 128, input_length=maxlen))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Hold out 20% of the corpus for the final evaluation. A fixed seed keeps the split
# identical across kernel restarts (the corpus shuffle already uses random_state=1),
# so reported test metrics are comparable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.2, random_state=1
)

# validation_split sets aside part of x_train so early stopping can monitor val_loss
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2,
                    callbacks=[early_stopping])

# Evaluate
test_loss, test_acc = model.evaluate(x_test, y_test)
print('Test accuracy:', test_acc)

In [None]:
# Print the layer-by-layer summary of the model
model.summary()

In [None]:
# Threshold the predicted probabilities at 0.5 to get hard 0/1 classes for the test split
y_pred = [int(prob > 0.5) for prob in model.predict(x_test).ravel()]

# Per-class precision / recall / F1 on the test split
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Score the model on the android paragraphs; every row is treated as class 0
test(android_df, 0)

In [None]:
# Score the model on the stardust paragraphs; every row is treated as class 0
test(stardust_df, 0)

In [None]:
# Score the model on the killing-machine paragraphs; every row is treated as class 0
test(km_df, 0)

In [None]:
# Write the model to a .keras file in the current working directory
model.save('is_dying_earth_model.keras')

In [None]:
import pickle

# Persist the fitted tokenizer next to the model so new text can be encoded
# with the same vocabulary later on.
with open('tokenizer.pkl', mode='wb') as pickle_sink:
    pickle.dump(tokenizer, file=pickle_sink)

In [None]:
from keras.models import load_model

# Reload the model from the file written by model.save(...) earlier in this notebook.
# A relative path (instead of an absolute, user-specific Windows path) keeps the
# notebook runnable from a fresh kernel on any machine.
pred_model = load_model('is_dying_earth_model.keras')


In [None]:
# Print the summary of the reloaded model to confirm it loaded as expected
pred_model.summary()