In [192]:
import pandas as pd
from tensorflow.keras.layers import Embedding, Layer, MultiHeadAttention, Input, Dense, LayerNormalization, TextVectorization, GlobalAveragePooling1D, InputLayer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Model
from sklearn.model_selection import train_test_split
from tensorflow import convert_to_tensor, string, float32, shape, range, reshape
from sklearn.metrics import accuracy_score, roc_auc_score, RocCurveDisplay, classification_report
import numpy as np

In [241]:
df = pd.read_csv("full_clean_checkpoint.csv")
X_train, X_test, y_train, y_test = train_test_split(df.article, df.label, 
                                                     test_size=0.2, stratify=df.label, 
                                                     random_state=42)

In [243]:
def word_proc_new(data):
    if not isinstance(data, str):
        return None  # Mark for removal later

    try:
        data = data.translate(str.maketrans('', '', s.punctuation))
        data = data.lower()

        stems = stemmer_tl.get_stems(data)
        if not isinstance(stems, list):
            return None  # Mark for removal

        data_stem = []
        for word in stems:
            if isinstance(word, str) and word:
                lemma = simplemma.lemmatize(word, lang='en', greedy=True)
                data_stem.append(lemma)

        data_sw_tl = [word for word in data_stem if word not in stopwords_tl]
        data_sw_en = [word for word in data_sw_tl if word not in stopwords_en]
        data_final = " ".join(data_sw_en)

        return data_final

    except Exception as e:
        # Optional: log or print for debugging
        print(f"Error processing: {data.index} | Exception: {e}")
        return None  # Mark for removal

In [245]:
class EmbeddingLayer(Layer):
    
    def __init__(self, seq_length, dict_size, richness):
        super(EmbeddingLayer, self).__init__()
        self.WordEmbedding =  Embedding(input_dim = dict_size, output_dim=richness)
        self.Pos_Embedding =  Embedding(input_dim = seq_length, output_dim=richness)

    def call(self, tokens):
        sequence_length = shape(tokens)[-1]
        all_positions =  range(start=0, limit=seq_length, delta=1)
        pos_enc = self.Pos_Embedding(all_positions)
        word_enc = self.WordEmbedding(tokens)
        return pos_enc + word_enc

In [247]:
class EncodingLayer(Layer):
    
    def __init__(self, Multi_heads, Tot_dense, richness):
        super(EncodingLayer, self).__init__()
        self.MultiHead =  MultiHeadAttention(num_heads = Multi_heads, key_dim = richness)
        self.MiniMLP =  Sequential([Dense(Tot_dense, activation='relu'), Dense(richness)])
        self.NormalizeLayer = LayerNormalization()
         
    def call(self, inputs):
        Multihead_output = self.MultiHead(inputs, inputs)
        normalize_multihead =  self.NormalizeLayer(inputs + Multihead_output)
        MiniMLP_out = self.MiniMLP(normalize_multihead)
        final_output = self.NormalizeLayer(inputs + MiniMLP_out)
        return final_output

In [249]:
X_train

2649    usapusapan social medium umanoy gaya katawan t...
2800    akala daw testigo luhod suspek harap altar dal...
570     bitbit mandaragat espanyol pinya lakbay upan i...
2452    consider filing legal action impose sanction p...
1810    sapul department environment natural resource ...
                              ...                        
1795    sitting let pass ganti gawa san kongresista si...
2023    tila naghuhurumentado lang militant estudyante...
2139    tawag san opisyal baha katolika netizen bawasb...
2893    bula 6 gamit publiko locallymake hybrid electr...
2733    general ronald bato dela rosa napahagolgol senado
Name: article, Length: 2527, dtype: object

In [333]:
## Text Vectorization
vocab_size= 3000
seq_length = 300
train_X_tensor = convert_to_tensor(X_train)
vectorizer = TextVectorization(output_sequence_length = seq_length, max_tokens=30)

vectorizer.adapt(train_X_tensor)

train_X_tensors = convert_to_tensor(X_train, dtype=string)
train_X_vector = vectorizer(train_X_tensors)

test_X_tensors = convert_to_tensor(X_test, dtype=string)
test_X_vector = vectorizer(test_X_tensors)


In [349]:
richness =  64
heads = 3
total_dense = 20

embedding = EmbeddingLayer(seq_length, vocab_size, richness)
encoding = EncodingLayer(Multi_heads = heads, Tot_dense = total_dense, richness=richness)

inputs = Input(shape=(seq_length, ))
emb = embedding(inputs)
enc = encoding(emb)
pool = GlobalAveragePooling1D()(enc)
d = Dense(5, activation='relu')(pool)
outputs = Dense(1, activation = 'sigmoid')(d)

In [337]:
transformer = Model(inputs=inputs, outputs=outputs)
transformer.compile(optimizer=Adam(learning_rate = 3e-4), loss="binary_crossentropy", metrics=['accuracy'])
transformer.summary()

In [339]:
history = transformer.fit(train_X_vector, y_train, epochs=50, validation_split=0.2)

Epoch 1/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 105ms/step - accuracy: 0.7312 - loss: 0.6001 - val_accuracy: 0.7668 - val_loss: 0.5160
Epoch 2/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 98ms/step - accuracy: 0.7701 - loss: 0.5011 - val_accuracy: 0.7648 - val_loss: 0.4998
Epoch 3/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 99ms/step - accuracy: 0.7805 - loss: 0.4808 - val_accuracy: 0.7648 - val_loss: 0.4943
Epoch 4/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 97ms/step - accuracy: 0.7687 - loss: 0.4926 - val_accuracy: 0.7688 - val_loss: 0.4909
Epoch 5/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 91ms/step - accuracy: 0.7793 - loss: 0.4766 - val_accuracy: 0.7727 - val_loss: 0.4875
Epoch 6/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 92ms/step - accuracy: 0.7805 - loss: 0.4973 - val_accuracy: 0.7747 - val_loss: 0.4862
Epoch 7/50
[1m64/64[0m [32m━━━

In [341]:
pred = transformer.predict(test_X_vector)

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step


In [343]:
report = classification_report(y_test, np.round(pred), output_dict=True)
pd.DataFrame(report).T


Unnamed: 0,precision,recall,f1-score,support
0,0.869048,0.915361,0.891603,319.0
1,0.908784,0.859425,0.883415,313.0
accuracy,0.887658,0.887658,0.887658,0.887658
macro avg,0.888916,0.887393,0.887509,632.0
weighted avg,0.888727,0.887658,0.887548,632.0


In [345]:
accuracy_score(y_test, np.round(pred))

0.8876582278481012

In [347]:
transformer.save('fake_news_v1.keras')