# Introduction

This notebook adapts the Keras tutorial 'Text classification from scratch' (https://keras.io/examples/nlp/text_classification_from_scratch/) to fake news classification.

# Setup

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import keras
import tensorflow as tf
from keras import layers
import string
import re
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-news/news.csv


In [2]:
# Load the fake-news dataset; the first CSV column becomes the row index.
df = pd.read_csv(
    '/kaggle/input/fake-news/news.csv',
    index_col=0,
)

In [3]:
# Encode the string labels as integers: REAL -> 0, FAKE -> 1.
df['label'] = (
    df['label']
    .replace('REAL', '0')
    .replace('FAKE', '1')
    .astype('int32')
)

In [4]:
# Drop the headline column, which is not used as a model input.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second execution no longer raises KeyError once 'title'
# has already been removed.
df = df.drop(columns=['title'], errors='ignore')

In [5]:
# Display the frame for a quick visual check of the remaining columns.
df

Unnamed: 0,text,label
8476,"Daniel Greenfield, a Shillman Journalism Fello...",1
10294,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
3608,U.S. Secretary of State John F. Kerry said Mon...,0
10142,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
875,It's primary day in New York and front-runners...,0
...,...,...
4490,The State Department told the Republican Natio...,0
8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,1
8622,Anti-Trump Protesters Are Tools of the Oligar...,1
4021,"ADDIS ABABA, Ethiopia —President Obama convene...",0


# Data Preparation

In [6]:
# 70 % of the rows go to training; the remaining 30 % are held out.
train_df, temp_df = train_test_split(
    df, test_size=0.3, random_state=42
)
# The held-out rows are halved into validation and test sets.
valid_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42
)

In [7]:
def custom_standardization(input_data):
    """Normalize raw text before tokenization.

    Lowercases the input, replaces literal ``<br />`` tags with a single
    space, and deletes every character listed in ``string.punctuation``.
    """
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"
    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    return tf.strings.regex_replace(text, punctuation_pattern, "")


# Vocabulary size: caps the TextVectorization token count and sets the
# Embedding layer's input dimension.
max_features = 20_000
# Width of each learned token embedding vector.
embedding_dim = 128
# Number of token ids produced per document by the vectorization layer.
sequence_length = 500

# Text Vectorization

In [8]:
# Text-to-integer layer: standardizes each string with custom_standardization
# and maps it to integer token ids, using a vocabulary of at most max_features
# entries and emitting sequence_length ids per document.
vectorize_layer = layers.TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode="int",
)

In [9]:
# Learn the vocabulary from the training texts only, so nothing from the
# validation/test splits leaks into it. adapt() expects a *batched* dataset;
# without .batch() every single string is processed as its own batch.
text_ds = tf.data.Dataset.from_tensor_slices(train_df['text'].values).batch(256)
vectorize_layer.adapt(text_ds)

In [10]:
def vectorize_text(text, label):
    """Turn a (text, label) pair into (token ids, label) for the model.

    Both inputs get a trailing axis appended; the text is then passed through
    ``vectorize_layer``.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    return token_ids, tf.expand_dims(label, -1)

# from_tensor_slices yields one (text, label) example at a time, so the
# pipeline has to batch explicitly. Without it, vectorize_text turns every
# example into a batch of size 1 and training takes one optimizer step per
# article.
BATCH_SIZE = 64


def make_dataset(frame):
    """Build a batched, vectorized, cached tf.data pipeline for one split.

    Args:
        frame: DataFrame with a 'text' column of strings and a 'label' column.

    Returns:
        A tf.data.Dataset yielding (token_ids, labels) batches of up to
        BATCH_SIZE rows.
    """
    examples = tf.data.Dataset.from_tensor_slices(
        (frame['text'].values, frame['label'].values)
    )
    # Batch before mapping so the vectorization layer runs once per batch.
    batches = examples.batch(BATCH_SIZE).map(vectorize_text)
    # Cache the vectorized batches and overlap input preparation with training.
    return batches.cache().prefetch(buffer_size=10)


train_ds = make_dataset(train_df)
val_ds = make_dataset(valid_df)
test_ds = make_dataset(test_df)

# Model, 1D convnet starting with an Embedding layer

In [11]:
# Variable-length sequences of integer token ids.
inputs = keras.Input(shape=(None,), dtype="int64")

# Feature extractor, applied in order: token embedding with dropout, two
# strided 1D convolutions, max-over-time pooling, then a dense hidden layer
# with dropout.
feature_layers = [
    layers.Embedding(max_features, embedding_dim),
    layers.Dropout(0.5),
    layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3),
    layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.5),
]

x = inputs
for layer in feature_layers:
    x = layer(x)

# Single sigmoid unit: a probability for the positive class (label 1).
predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = keras.Model(inputs, predictions)

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [12]:
epochs = 5

# train_ds / val_ds are tf.data.Dataset objects, which already define their
# own batches; Keras documents that batch_size must not be specified for
# dataset inputs, so it is not passed here.
# Keeping the History object in a variable makes the per-epoch metrics
# available afterwards and avoids echoing its repr as cell output.
history = model.fit(train_ds, validation_data=val_ds, epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7ea2cdcd3ca0>