In [2]:
##step 1: importing 
import numpy as np
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf 
from tensorflow import keras 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Read the news dataset from its path relative to this notebook.
df = pd.read_csv("../data/news.csv")

# Sanity-check what was loaded: dimensions first, then the column names.
for caption, value in (("shape: ", df.shape), ("Columns: ", df.columns)):
    print(caption, value)

# Last expression of the cell, so the first rows render as a rich table.
df.head()

  if not hasattr(np, "object"):


shape:  (6335, 4)
Columns:  Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')


Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
# Remove the leftover index column that came in with the CSV.
# The drop is not in-place and uses errors='ignore', so re-running this cell
# on a live kernel (where the column is already gone) no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Encode the string labels as integers.
# LabelEncoder sorts the distinct labels before numbering them, so for this
# dataset FAKE -> 0 and REAL -> 1 (the head() output shows FAKE rows as 0).
# NOTE(review): on a re-run the encoder is refitted on the already-encoded
# column, so encoder.classes_ would then hold the integers rather than the
# original strings; restart the kernel if the string mapping is needed later.
encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['label'])

# Check the result: class balance followed by the first rows.
print("Label Distribution:")
print(df['label'].value_counts())
df.head()

Label Distribution:
label
1    3171
0    3164
Name: count, dtype: int64


Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1


In [6]:
# Configuration values for text preprocessing and model setup.

# Intended dimensionality of each word-embedding vector
# (every word is to be represented as a 100-d vector in the embedding layer).
embedding_dim = 100

# Number of tokens kept per headline: longer sequences are cut to this
# length and shorter ones are padded so every input has the same length.
max_length = 100

# Where to cut sequences longer than max_length:
# 'post' drops tokens from the end, 'pre' would drop them from the beginning.
trunc_type = 'post'

# Where to add padding for sequences shorter than max_length:
# 'post' appends the padding zeros at the end.
padding_type = 'post'

# Placeholder token for out-of-vocabulary words, i.e. words the tokenizer
# did not see when it was fitted; avoids errors on unseen words.
oov_tok = "<OOV>"

# Intended fraction of the data to use for training (80%).
training_portion = 0.8


In [7]:
# Step 8: tokenization -- turn the headline strings into fixed-length
# integer sequences.

# Headlines as a NumPy array.
titles = df['title'].values

# Build the word -> integer vocabulary from the headlines; words outside
# that vocabulary are mapped to the OOV token.
# NOTE(review): the tokenizer is fitted on every title, before any
# train/test split is visible here -- confirm this is intended.
tokenizer = Tokenizer(oov_token=oov_tok)
tokenizer.fit_on_texts(titles)

# Encode each headline as a list of word indices, then pad/truncate the
# lists to max_length so they stack into one 2-D array.
sequences = tokenizer.texts_to_sequences(titles)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

print("Vocabulary size:", len(tokenizer.word_index))
print("Shape of padded sequences:", padded.shape)

Vocabulary size: 11722
Shape of padded sequences: (6335, 100)
