Read in Data

In [None]:
# Colab-only: mount Google Drive at /content/gdrive so the dataset files below are reachable.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Locations of the train/test CSVs and the file used as test labels.
# NOTE(review): these paths are relative while Drive is mounted at the absolute
# path /content/gdrive — presumably this relies on the working directory being
# /content; confirm.
trainPath = "gdrive/My Drive/fake news/data/train.csv"
testPath = "gdrive/My Drive/fake news/data/test.csv"
testLabelPath = "gdrive/My Drive/fake news/data/submit.csv"

# NOTE(review): unpinned upgrades — the saved output shows keras 3.5.0 being
# installed; results may differ with other versions.
!pip install --upgrade tensorflow
!pip install --upgrade keras

Collecting keras
  Downloading keras-3.5.0-py3-none-any.whl.metadata (5.8 kB)
Downloading keras-3.5.0-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 3.4.1
    Uninstalling keras-3.4.1:
      Successfully uninstalled keras-3.4.1
Successfully installed keras-3.5.0


In [None]:
def readData(filePath):
  """Load the CSV file at ``filePath`` and return its contents as a DataFrame."""
  return pd.read_csv(filePath)

# Load the training split, the test split, and the file used as test labels.
trainData = readData(trainPath)
testData = readData(testPath)
testLabels = readData(testLabelPath)

In [None]:
# Sanity check of the loaded training frame: dimensions, column names, first rows.
print(trainData.shape)
print(trainData.columns)
trainData.head()

(20800, 5)
Index(['id', 'title', 'author', 'text', 'label'], dtype='object')


Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


Data Preparation

In [None]:
# Distribution of whitespace-separated token counts per `text` entry.
textLen = trainData.text.str.split().str.len()
textLen.describe()

Unnamed: 0,text
count,20761.0
mean,760.308126
std,869.525988
min,0.0
25%,269.0
50%,556.0
75%,1052.0
max,24234.0


In [None]:
# Distribution of whitespace-separated token counts per `title` entry.
titleLen = trainData.title.str.split().str.len()
titleLen.describe()

Unnamed: 0,title
count,20242.0
mean,12.420709
std,4.098735
min,1.0
25%,10.0
50%,13.0
75%,15.0
max,72.0


In [None]:
# Column roles used by the preprocessing helpers below.
# `toRemove` is the default for removeUnused; `features` drives replaceNull.
# NOTE(review): `columns` and `target` are not referenced anywhere in the visible code.
columns = testData.columns
toRemove = ['id', 'author']
target = ['label']
features = ['title', 'text']

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')
import re
from collections import Counter

# Stemmer and lemmatizer instances; only `wnl` is used by nltkPreprocess in the visible code.
ps = nltk.stem.porter.PorterStemmer()
wnl = nltk.stem.WordNetLemmatizer()

# English stop words, wrapped in a Counter so membership tests are hash lookups
# instead of list scans.
stopWords = stopwords.words('english')
stopwordsDict = Counter(stopWords)

def removeUnused(df, columnNames = toRemove):
  """Return a new frame equal to ``df`` without the columns in ``columnNames``.

  ``columnNames`` defaults to the module-level ``toRemove`` list. The input
  frame itself is not modified.
  """
  return df.drop(columns = columnNames)

def replaceNull(df):
  """Replace missing values with '' in every column named in ``features``.

  The frame is modified in place and also returned for convenience.
  """
  for name in features:
    missing = df[name].isna()
    df.loc[missing, name] = ''
  return df

def nltkPreprocess(text):
  """Normalize one document into a space-joined string of lemmatized tokens.

  Steps: strip URLs, replace every non-letter character with a space,
  lowercase, split on whitespace, drop English stop words, and lemmatize each
  remaining token with the WordNet lemmatizer.

  The previous implementation passed these regex patterns to ``str.replace``,
  which matches literal substrings only, so URL removal and non-letter
  stripping never ran. ``re.sub`` is used here so the patterns take effect.
  As a consequence digits and punctuation fragments are now removed from the
  output.

  Args:
    text: Raw document; any value is coerced with ``str()``.

  Returns:
    The cleaned text as a single string (possibly empty).
  """
  text = str(text)
  # Remove URLs first, while the ':' and '/' characters are still present.
  text = re.sub(r'http[\w:/\.]+', ' ', text)
  # Keep letters only; this subsumes the earlier punctuation pattern, and the
  # whitespace-collapsing step is handled by split() below.
  text = re.sub(r'[^a-zA-Z]', ' ', text)
  wordList = text.lower().split()
  return ' '.join(wnl.lemmatize(word) for word in wordList if word not in stopwordsDict)

# Drop the columns listed in `toRemove` and blank out missing title/text values in both splits.
trainData = removeUnused(trainData)
testData = removeUnused(testData)
trainData = replaceNull(trainData)
testData = replaceNull(testData)

# Clean title and text row by row. The raw columns are overwritten, so re-running
# this cell feeds already-cleaned text through the cleaner again.
trainData['text'] = trainData.text.apply(nltkPreprocess)
trainData['title'] = trainData.title.apply(nltkPreprocess)
testData['text'] = testData.text.apply(nltkPreprocess)
testData['title'] = testData.title.apply(nltkPreprocess)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Preview the training frame after cleaning.
trainData.head()

Unnamed: 0,title,text,label
0,house dem aide didnt even see comeys letter ja...,house dem aide didnt even see comeys letter ja...,1
1,flynn hillary clinton big woman campus breitbart,ever get feeling life circle roundabout rather...,0
2,truth might get fired,truth might get fired october 29 2016 tension ...,1
3,15 civilian killed single u airstrike identified,video 15 civilian killed single u airstrike id...,1
4,iranian woman jailed fictional unpublished sto...,print iranian woman sentenced six year prison ...,1


In [None]:
def prepareDataset(df):
  """Build the model input and target series from a cleaned frame.

  Adds a ``news`` column to ``df`` (title and text joined by a single space;
  the frame is modified in place) and returns ``(news, label)``. Labels stored
  with object dtype are cast to int.
  """
  df['news'] = df['title'] + ' ' + df['text']
  labels = df['label']
  labels = labels.astype('int') if labels.dtype == 'object' else labels
  return df['news'], labels

# Combined title+text series and labels for training (also adds a `news` column to trainData).
trainX, trainY = prepareDataset(trainData)

In [None]:
!pip install keras-preprocessing
from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer

Collecting keras-preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl.metadata (1.9 kB)
Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keras-preprocessing
Successfully installed keras-preprocessing-1.1.2


In [None]:
# Tokenizer settings: `vocabSize` is passed as num_words, `oovToken` stands in for
# words outside that limit, and `maxTextLength` is the padded sequence length
# used by prepareSequenceData.
oovToken = '<oov>'
vocabSize = 10000
maxTextLength = 100

# Fit the vocabulary on the training texts only.
tokenizer = Tokenizer(num_words = vocabSize, oov_token = oovToken)
tokenizer.fit_on_texts(trainX)
wordIndex = tokenizer.word_index
# Size of the full fitted vocabulary; the saved output (201482) shows it is not capped by num_words.
print(len(wordIndex.keys()))

201482


In [None]:
def prepareSequenceData(df, tokenizer):
  """Encode texts as integer sequences of fixed length ``maxTextLength``.

  ``df`` is the collection of texts handed to ``tokenizer.texts_to_sequences``.
  Sequences are padded and truncated at the end ("post"). The first encoded
  sequence is printed before and after padding as a visual check.
  """
  encoded = tokenizer.texts_to_sequences(df)
  print(encoded[0:1])
  padded = pad_sequences(
    encoded,
    maxlen = maxTextLength,
    truncating = "post",
    padding = "post",
  )
  print(padded[0:1])
  return padded

In [None]:
# Encode and pad the training texts, then print a few diagnostics.
trainTextSeq = prepareSequenceData(trainX, tokenizer)

print(trainTextSeq[0:1])
# Number of documents the tokenizer was fitted on, number of distinct words seen,
# and the shape of the padded array.
print(tokenizer.document_count)
print(len(tokenizer.word_counts))
print(trainTextSeq.shape)

[[41, 4596, 928, 262, 20, 63, 2247, 542, 2686, 6401, 2707, 41, 4596, 928, 262, 20, 63, 2247, 542, 2686, 6401, 2707, 1, 1, 297, 560, 78, 2927, 2686, 6401, 9271, 17, 1, 3675, 576, 4143, 635, 1, 892, 2474, 1, 2123, 3737, 5166, 1, 1099, 1417, 308, 27, 1, 204, 650, 507, 87, 41, 190, 928, 134, 14, 16, 40, 1, 308, 60, 393, 507, 541, 1, 542, 3726, 179, 547, 69, 39, 983, 28, 13, 69, 1117, 4422, 141, 2904, 356, 262, 1062, 507, 133, 1025, 1382, 5, 47, 356, 753, 40, 507, 6517, 47, 753, 190, 4422, 101, 41, 388, 3676, 3764, 356, 245, 4985, 69, 452, 1359, 153, 63, 3026, 1472, 183, 113, 542, 304, 3764, 356, 753, 2686, 6401, 226, 67, 27, 1, 1382, 179, 1, 2468, 179, 1185, 2187, 69, 1067, 6729, 166, 82, 4366, 2686, 6401, 1, 297, 1131, 78, 315, 40, 82, 507, 339, 233, 4985, 69, 616, 6314, 1, 40, 1826, 4033, 6852, 2681, 1327, 185, 66, 158, 262, 254, 6401, 3675, 47, 188, 3189, 8938, 1, 166, 28, 1, 196, 26, 11, 1035, 1634, 643, 403, 1035, 1327, 6401, 332, 179, 188, 57, 1, 1382, 3868, 1, 116, 8614, 318, 2797, 

In [None]:
# Directory holding the pre-trained GloVe vectors and the embedding dimension
# matching the glove.6B.100d file read by buildEmbeddings.
glovePath = "gdrive/My Drive/fake news/glove.6B"
# The tokenizer only ever emits indices below its num_words limit, so the
# embedding table needs no more rows than that. Sizing it to the full fitted
# vocabulary (len(wordIndex) + 1) allocated rows that are never looked up.
# Fall back to the full vocabulary when no num_words limit is set.
vocabSize = min(tokenizer.num_words or len(wordIndex) + 1, len(wordIndex) + 1)
embDim = 100

def buildEmbeddings(tokenizer):
  """Build a ``(vocabSize, embDim)`` matrix of GloVe vectors for the tokenizer's words.

  Reads ``glove.6B.100d.txt`` from ``glovePath`` and indexes it by its first
  column (the word). Row ``i`` of the result is the vector of
  ``tokenizer.index_word[i]`` when that word appears in the GloVe table; all
  other rows, including row 0, stay zero.
  """
  gloveFile = glovePath + "/glove.6B.100d.txt"
  wordVec = pd.read_table(gloveFile, sep = r"\s+", header = None, engine = 'python', encoding = 'utf-8', on_bad_lines = 'skip')
  wordVec.set_index(0, inplace = True)

  embeddingMatrix = np.zeros((vocabSize, embDim))
  knownWords = wordVec.index
  # Collect (row index, word) pairs for every tokenizer word that has a GloVe vector.
  matches = []
  for i in range(1, len(embeddingMatrix)):
    if tokenizer.index_word[i] in knownWords:
      matches.append((i, tokenizer.index_word[i]))
  idx, word = zip(*matches)
  # Single vectorised assignment of all matched rows.
  embeddingMatrix[idx, :] = wordVec.loc[word, :].values
  return embeddingMatrix

In [None]:
from tensorflow.keras.layers import Embedding

# Build the GloVe-initialised embedding matrix and show its first rows
# (row 0 is the all-zero padding row).
embeddingMatrix = buildEmbeddings(tokenizer)
print("encoded word sequences:", embeddingMatrix[0:10])
# Frozen embedding layer initialised from the GloVe matrix. `input_length` is
# deprecated in Keras 3 (it only triggers a warning) and is dropped here; the
# sequence length is supplied when the model is built with
# input_shape=(None, maxTextLength).
embeddingLayer = Embedding(input_dim = vocabSize, output_dim = embDim, weights = [embeddingMatrix], trainable = False)

encoded word sequences: [[ 0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00 



Create Neural Network

In [None]:
import tensorflow as tf

# Binary classifier: frozen GloVe embedding -> LSTM(100) -> dropout ->
# Dense(32, relu) -> dropout -> single sigmoid unit.
model = tf.keras.models.Sequential([
    embeddingLayer,
    tf.keras.layers.LSTM(100),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation = 'sigmoid')
])
# Build explicitly so summary() can report shapes before training.
model.build(input_shape=(None, maxTextLength))
model.summary()

(20800,) (20800,)


In [None]:
# Train with binary cross-entropy and Adam for 20 epochs, holding out 20% of the
# training data for validation.
# NOTE(review): no random seed is set in the visible code, so the metrics will
# vary between runs.
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
history = model.fit(trainTextSeq, trainY, epochs = 20, batch_size = 256, validation_split = 0.2)

Epoch 1/20
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 454ms/step - accuracy: 0.6345 - loss: 0.6475 - val_accuracy: 0.7267 - val_loss: 0.5431
Epoch 2/20
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 452ms/step - accuracy: 0.6532 - loss: 0.5916 - val_accuracy: 0.6668 - val_loss: 0.5916
Epoch 3/20
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 420ms/step - accuracy: 0.7017 - loss: 0.5569 - val_accuracy: 0.7423 - val_loss: 0.4662
Epoch 4/20
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 448ms/step - accuracy: 0.7002 - loss: 0.5908 - val_accuracy: 0.7810 - val_loss: 0.4923
Epoch 5/20
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 454ms/step - accuracy: 0.7367 - loss: 0.5459 - val_accuracy: 0.6529 - val_loss: 0.5994
Epoch 6/20
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 450ms/step - accuracy: 0.6886 - loss: 0.5849 - val_accuracy: 0.6192 - val_loss: 0.6202
Epoch 7/20
[1m65/65[

In [None]:
# Preview the cleaned test frame.
# NOTE(review): the saved output already shows a `news` column, which is only
# added by prepareDataset(testData) in a later cell — this suggests the cells
# were executed out of order.
testData.head()

Unnamed: 0,title,text,news
0,specter trump loosens tongue purse string sili...,palo alto calif year scorning political proces...,specter trump loosens tongue purse string sili...
1,russian warship ready strike terrorist near al...,russian warship ready strike terrorist near al...,russian warship ready strike terrorist near al...
2,nodapl native american leader vow stay winter ...,video nodapl native american leader vow stay w...,nodapl native american leader vow stay winter ...
3,tim tebow attempt another comeback time baseba...,first dont succeed try different sport tim teb...,tim tebow attempt another comeback time baseba...
4,keiser report meme war e995,42 min ago 1 view 0 comment 0 like first time ...,keiser report meme war e995 42 min ago 1 view ...


In [None]:
# Preview the frame loaded from testLabelPath (columns: id, label).
testLabels.head()

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,0
3,20803,1
4,20804,1


In [None]:
# Attach labels to the test frame (aligned on the row index), encode the test
# texts with the tokenizer fitted on the training data, and evaluate.
# NOTE(review): the labels come from submit.csv — confirm this file holds
# ground-truth labels rather than a sample submission; otherwise the reported
# loss/accuracy are not meaningful.
testData['label'] = testLabels.label
testX, testY = prepareDataset(testData)
testTextSeq = prepareSequenceData(testX, tokenizer)
# score is [loss, accuracy], matching the metrics passed to compile().
score = model.evaluate(testTextSeq, testY, verbose = 0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

[[9385, 4, 1, 7627, 9834, 3306, 3365, 1996, 8, 35, 10, 1, 1, 3186, 11, 1, 67, 385, 3365, 1996, 1, 1, 2094, 15, 72, 441, 4, 1955, 1912, 269, 300, 826, 1336, 374, 1427, 2717, 8, 2187, 1, 820, 8880, 3198, 844, 772, 1, 923, 107, 707, 1, 1, 5, 1666, 1912, 1, 25, 1514, 3, 4, 31, 7, 1561, 4999, 2621, 89, 4248, 3758, 415, 3185, 1912, 132, 1250, 327, 542, 1, 3, 4, 38, 2031, 5667, 890, 250, 1762, 8041, 1526, 1, 1, 24, 456, 1314, 359, 924, 47, 1284, 950, 8, 35, 10, 214, 690, 3, 8041, 1007, 149, 849, 65, 108, 3, 4, 2960, 20, 386, 4973, 16, 452, 667, 149, 65, 2010, 3899, 108, 412, 5247, 1, 47, 6889, 608, 1547, 469, 651, 489, 135, 310, 1, 3365, 1996, 631, 117, 5014, 27, 287, 170, 9332, 9980, 121, 4488, 50, 1, 1127, 3526, 1195, 1384, 1922, 1, 620, 2045, 3639, 1, 135, 197, 1, 2, 3982, 1, 1314, 281, 1912, 1, 109, 1393, 1, 2179, 182, 103, 76, 182, 1303, 3, 1, 1517, 1854, 47, 190, 1284, 3, 4, 342, 2801, 28, 13, 459, 474, 3365, 1996, 549, 33, 94, 557, 1, 7734, 217, 2075, 53, 1427, 3022, 708, 209, 818, 791