In [79]:
from google.colab import drive
drive.mount('/content/drive')
path = '/content/drive/MyDrive/FakeNews/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [80]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import gensim
import gensim.models.keyedvectors as word2vec
import math
from numpy import dot
from numpy.linalg import norm
import numpy as np

import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [81]:
model = word2vec.KeyedVectors.load_word2vec_format(path+"GoogleNews-vectors-negative300.bin.gz", binary=True)

In [82]:
data = pd.read_csv(path+"data/train.csv")
data = data.dropna()
data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [83]:
def preprocess_title(text):
    words = nltk.word_tokenize(text)
    return [i for i in words if i not in stop_words]

In [84]:
def preprocess_text(text):
  sents = nltk.sent_tokenize(text)
  for i in range(len(sents)):
    words = nltk.word_tokenize(sents[i])
    sents[i] = [i for i in words if i not in stop_words]
  return sents

In [85]:
def sent_to_vec(sent, model):
  sent_vector = 0
  no_word = np.array(model['the']).shape[0]
  for word in sent:
      if word not in model:
          #word_vector = np.array(np.random.uniform(-1.0, 1.0, no_word))
          #model[word] = word_vector
          pass
      else:
          word_vector = model[word]
          sent_vector = sent_vector + word_vector
  return sent_vector

In [86]:
def sent_vec_array(sents, model):
  arr = []
  for i in sents:
    arr.append(sent_to_vec(i, model))
  return arr

In [87]:
data['text_tokenized'] = data.text.apply(preprocess_text)
data['title_tokenized'] = data.title.apply(preprocess_title)

In [88]:
data['text_vec'] = data.apply(lambda x: sent_vec_array(x["text_tokenized"],model), axis = 1)
data['title_vec'] = data.apply(lambda x: sent_to_vec(x["title_tokenized"],model), axis = 1)

In [89]:
x = data.copy().drop(columns=['text','title','label','id', 'author'])
y = data.copy()['label']

In [90]:
x_train, x_test = train_test_split(x, test_size=.2)
y_train, y_test = train_test_split(y, test_size=.2)
x_test.head()

Unnamed: 0,text_tokenized,title_tokenized,text_vec,title_vec
14350,"[[A, group, French, suburbs, ,, including, one...","[PICS, :, Arson, Attacks, ,, Unrest, Grows, Mi...","[[0.7786865, 0.52819824, 1.1865234, -0.2548980...","[1.6066895, 0.7624512, -1.3564453, 0.6936035, ..."
4667,"[[Email, What, happens, get, thousands, lawyer...","[Re, :, Trump, And, Clinton, Assemble, Army, O...","[[-0.14202881, -0.061157227, 0.5576477, 1.1676...","[-0.6397705, 1.4130249, 0.09616089, -0.1027526..."
18980,"[[POWERFUL, VIDEO, :, Father, Kayla, Mueller, ...","[POWERFUL, VIDEO, :, Father, Kayla, Mueller, B...","[[4.1844482, 1.119812, 3.1218262, 0.84802246, ...","[1.4472656, 0.13244629, 0.62646484, -0.1593017..."
6681,"[[Budweiser, chosen, charged, political, issue...","[Budweiser, Debuts, Pro-Immigration, Super, Bo...","[[0.09660339, 0.7786865, -0.32104492, 0.542541...","[0.9631958, 0.26245117, -0.6081543, 0.04199218..."
17883,"[[Los, Angeles, (, AFP, ), —, Retired, unbeate...","[Mayweather, ’, Out, Retirement, McGregor, ’, ...","[[0.8227234, 1.3063354, 1.864624, 0.6420593, 1...","[-0.2727661, -0.21386719, -0.099121094, 0.7965..."


In [91]:
def title_text_cos_diff(title_vec, text_vec):
  result = []
  for i in range(len(text_vec)):
    result.append(dot(title_vec, text_vec[i])/(norm(title_vec)*norm(text_vec[i])))
  return result

In [92]:
data['cos_diff'] = data.apply(lambda x: title_text_cos_diff(x["title_vec"], x["text_vec"]), axis = 1)

  after removing the cwd from sys.path.


In [93]:
data.head()

Unnamed: 0,id,title,author,text,label,text_tokenized,title_tokenized,text_vec,title_vec,cos_diff
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,"[[House, Dem, Aide, :, We, Didn, ’, Even, See,...","[House, Dem, Aide, :, We, Didn, ’, Even, See, ...","[[0.54852295, 0.81155396, -1.7926025, 3.174011...","[-0.9682617, 1.3634338, -1.2138672, 1.079834, ...","[0.8603081, 0.61108774, 0.63551545, 0.5202152,..."
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"[[Ever, get, feeling, life, circles, roundabou...","[FLYNN, :, Hillary, Clinton, ,, Big, Woman, Ca...","[[0.73968506, 0.8388672, 0.08306885, 1.4943848...","[0.4741211, 0.40234375, -0.06982422, 0.9680176...","[0.2092796, 0.64176804, 0.2937841, 0.39681545,..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,"[[Why, Truth, Might, Get, You, Fired, October,...","[Why, Truth, Might, Get, You, Fired]","[[2.2545166, 1.6221542, -0.82055664, 2.9534912...","[1.0439453, 0.36621094, -0.0031738281, 1.10815...","[0.68639094, 0.4072204, 0.3529391, 0.28169176,..."
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,"[[Videos, 15, Civilians, Killed, In, Single, U...","[15, Civilians, Killed, In, Single, US, Airstr...","[[1.2223511, 1.1791992, -0.19897461, 2.0112305...","[1.4941406, -0.022949219, -0.5058594, 0.706542...","[0.84107816, 0.48234972, 0.09423115, 0.2498293..."
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,"[[Print, An, Iranian, woman, sentenced, six, y...","[Iranian, woman, jailed, fictional, unpublishe...","[[0.7753906, 0.27355957, 0.65734863, 0.1546020...","[0.90808105, 0.003540039, 0.7585449, 0.2471923...","[0.78048277, 0.64627194, 0.5618665, 0.7311593,..."
