In [0]:
# Importing all Libraries

import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [0]:
# Loading the dataset

df = pd.read_csv('/content/drive/My Drive/fake.csv')
df.shape

(12999, 20)

In [0]:
df.head()

Unnamed: 0,uuid,ord_in_thread,author,published,title,text,language,crawled,site_url,country,domain_rank,thread_title,spam_score,main_img_url,replies_count,participants_count,likes,comments,shares,type
0,6a175f46bcd24d39b3e962ad0f29936721db70db,0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,english,2016-10-27T01:49:27.168+03:00,100percentfedup.com,US,25689.0,Muslims BUSTED: They Stole Millions In Gov’t B...,0.0,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
1,2bdc29d12605ef9cf3f09f9875040a7113be5d5b,0,reasoning with facts,2016-10-29T08:47:11.259+03:00,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,english,2016-10-29T08:47:11.259+03:00,100percentfedup.com,US,25689.0,Re: Why Did Attorney General Loretta Lynch Ple...,0.0,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
2,c70e149fdd53de5e61c29281100b9de0ed268bc3,0,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,english,2016-10-31T01:41:49.479+02:00,100percentfedup.com,US,25689.0,BREAKING: Weiner Cooperating With FBI On Hilla...,0.0,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
3,7cf7c15731ac2a116dd7f629bd57ea468ed70284,0,Fed Up,2016-11-01T05:22:00.000+02:00,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,english,2016-11-01T15:46:26.304+02:00,100percentfedup.com,US,25689.0,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,0.068,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias
4,0206b54719c7e241ffe0ad4315b808290dbe6c0f,0,Fed Up,2016-11-01T21:56:00.000+02:00,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,english,2016-11-01T23:59:42.266+02:00,100percentfedup.com,US,25689.0,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,0.865,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias


In [0]:
#Data Pre-Processing(Treating Symbols and Stopwords Present in Data) using nltk and re

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]') #These Symbols will be replaced with Space
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')         #These Symbols will be Removed from Text
STOPWORDS = set(stopwords.words('english')) 

In [0]:
#Text Cleaning Functions

def string_form(value):
    return str(value)

def clean_text(text):
   
    text = BeautifulSoup(text, "lxml").text
    text = text.lower()  #LowerCase Text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) #Replace Certain Symbols by Space in Text
    text = BAD_SYMBOLS_RE.sub('', text) #Delete Certain Symbols from Text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) #Remove Stopwords from Text
    return text

In [0]:
# Text Pre-processing and cleaning operations

X_data = df['title'] + df['text']
X_data = X_data.apply(string_form)
X_data = X_data.apply(clean_text)

In [0]:
# Let us Remove all nan values

X_data_final = []

for e in range(len(X_data)):
  if(X_data[e] != 'nan'):
    X_data_final.append(X_data[e])

In [0]:
import numpy as np

X_data_final = np.array(X_data_final)
X_data_final.shape

(12273,)

In [0]:
# Further Cleaning Using Spacy

nlp = spacy.load('en', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [0]:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in X_data_final)

In [0]:
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 3.38 mins


In [0]:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(12028, 1)

In [0]:
# Now we have 12028 samples meaning there's a good chance we will generate good useful vectors

In [0]:
from gensim.models.phrases import Phrases, Phraser

In [0]:
sent = [row.split() for row in df_clean['clean']]
sent

In [0]:
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 09:52:31: collecting all words and their counts
INFO - 09:52:31: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 09:52:37: PROGRESS: at sentence #10000, processed 3447822 words and 1893403 word types
INFO - 09:52:39: collected 2180496 word types from a corpus of 4146220 words (unigram + bigrams) and 12028 sentences
INFO - 09:52:39: using 2180496 counts as vocab in Phrases<0 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>


In [0]:
from nltk import bigrams

In [0]:
sentences = phrases[sent]



In [0]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)



158119

In [0]:
# In the above Corpus we have 158,119 distinct words after cleaning from 171,533

In [0]:
# Seeing the frequency of words(Top 10)

sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['not',
 'say',
 'people',
 'trump',
 'time',
 'know',
 's',
 'clinton',
 'like',
 'state']

In [0]:
import multiprocessing

from gensim.models import Word2Vec

In [0]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [0]:
# Word2vec model using Gensim(We can change min_count to any value for experimentation)
# Size Denotes word-embedding vector Dimension which in out case is 100

w2v_model = Word2Vec(min_count=1,
                     window=3,
                     size=100,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [0]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 09:53:24: collecting all words and their counts
INFO - 09:53:24: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 09:53:39: PROGRESS: at sentence #10000, processed 3231681 words, keeping 133846 word types
INFO - 09:53:42: collected 158119 word types from a corpus of 3885672 raw words and 12028 sentences
INFO - 09:53:42: Loading a fresh vocabulary
INFO - 09:53:43: effective_min_count=1 retains 158119 unique words (100% of original 158119, drops 0)
INFO - 09:53:43: effective_min_count=1 leaves 3885672 word corpus (100% of original 3885672, drops 0)
INFO - 09:53:44: deleting the raw counts dictionary of 158119 items
INFO - 09:53:44: sample=6e-05 downsamples 1183 most-common words
INFO - 09:53:44: downsampling leaves estimated 2810286 word corpus (72.3% of prior 3885672)
INFO - 09:53:44: estimated required memory for 158119 words and 100 dimensions: 205554700 bytes
INFO - 09:53:44: resetting layer weights


Time to build vocab: 0.85 mins


In [0]:
# Building the word2vec model by training for 30 epochs

t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 09:54:22: training model with 3 workers on 158119 vocabulary and 100 features, using sg=0 hs=0 sample=6e-05 negative=20 window=3
INFO - 09:54:23: EPOCH 1 - PROGRESS: at 5.65% examples, 119722 words/s, in_qsize 0, out_qsize 0
INFO - 09:54:24: EPOCH 1 - PROGRESS: at 10.70% examples, 125377 words/s, in_qsize 0, out_qsize 0
INFO - 09:54:25: EPOCH 1 - PROGRESS: at 14.10% examples, 126250 words/s, in_qsize 0, out_qsize 0
INFO - 09:54:27: EPOCH 1 - PROGRESS: at 20.95% examples, 127920 words/s, in_qsize 0, out_qsize 0
INFO - 09:54:28: EPOCH 1 - PROGRESS: at 25.89% examples, 127395 words/s, in_qsize 0, out_qsize 0
INFO - 09:54:29: EPOCH 1 - PROGRESS: at 30.68% examples, 127427 words/s, in_qsize 0, out_qsize 0
INFO - 09:54:30: EPOCH 1 - PROGRESS: at 33.71% examples, 127208 words/s, in_qsize 0, out_qsize 0
INFO - 09:54:31: EPOCH 1 - PROGRESS: at 38.45% examples, 126634 words/s, in_qsize 0, out_qsize 0
INFO - 09:54:32: EPOCH 1 - PROGRESS: at 44.43% examples, 126799 words/s, in_qsize 0, out_

Time to train the model: 11.0 mins


In [0]:
# Save the model
w2v_model.wv.save_word2vec_format('w2v_model.bin')

INFO - 10:05:31: storing 158119x100 projection weights into w2v_model.bin
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
w2v_model.wv.save_word2vec_format('w2v_model.txt', binary=False)

INFO - 10:05:44: storing 158119x100 projection weights into w2v_model.txt
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# Total number of words vectorized
# Notice that this number is large because we kept words with frequency 1
words = list(w2v_model.wv.vocab)
len(words)

158119

# Most Similar Word Analysis

In [0]:
w2v_model.wv.most_similar(positive=["fake"])

INFO - 10:06:18: precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('id', 0.5883486270904541),
 ('ids', 0.5781641006469727),
 ('hoax', 0.5745216608047485),
 ('fabricate', 0.5445030927658081),
 ('whopper', 0.522573709487915),
 ('false_narrative', 0.519706666469574),
 ('false', 0.5131078958511353),
 ('crisis_actor', 0.4988062381744385),
 ('bogus', 0.4836941957473755),
 ('fake_news', 0.4749912619590759)]

In [0]:
w2v_model.wv.most_similar(positive=["freedom"])

  if np.issubdtype(vec.dtype, np.int):


[('liberty', 0.6756128072738647),
 ('democracy', 0.631916880607605),
 ('cherish', 0.60960853099823),
 ('freedom_speech', 0.5919955968856812),
 ('freedom_expression', 0.5844323635101318),
 ('right', 0.573498010635376),
 ('semblance', 0.5713706016540527),
 ('dignity', 0.5597898960113525),
 ('respect', 0.5502725839614868),
 ('prosperity', 0.5439368486404419)]

In [0]:
w2v_model.wv.most_similar(positive=["attack"],topn=10)

  if np.issubdtype(vec.dtype, np.int):


[('target', 0.6575837731361389),
 ('assault', 0.6551152467727661),
 ('terrorist_attack', 0.6341935396194458),
 ('claim', 0.6250004768371582),
 ('strike', 0.6227885484695435),
 ('kill', 0.5861219763755798),
 ('indiscriminate', 0.5814937353134155),
 ('force', 0.5803516507148743),
 ('launch', 0.5773041844367981),
 ('killing', 0.5768757462501526)]

In [0]:
w2v_model.wv.most_similar(positive=["bomb"],topn=10)

  if np.issubdtype(vec.dtype, np.int):


[('detonate', 0.6688288450241089),
 ('bombing', 0.6180896759033203),
 ('munition', 0.5712682008743286),
 ('bombardment', 0.5458507537841797),
 ('civilian', 0.5426082611083984),
 ('attack', 0.5399023294448853),
 ('gingerly', 0.5328900814056396),
 ('atomic_bomb', 0.5309829711914062),
 ('governmentsnot', 0.5238995552062988),
 ('neardefenseless', 0.5224360227584839)]

In [0]:
w2v_model.wv.most_similar(positive=["president"])

  if np.issubdtype(vec.dtype, np.int):


[('elect', 0.7453252077102661),
 ('obama', 0.7041237354278564),
 ('white_house', 0.6970391273498535),
 ('presidency', 0.6857277154922485),
 ('trump', 0.6828287243843079),
 ('administration', 0.6744243502616882),
 ('elect_president', 0.6497210264205933),
 ('hillary_clinton', 0.6286641955375671),
 ('office', 0.6267598867416382),
 ('presidentelect_trump', 0.6243784427642822)]

In [0]:
w2v_model.wv.most_similar(positive=["terror"],topn=10)

  if np.issubdtype(vec.dtype, np.int):


[('terrorism', 0.6181561946868896),
 ('terrorist', 0.6153697967529297),
 ('islamism', 0.5353000164031982),
 ('kla', 0.5323553681373596),
 ('extremism', 0.5294589996337891),
 ('terror_group', 0.529383659362793),
 ('terror_attack', 0.5037004351615906),
 ('islamic', 0.4970577657222748),
 ('jihadist', 0.4927752912044525),
 ('terrorist_attack', 0.4899193346500397)]

In [0]:
# We can see from above that word vector learnt are good estimates for understanding relation between words