In [1]:
import time
# Wall-clock timestamp taken at the top of the notebook; the last cell subtracts
# it from time.time() to report the total run time.
start_time = time.time()

In [2]:
# Importing the Libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import random
import re

In [3]:
# Reading the files from the folder

# Load both CSV files (paths are relative to the notebook's working directory)
# in one pass: Fake.csv -> fake_news_df, True.csv -> true_news_df.
fake_news_df, true_news_df = (
    pd.read_csv(csv_name) for csv_name in ('Fake.csv', 'True.csv')
)

In [4]:
# Check the random instances of the data

# Preview two rows from each frame. random_state is fixed so that the same rows
# are shown on every Restart & Run All.
display(
    fake_news_df.sample(2, random_state=123),
    true_news_df.sample(2, random_state=123),
)

Unnamed: 0,title,text,subject,date
8271,Superstar Marc Anthony Rocks Madison Square G...,If Donald Trump had been at Madison Square Gar...,News,"February 7, 2016"
22802,Liberal Imperium: Quigley’s Anglo-American Est...,Jay Dyer 21st Century WireThis is an addendum ...,Middle-east,"August 25, 2017"


Unnamed: 0,title,text,subject,date
21053,Kenya to hold new presidential vote on Oct. 17...,NAIROBI (Reuters) - Kenya will hold a new pres...,worldnews,"September 4, 2017"
6335,U.S. House clears way for vote to start Obamac...,WASHINGTON (Reuters) - A majority of the U.S. ...,politicsNews,"January 13, 2017"


## Data Cleaning Functions

In [5]:
## Data Cleaning  ###

# Remove URLs / web links from the text (despite the function name, HTML tags are not stripped)
def remove_html(text):
    """Return ``text`` with every URL-like substring deleted.

    Despite the name, the pattern does not target HTML markup: it matches web
    addresses that start with ``http://`` / ``https://``, ``www.`` (optionally
    followed by up to three digits) or a ``domain.tld/`` prefix, case-insensitively.
    Each match is replaced by the empty string; surrounding whitespace is left
    untouched, so a removed link may leave a double space behind.

    Args:
        text: The string to clean.

    Returns:
        The input string with all matches removed.
    """
    url_pattern = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    return re.sub(url_pattern, "", text)

# Count the number of whitespace-separated words in the string
def len_text(text):
    """Return how many whitespace-separated tokens ``text`` contains.

    An empty or whitespace-only string yields 0.
    """
    return len(text.split())
    
# Remove White Spaces
def remove_white_space(text):
    """Trim ``text`` and collapse every run of whitespace into one space.

    Leading and trailing whitespace is removed, and any internal sequence of
    whitespace characters (spaces, tabs, newlines, Unicode spaces) becomes a
    single ``" "``.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string; an empty or whitespace-only input gives ``""``.
    """
    # str.split() with no separator splits on runs of whitespace and drops
    # empty leading/trailing fields, so no regex is needed. This also removes
    # the non-raw "\s" literals, which are invalid escape sequences and emit
    # a warning on current Python versions.
    return " ".join(text.split())

## Data Analysis (Before Data Cleaning)

In [6]:
# Two example rows from each frame before any cleaning is applied.
# random_state is fixed so the displayed rows are reproducible across runs.
display(
    fake_news_df.sample(2, random_state=123),
    true_news_df.sample(2, random_state=123),
)

Unnamed: 0,title,text,subject,date
9564,HOUSE OF LIES: Pedophilia Allegations Levied A...,How should the gay community feel about the Tr...,politics,"Oct 30, 2017"
4101,GOP Lawmaker: ‘A Lady Needs To Be Told When S...,A Texas lawmaker just defended Donald Trump s ...,News,"October 21, 2016"


Unnamed: 0,title,text,subject,date
18751,IAEA says North Korea's rapid weapons progress...,SEOUL (Reuters) - The United Nations nuclear w...,worldnews,"September 29, 2017"
15197,Philippines' Duterte offers to host 'world sum...,MANILA (Reuters) - Philippine President Rodrig...,worldnews,"November 10, 2017"


##### FAKE News

In [7]:
# Distinct subject labels in the fake-news frame, then in the true-news frame.
display(*(set(frame['subject']) for frame in (fake_news_df, true_news_df)))

{'Government News', 'Middle-east', 'News', 'US_News', 'left-news', 'politics'}

{'politicsNews', 'worldnews'}

In [8]:
# Word count of each article's text, stored in a new 'len_sent' column.
# len_text is passed directly; wrapping it in a lambda adds nothing.
fake_news_df['len_sent'] = fake_news_df['text'].apply(len_text)

In [9]:
# Number of fake-news rows per subject label, most frequent first.
fake_news_df['subject'].value_counts()

News               9050
politics           6841
left-news          4459
Government News    1570
US_News             783
Middle-east         778
Name: subject, dtype: int64

In [10]:
# describe() statistics (count, mean, std, min, quartiles, max) computed
# separately for each subject group.
fake_news_df.groupby(['subject']).describe()

Unnamed: 0_level_0,len_sent,len_sent,len_sent,len_sent,len_sent,len_sent,len_sent,len_sent
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
subject,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Government News,1570.0,387.404459,389.428324,0.0,155.0,304.5,506.0,4547.0
Middle-east,778.0,772.548843,1040.851331,24.0,182.25,351.5,919.5,8135.0
News,9050.0,441.253812,152.060179,36.0,344.0,410.0,506.0,3909.0
US_News,783.0,780.527458,1049.778528,24.0,183.0,355.0,927.5,8135.0
left-news,4459.0,392.736264,363.389015,0.0,186.5,318.0,509.5,7033.0
politics,6841.0,346.752083,369.246542,0.0,120.0,276.0,462.0,7033.0


#### Conclusion for Statistical analysis on Fake News Dataframe

- "News" has the most instances (9050), followed by "politics" [6841], "left-news" [4459], "Government News" [1570], "US_News" [783], and "Middle-east" [778].
- The news corpus is largely aligned towards "left-news" and "politics".
- The minimum text length (in words) for Middle-east, News and US_News is 24, 36 and 24, respectively.
- Government News, left-news and politics contain instances where the news text is empty (minimum length of 0).

In [None]:
# Seed Python's built-in RNG.
# NOTE(review): the pandas sample below is made reproducible by its own
# random_state argument; random.seed presumably matters only for other code
# that uses the `random` module - confirm.
random.seed(123)
# Two example 'politics' articles as plain strings.
fake_news_df.loc[fake_news_df['subject'] == 'politics', 'text'].sample(2, random_state=123).to_list()

## Data Analysis (After Data Cleaning)

In [None]:
# Show the first row as a reference point before the text column is cleaned.
fake_news_df.head(1)

In [None]:
# Clean the article text in two passes - strip URLs, then normalise whitespace -
# and recompute the word count on the cleaned text.
fake_news_df['text'] = (
    fake_news_df['text']
    .apply(remove_html)
    .apply(remove_white_space)
)
fake_news_df['len_sent'] = fake_news_df['text'].apply(len_text)

In [None]:
# Spot-check two cleaned rows; random_state is fixed so the same rows are shown
# on every run.
fake_news_df.sample(2, random_state=123)

In [None]:
## Analysis on Sentence Length
# Summary statistics of the numeric columns; 'len_sent' is the per-article word
# count recomputed after cleaning.
fake_news_df.describe()

In [None]:
# Inspect the rows whose text has 20 words or fewer (len_sent < 21), longest
# first; these are the rows removed by the `len_sent > 20` filter that follows.
fake_news_df[fake_news_df['len_sent'] < 21].sort_values(by="len_sent",ascending=False)

In [None]:
# Keep only articles with more than 20 words and renumber the index from 0.
fake_news_df = fake_news_df[fake_news_df['len_sent'] > 20].reset_index(drop=True)
# Show the resulting shape plus two example rows; random_state is fixed so the
# preview is reproducible.
display(fake_news_df.shape, fake_news_df.sample(2, random_state=123))

In [None]:
# Total wall-clock time since start_time was recorded in the first cell.
print("--- %s seconds ---" % (time.time() - start_time))