# Fake News Classification - Text Pre-Processing

# Imports and Constants

In [1]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer, word_tokenize, TweetTokenizer
from nltk import FreqDist
import re
import string
from nltk.corpus import stopwords

In [2]:
# Relative directory that holds both the input and the output CSV files
DATA_PATH = '../data/'
# Cleaned dataset that is read in the Load Data section
CLEAN_DATA_FILE_NAME = 'news_dataset_cleaned.csv'
# Set to True to write the pre-processed dataframe to disk in the Save Data cell
SAVE_FILE = False
# File name of the pre-processed output written when SAVE_FILE is True
PRE_PROCESSED_DATA_FILE_NAME = 'news_dataset_pre_processed.csv'

# Load Data

In [3]:
df = pd.read_csv(DATA_PATH + CLEAN_DATA_FILE_NAME)

In [4]:
df.head()

Unnamed: 0,title,text,subject,date,label,title_len,text_len,caps_in_title,norm_caps_in_title,caps_in_text,norm_caps_in_text
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,2017-12-31,fake,79,2893,11,0.139241,138,0.047701
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,2017-12-31,fake,69,1898,8,0.115942,88,0.046365
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,2017-12-30,fake,90,3597,15,0.166667,308,0.085627
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,2017-12-29,fake,78,2774,19,0.24359,123,0.04434
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,2017-12-25,fake,70,2346,11,0.157143,63,0.026854


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38638 entries, 0 to 38637
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               38638 non-null  object 
 1   text                38638 non-null  object 
 2   subject             38638 non-null  object 
 3   date                38638 non-null  object 
 4   label               38638 non-null  object 
 5   title_len           38638 non-null  int64  
 6   text_len            38638 non-null  int64  
 7   caps_in_title       38638 non-null  int64  
 8   norm_caps_in_title  38638 non-null  float64
 9   caps_in_text        38638 non-null  int64  
 10  norm_caps_in_text   38638 non-null  float64
dtypes: float64(2), int64(4), object(5)
memory usage: 3.2+ MB


# Clean Title and Text Data

During the initial data cleaning process, I noticed that some of the news text did not have a space after a period that indicates the end of a sentence.  Because of that, tokenization cannot be done by a simple split on a space.

I want to look at a rough token frequency distribution of the text before doing any pre-processing.

## Helper Functions

In [6]:
def concat_lists_of_strings(df, column):
    """
    Flatten a dataframe column whose cells are lists of strings.

    Every cell of df[column] is walked in row order and its items are
    appended to a single flat list, which is returned.
    """
    flattened = []
    for row_items in df[column].values:
        flattened.extend(row_items)
    return flattened

In [7]:
def find_strings(string_, regex):
    """
    Find and return every substring of the input string matched by a regex.

    Parameters
    ----------
    string_ : str
        Text to search.
    regex : str or compiled pattern
        Pattern passed to re.findall.

    Returns
    -------
    list of str
        One entry per non-overlapping match.  When the pattern has two or
        more capture groups, the first group of each match is returned.
        Otherwise the strings produced by re.findall are returned unchanged.
    """
    matches = re.findall(regex, string_)
    # re.findall only yields tuples when the pattern has 2+ capture groups;
    # indexing a plain-string match with [0] would return just its first
    # character (or raise IndexError on an empty match).
    return [m[0] if isinstance(m, tuple) else m for m in matches]

In [8]:
def freq_dist_of_col(df, col):
    """
    Build a token frequency distribution for a column of token lists.

    The column's lists are flattened into one corpus, counted with
    FreqDist, and the number of distinct tokens is printed before the
    distribution is returned.
    """
    token_counts = FreqDist(concat_lists_of_strings(df, col))
    print(f'The number of unique tokens in the corpus is {len(token_counts)}')
    return token_counts

In [9]:
def review_freq_dis(df, col, n):
    """
    Show the n most common tokens of a dataframe column.

    The frequency distribution is built with freq_dist_of_col (which also
    prints the unique-token count) and its top n entries are displayed.
    """
    display(freq_dist_of_col(df, col).most_common(n))

In [10]:
def remove_punctuation(word_list, punctuation_list):
    """
    Drop punctuation tokens from a list of tokens.

    A token is kept only when it is not found in punctuation_list; the
    order of the surviving tokens is unchanged.
    """
    kept_tokens = []
    for token in word_list:
        if token in punctuation_list:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [11]:
def remove_single_characters(word_list, exception_list):
    """
    Drop tokens of length one or less unless they are explicitly allowed.

    Tokens longer than one character are always kept; shorter tokens are
    kept only when they appear in exception_list.
    """
    kept_tokens = []
    for token in word_list:
        # Short tokens survive only if they are on the exception list
        if len(token) <= 1 and token not in exception_list:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [12]:
def remove_words(word_list, words_to_remove):
    """
    Filter out unwanted words from a list of tokens.

    Returns a new list holding, in their original order, the tokens of
    word_list that do not appear in words_to_remove.
    """
    return list(filter(lambda token: token not in words_to_remove, word_list))

## Rough look at token frequency distribution

In [13]:
# Rough tokenizer used only for this first look at the raw text: each token is
# either a run of word characters (\w+), a dollar amount such as $1.50
# (\$[\d\.]+), or a parenthesised word/handle such as (@name) (\([@\w\d]+\)).
tknzr = RegexpTokenizer(r'\w+|\$[\d\.]+|\([@\w\d]+\)')

In [14]:
df['text_tokens'] = df['text'].apply(tknzr.tokenize)

In [15]:
corpus_freq_dist = freq_dist_of_col(df, 'text_tokens')

The number of unique tokens in the corpus is 152402


In [16]:
corpus_freq_dist.most_common(150)

[('the', 771208),
 ('to', 457849),
 ('of', 372801),
 ('a', 339364),
 ('and', 336802),
 ('in', 282041),
 ('s', 198810),
 ('that', 195661),
 ('on', 162850),
 ('for', 144241),
 ('is', 138935),
 ('said', 122504),
 ('Trump', 121295),
 ('with', 98710),
 ('The', 98576),
 ('was', 96440),
 ('he', 95085),
 ('it', 88450),
 ('as', 82843),
 ('his', 80861),
 ('by', 79953),
 ('has', 78027),
 ('be', 71600),
 ('have', 70857),
 ('not', 68321),
 ('from', 67728),
 ('are', 60493),
 ('at', 59290),
 ('who', 58081),
 ('an', 56078),
 ('I', 51915),
 ('this', 50639),
 ('U', 50227),
 ('would', 49247),
 ('S', 47546),
 ('they', 47357),
 ('t', 44464),
 ('will', 42664),
 ('about', 41975),
 ('had', 40196),
 ('their', 38467),
 ('been', 36588),
 ('but', 36488),
 ('people', 34367),
 ('were', 34078),
 ('which', 33115),
 ('or', 32819),
 ('we', 32590),
 ('more', 31681),
 ('you', 31398),
 ('President', 31135),
 ('out', 29155),
 ('her', 28292),
 ('after', 28290),
 ('one', 27682),
 ('all', 26913),
 ('its', 26839),
 ('also', 26

How many tokens were only used once?

In [17]:
# Count the tokens whose corpus frequency is exactly one
sum(1 for count in corpus_freq_dist.values() if count == 1)

67886

How many tokens were used 5 or fewer times?

In [18]:
# Count the tokens whose corpus frequency is five or fewer
sum(1 for count in corpus_freq_dist.values() if count <= 5)

107176

At the top of the frequency distribution, the usual stop words are present, along with words associated with politics or the names of political figures, institutions or countries.

The number of words that are used only once, or 5 or fewer times, is relatively small given the size of the corpus.

## Investigate URLs in the text

First I want to see what URLs are present in the *text* column

In [19]:
# Case-insensitive URL pattern.  A match must start with http(s)://, a www. host,
# or a bare domain followed by a slash (e.g. pic.twitter.com/...).  The outermost
# capture group (group 1) spans the whole URL; the inner groups only handle
# parenthesised segments inside a URL.
URL_REGEX = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

In [20]:
df['text_urls'] = df['text'].apply(lambda x: find_strings(x, URL_REGEX))

In [21]:
urls_in_text = concat_lists_of_strings(df, 'text_urls')

In [22]:
urls_in_text

['pic.twitter.com/4FPAe2KypA',
 'pic.twitter.com/XtZW5PdU2b',
 'pic.twitter.com/T2NY2psHCR',
 'https://t.co/zcbyc4Wp5b',
 'pic.twitter.com/fWfXsZupxy',
 '2017pic.twitter.com/ymsOBLjfxU',
 'pic.twitter.com/dWr5k8ZEZV',
 'pic.twitter.com/ulCFddhkdy',
 'https://t.co/Fg7VacxRtJ',
 'pic.twitter.com/5gEMcjQTbH',
 'https://t.co/zrWpyMXRcz',
 'pic.twitter.com/wiQSQNNzw0',
 'https://t.co/dkhw0AlHB4',
 'pic.twitter.com/oaZDT126B3',
 'https://t.co/ayBlGmk65Z',
 'pic.twitter.com/Z7dmyQ5smy',
 'pic.twitter.com/8TKtrMqRa1',
 'pic.twitter.com/hIxs3DciO8',
 'pic.twitter.com/E5bmcI83mU',
 'pic.twitter.com/a45En9Jwys',
 'pic.twitter.com/yLCBmhpNvG',
 'pic.twitter.com/3vMZUTEylx',
 'https://t.co/XrOvu32EV8',
 'pic.twitter.com/eMP9UX1bM8',
 'pic.twitter.com/XViyKFQCET',
 'https://t.co/HFYJRkefJ1',
 'https://t.co/65FhbQHuV4',
 'https://t.co/7lHYkIloyz',
 'https://t.co/g8SwgAKtfH',
 'https://t.co/9lCqpYujKN',
 'pic.twitter.com/NXEX9rGBgu',
 'pic.twitter.com/QePW9FtbSh',
 'pic.twitter.com/mUbKCIWGxB',
 'pic.

In [23]:
len(urls_in_text)

9648

The URLs look to all be links to Twitter images, probably profile photos

Are there any that are frequently used?

In [24]:
url_freq_dist = FreqDist(urls_in_text)

In [25]:
url_freq_dist.most_common(150)

[('bit.ly/2jBh4LU', 98),
 ('bit.ly/2jpEXYR', 98),
 ('connect.facebook.net/en_US/sdk.js#xfbml=1&#038;version=v2.3', 73),
 ('connect.facebook.net/en_US/sdk.js#xfbml=1&version=v2.3', 19),
 ('connect.facebook.net/en_GB/sdk.js#xfbml=1&#038;version=v2.3', 19),
 ('www.1100kfnx.com.LISTEN', 10),
 ('https://t.co/6OZtrfIwim', 9),
 ('pic.twitter.com/WYUnHjjUjg', 8),
 ('https://connect.facebook.net/en_US/sdk.js#xfbml=1&version=v2.11', 8),
 ('www.reuters.com/trump-effect', 8),
 ('https://connect.facebook.net/en_US/sdk.js#xfbml=1&version=v2.10', 7),
 ('https://t.co/4nxLipafWO', 7),
 ('connect.facebook.net/en_US/sdk.js#xfbml=1&amp;version=v2.3', 7),
 ('https://t.co/ifeSBlSZW7', 6),
 ('connect.facebook.net/en_US/sdk.js#xfbml=1&version=v2.10', 5),
 ('pic.twitter.com/QfefM8X2cW', 5),
 ('https://t.co/oaH1b92PFS', 5),
 ('pic.twitter.com/WIUTLOS4XD', 4),
 ('connect.facebook.net/en_GB/sdk.js#xfbml=1&version=v2.3', 4),
 ('https://t.co/hLWri23Lx7', 4),
 ('https://t.co/oLueAHCaUe', 4),
 ('pic.twitter.com/EHkQf

The two bit.ly links seem to have some significance to this dataset, but they would lose importance the more time that passes from the creation of a model.  I think it would be of more potential significance if there are links in a story rather than what that link is, so I will convert all the urls to the placeholder *{link}*.

In [26]:
df['clean_text'] = df['text'].apply(lambda x: re.sub(URL_REGEX, '{link}', x))

## Investigate URLs in title

Are there any URLs in the title field?

In [27]:
df['title_urls'] = df['title'].apply(lambda x: find_strings(x, URL_REGEX))

In [28]:
urls_in_titles = concat_lists_of_strings(df, 'title_urls')

In [29]:
urls_in_titles

[]

There are no URLs in the title field

## Investigate Twitter handles in Text

In [30]:
# Matches "@" plus 1-15 word characters, but only when the "@" sits at the start
# of the string or directly after a non-word character, so the "@" inside an
# e-mail address such as name@example.com is not treated as a handle.
TWITTER_HANDLE_REGEX = r'(?<=^|(?<=[^\w]))(@\w{1,15})\b'

In [31]:
df['twitter_handles'] = df['clean_text'].apply(lambda x: re.findall(TWITTER_HANDLE_REGEX, x))

In [32]:
twitter_handles = concat_lists_of_strings(df, 'twitter_handles')

In [33]:
twitter_freq_dist = FreqDist(twitter_handles)

In [34]:
twitter_freq_dist.most_common(50)

[('@realDonaldTrump', 2997),
 ('@POTUS', 345),
 ('@21WIRE', 283),
 ('@FoxNews', 233),
 ('@HillaryClinton', 198),
 ('@seanhannity', 176),
 ('@CNN', 170),
 ('@nytimes', 126),
 ('@foxandfriends', 101),
 ('@elizabethforma', 98),
 ('@NBCNews', 93),
 ('@JordanUhl', 87),
 ('@tonyposnanski', 86),
 ('@ABC', 81),
 ('@AnnCoulter', 71),
 ('@bessbell', 69),
 ('@PressSec', 66),
 ('@FLOTUS', 65),
 ('@IvankaTrump', 59),
 ('@WalshFreedom', 53),
 ('@WhiteHouse', 52),
 ('@BernieSanders', 52),
 ('@KellyannePolls', 51),
 ('@PrisonPlanet', 49),
 ('@BraddJaffy', 49),
 ('@SpeakerRyan', 48),
 ('@realdonaldtrump', 48),
 ('@DonaldJTrumpJr', 47),
 ('@marcorubio', 46),
 ('@SarahPalinUSA', 45),
 ('@ABCPolitics', 44),
 ('@BarackObama', 43),
 ('@CNNPolitics', 39),
 ('@EricTrump', 39),
 ('@MSNBC', 39),
 ('@Morning_Joe', 39),
 ('@joshdcaplan', 39),
 ('@GOP', 38),
 ('@CBSNews', 38),
 ('@JackPosobiec', 38),
 ('@Cernovich', 37),
 ('@kylegriffin1', 36),
 ('@mike_pence', 36),
 ('@MatthewDicks', 36),
 ('@tedcruz', 36),
 ('@w

In [35]:
# total number of Twitter handles
len(twitter_handles)

22224

In [36]:
# number of unique Twitter handles (case sensitive)
len(twitter_freq_dist)

8162

Not surprisingly @realDonaldTrump is the most frequently mentioned Twitter handle given that he is a prolific Twitter user and also the President or a presidential candidate during most of the time frame of the dataset.

While there may be some value in keeping the actual Twitter handles, I think there would be more value in just mentioning that a Twitter handle was used.  Time permitting these two possibilities could be A/B tested.

At this point I will replace all the Twitter handles with @twitter-handle

In [37]:
df['clean_text'] = df['clean_text'].apply(lambda x: re.sub(TWITTER_HANDLE_REGEX, '@twitter-handle', x))

## Capitalization

Because words in all caps are an important way that emphasis is made online, I want to keep words that are in all caps while making all the other letters lower case.  Words of length one will be made lower case though, since they are likely *A or I*, which can be made lowercase without losing much emphasis.

In [38]:
def lower_unless_all_caps(string_):
    """
    Lowercase every whitespace-separated word of the input string,
    leaving untouched any word longer than one character that is
    entirely upper case.  The words are re-joined with single spaces.
    """
    normalized = []
    for token in string_.split():
        keep_as_is = token.isupper() and len(token) > 1
        normalized.append(token if keep_as_is else token.lower())
    return ' '.join(normalized)

In [39]:
df['clean_text'] = df['clean_text'].apply(lower_unless_all_caps)

In [40]:
df['clean_title'] = df['title'].apply(lower_unless_all_caps)

## Numbers

Numbers do not seem likely to indicate Fake news, although certain dates or numbers may.  The only date/number I've come across that may have significant meaning is 9/11.  I will change it to *nine-eleven* so that numbers can more easily be removed.

I will replace the numbers with a space because some of the sentences run together and end with a number.  Replacing the number with a space will split the sentences.

In [41]:
df['clean_text'] = df['clean_text'].apply(lambda x: re.sub(r'9\/11', 'nine-eleven', x))

In [42]:
df['clean_title'] = df['clean_title'].apply(lambda x: re.sub(r'9\/11', 'nine-eleven', x))

In [43]:
df['clean_text'] = df['clean_text'].apply(lambda x: re.sub(r'\d+', ' ', x))

In [44]:
df['clean_title'] = df['clean_title'].apply(lambda x: re.sub(r'\d+', ' ', x))

## Rough Look at the tokens from the current clean_text

In [45]:
df['clean_text_tokens'] = df['clean_text'].apply(word_tokenize)

In [46]:
review_freq_dis(df, 'clean_text_tokens', 150)

The number of unique tokens in the corpus is 202057


[('the', 859816),
 (',', 779608),
 ('.', 545106),
 ('to', 458893),
 ('of', 373562),
 ('a', 352372),
 ('and', 345512),
 ('in', 297919),
 ('that', 203732),
 ('s', 199133),
 ('on', 166846),
 ('for', 147194),
 ('is', 139893),
 ('said', 120757),
 ('trump', 117656),
 ('he', 117174),
 ('it', 110523),
 ('with', 100695),
 ('was', 96714),
 ('as', 88045),
 ('his', 84136),
 ('by', 80892),
 ('has', 78136),
 ('not', 72446),
 ('be', 71204),
 ('have', 71038),
 ('’', 70279),
 ('from', 68567),
 (')', 68006),
 ('(', 67765),
 ('this', 62791),
 ('at', 62594),
 ('are', 61004),
 ('who', 58864),
 ('an', 57936),
 ('they', 57181),
 ('“', 53766),
 ('”', 53489),
 ('but', 50911),
 ('we', 50792),
 (':', 49644),
 ('would', 49324),
 ('i', 46700),
 ('president', 45310),
 ('U.S.', 43289),
 ('will', 43158),
 ('about', 42407),
 ('had', 40272),
 ('t', 40223),
 ('their', 39204),
 ('you', 37849),
 ('been', 36558),
 ('people', 35600),
 ('-', 34195),
 ('were', 34161),
 ('which', 33412),
 ('or', 33116),
 ('more', 32884),
 ('af

nltk's word_tokenize has stripped the @ symbol off of @twitter, so now the word twitter has been combined with my placeholder for a twitter handle.  Ideally a customized tokenizer would be built that combined the performance of word_tokenize and the ability of TweetTokenizer to keep twitter handles.

I also see that because word_tokenize strips () off of words, (reuters) has been changed to reuters and combined with references to reuters.  This is a problem that will need to be addressed because almost all the *True* news stories have (Reuters) at their beginning and a ML model would merely learn that as how to distinguish Fake vs True, which would overfit potential models to this dataset.

## Remove (reuters) from news stories.

Keeping (reuters) in the news text will create an overfit model when applying it to data outside the current dataset.

In [47]:
# The capitalization step above leaves all-caps words untouched, so the tag is
# matched case-insensitively to strip "(REUTERS)" as well as "(reuters)".
df['clean_text'] = df['clean_text'].apply(lambda x: re.sub(r'\(reuters\)', ' ', x, flags=re.IGNORECASE))

## Update Tokens

In [48]:
df['clean_text_tokens'] = df['clean_text'].apply(word_tokenize)

In [49]:
review_freq_dis(df, 'clean_text_tokens', 150)

The number of unique tokens in the corpus is 202057


[('the', 859816),
 (',', 779608),
 ('.', 545106),
 ('to', 458893),
 ('of', 373562),
 ('a', 352372),
 ('and', 345512),
 ('in', 297919),
 ('that', 203732),
 ('s', 199133),
 ('on', 166846),
 ('for', 147194),
 ('is', 139893),
 ('said', 120757),
 ('trump', 117656),
 ('he', 117174),
 ('it', 110523),
 ('with', 100695),
 ('was', 96714),
 ('as', 88045),
 ('his', 84136),
 ('by', 80892),
 ('has', 78136),
 ('not', 72446),
 ('be', 71204),
 ('have', 71038),
 ('’', 70279),
 ('from', 68567),
 ('this', 62791),
 ('at', 62594),
 ('are', 61004),
 ('who', 58864),
 ('an', 57936),
 ('they', 57181),
 ('“', 53766),
 ('”', 53489),
 ('but', 50911),
 ('we', 50792),
 (':', 49644),
 ('would', 49324),
 (')', 46974),
 ('(', 46733),
 ('i', 46700),
 ('president', 45310),
 ('U.S.', 43289),
 ('will', 43158),
 ('about', 42407),
 ('had', 40272),
 ('t', 40223),
 ('their', 39204),
 ('you', 37849),
 ('been', 36558),
 ('people', 35600),
 ('-', 34195),
 ('were', 34161),
 ('which', 33412),
 ('or', 33116),
 ('more', 32884),
 ('af

## Remove Punctuation and Single Letter Tokens from Text

I will remove all of the Punctuation tokens except for the exclamation point, because it seems like it may be an indicator of Fake news.  I will also remove all the single characters except for *i*.

In [50]:
df['clean_text_tokens'] = df['clean_text_tokens'].apply(lambda x: remove_single_characters(x, ['i', '!']))

In [51]:
review_freq_dis(df, 'clean_text_tokens', 150)

The number of unique tokens in the corpus is 201966


[('the', 859816),
 ('to', 458893),
 ('of', 373562),
 ('and', 345512),
 ('in', 297919),
 ('that', 203732),
 ('on', 166846),
 ('for', 147194),
 ('is', 139893),
 ('said', 120757),
 ('trump', 117656),
 ('he', 117174),
 ('it', 110523),
 ('with', 100695),
 ('was', 96714),
 ('as', 88045),
 ('his', 84136),
 ('by', 80892),
 ('has', 78136),
 ('not', 72446),
 ('be', 71204),
 ('have', 71038),
 ('from', 68567),
 ('this', 62791),
 ('at', 62594),
 ('are', 61004),
 ('who', 58864),
 ('an', 57936),
 ('they', 57181),
 ('but', 50911),
 ('we', 50792),
 ('would', 49324),
 ('i', 46700),
 ('president', 45310),
 ('U.S.', 43289),
 ('will', 43158),
 ('about', 42407),
 ('had', 40272),
 ('their', 39204),
 ('you', 37849),
 ('been', 36558),
 ('people', 35600),
 ('were', 34161),
 ('which', 33412),
 ('or', 33116),
 ('more', 32884),
 ('after', 31621),
 ('she', 30880),
 ('one', 30119),
 ('if', 30084),
 ('her', 29018),
 ('state', 28335),
 ('out', 28326),
 ('all', 27997),
 ('what', 27418),
 ('its', 27295),
 ('also', 26967

## Tokenize Clean Title

In [52]:
df['clean_title_tokens'] = df['clean_title'].apply(word_tokenize)

In [53]:
df.iloc[1000:1010]['clean_title_tokens']

1000    [stunning, new, poll, reveals, global, opinion...
1001    [former, GOP, rep, throws, support, behind, ob...
1002    [trump, moronically, claims, entire, russia, i...
1003    [WATCH, :, it, just, hit, a, trump, supporter,...
1004    [republicans, just, added, what, they, hated, ...
1005    [the, numbers, are, in, and, jon, ossoff, losi...
1006    [GOP, senator, lashes, into, kellyanne, conway...
1007    [cops, in, republican, senate, office, violent...
1008    [trump, is, giddy, about, his, upcoming, meeti...
1009    [catholic, sister, rips, ‘, pro-life, ’, repub...
Name: clean_title_tokens, dtype: object

In [54]:
review_freq_dis(df, 'clean_title_tokens', 150)

The number of unique tokens in the corpus is 31553


[('to', 14060),
 ('trump', 11939),
 (':', 10879),
 ('’', 10184),
 (',', 8742),
 ('in', 7536),
 ('of', 6083),
 ('for', 6043),
 ('on', 5630),
 ('the', 4638),
 ('s', 4564),
 ('U.S.', 4157),
 ("'s", 4120),
 ('(', 4101),
 (')', 4101),
 ('VIDEO', 3992),
 ('says', 3347),
 ('a', 3249),
 ('with', 3134),
 ('”', 3080),
 ('[', 3035),
 (']', 3034),
 ('and', 2942),
 ('“', 2847),
 ('‘', 2838),
 ("'", 2619),
 ('is', 2263),
 ('!', 2157),
 ('after', 2114),
 ('video', 1957),
 ('obama', 1807),
 ('house', 1794),
 ('at', 1774),
 ('as', 1763),
 ('over', 1694),
 ('from', 1551),
 ('he', 1533),
 ('his', 1528),
 ('by', 1484),
 ('about', 1455),
 ('white', 1377),
 ('new', 1367),
 ('it', 1354),
 ('will', 1312),
 ('not', 1309),
 ('WATCH', 1267),
 ('clinton', 1202),
 ('?', 1174),
 ('russia', 1142),
 ('just', 1133),
 ('be', 1122),
 ('president', 1117),
 ('hillary', 1058),
 ('bill', 1027),
 ('t', 1005),
 ('republican', 992),
 ('north', 986),
 ('korea', 912),
 ('that', 890),
 ('this', 888),
 ('senate', 878),
 ('out', 87

## Remove Punctuation and Single Letter Tokens from Clean Title

In [55]:
df['clean_title_tokens'] = df['clean_title_tokens'].apply(lambda x: remove_single_characters(x, ['i', '!']))

In [56]:
review_freq_dis(df, 'clean_title_tokens', 150)

The number of unique tokens in the corpus is 31479


[('to', 14060),
 ('trump', 11939),
 ('in', 7536),
 ('of', 6083),
 ('for', 6043),
 ('on', 5630),
 ('the', 4638),
 ('U.S.', 4157),
 ("'s", 4120),
 ('VIDEO', 3992),
 ('says', 3347),
 ('with', 3134),
 ('and', 2942),
 ('is', 2263),
 ('!', 2157),
 ('after', 2114),
 ('video', 1957),
 ('obama', 1807),
 ('house', 1794),
 ('at', 1774),
 ('as', 1763),
 ('over', 1694),
 ('from', 1551),
 ('he', 1533),
 ('his', 1528),
 ('by', 1484),
 ('about', 1455),
 ('white', 1377),
 ('new', 1367),
 ('it', 1354),
 ('will', 1312),
 ('not', 1309),
 ('WATCH', 1267),
 ('clinton', 1202),
 ('russia', 1142),
 ('just', 1133),
 ('be', 1122),
 ('president', 1117),
 ('hillary', 1058),
 ('bill', 1027),
 ('republican', 992),
 ('north', 986),
 ('korea', 912),
 ('that', 890),
 ('this', 888),
 ('senate', 878),
 ('out', 876),
 ('state', 868),
 ('court', 863),
 ('china', 796),
 ('who', 788),
 ('him', 785),
 ('against', 782),
 ('up', 775),
 ('election', 763),
 ('you', 760),
 ('are', 758),
 ('has', 752),
 ('vote', 744),
 ('donald', 7

## Remove 's

While the apostrophe was frequently or always removed from 's in the fake news, it doesn't look like that was done to the true news.  's will need to be removed so that it doesn't become an indicator of true news.

In [57]:
df['clean_title_tokens'] = df['clean_title_tokens'].apply(lambda x: remove_words(x, ["'s"]))
df['clean_text_tokens'] = df['clean_text_tokens'].apply(lambda x: remove_words(x, ["'s"]))

## Remove Stop Words

First, let's compare the most frequent words in the news titles and text with the NLTK English stopwords list.

In [58]:
stop_words = stopwords.words('english')

In [59]:
display(stop_words)

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

Get the most frequent words in the text

In [60]:
most_freq_clean_text = [x[0] for x in list(freq_dist_of_col(df, 'clean_text_tokens').most_common(150))]

The number of unique tokens in the corpus is 201965


In [61]:
most_freq_clean_text

['the',
 'to',
 'of',
 'and',
 'in',
 'that',
 'on',
 'for',
 'is',
 'said',
 'trump',
 'he',
 'it',
 'with',
 'was',
 'as',
 'his',
 'by',
 'has',
 'not',
 'be',
 'have',
 'from',
 'this',
 'at',
 'are',
 'who',
 'an',
 'they',
 'but',
 'we',
 'would',
 'i',
 'president',
 'U.S.',
 'will',
 'about',
 'had',
 'their',
 'you',
 'been',
 'people',
 'were',
 'which',
 'or',
 'more',
 'after',
 'she',
 'one',
 'if',
 'her',
 'state',
 'out',
 'all',
 'what',
 'its',
 'also',
 'when',
 'there',
 'new',
 'can',
 'donald',
 'up',
 'no',
 'house',
 'over',
 'government',
 'states',
 'republican',
 'clinton',
 'twitter-handle',
 'him',
 'obama',
 'than',
 'our',
 'some',
 'could',
 'so',
 'other',
 'united',
 'just',
 'told',
 'into',
 'white',
 'against',
 'do',
 'campaign',
 'because',
 'like',
 'last',
 'them',
 'party',
 'two',
 'any',
 'election',
 'now',
 'time',
 'only',
 'first',
 'former',
 'year',
 'how',
 'country',
 'news',
 'even',
 'should',
 'while',
 'did',
 'being',
 '!',
 'bef

In [62]:
def intersection(lst1, lst2):
    """
    Return the items of lst1 that also occur in lst2.

    The order (and any duplicates) of lst1 is preserved; lst2 is only
    used for membership lookups.
    """
    lookup = set(lst2)
    shared = []
    for item in lst1:
        if item in lookup:
            shared.append(item)
    return shared

In [63]:
common_words = intersection(stop_words, most_freq_clean_text)

In [64]:
common_words

['i',
 'we',
 'our',
 'you',
 'he',
 'him',
 'his',
 'she',
 'her',
 'it',
 'its',
 'they',
 'them',
 'their',
 'what',
 'which',
 'who',
 'this',
 'that',
 'those',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'do',
 'did',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'into',
 'during',
 'before',
 'after',
 'to',
 'from',
 'up',
 'in',
 'out',
 'on',
 'over',
 'under',
 'there',
 'when',
 'where',
 'how',
 'all',
 'any',
 'more',
 'most',
 'other',
 'some',
 'no',
 'not',
 'only',
 'so',
 'than',
 'very',
 'can',
 'will',
 'just',
 'should',
 'now',
 're']

In [65]:
len(common_words)

82

In [66]:
len(stop_words)

179

There are 82 common words.  97 of the NLTK stopwords are not in the 150 most frequent words from the news text, and 68 of the most frequent words from the news text are not in the NLTK stopwords.

What are those words that aren't common to both lists?

In [67]:
def difference(lst1, lst2):
    """
    Return the items of lst1 that do not occur in lst2.

    The order (and any duplicates) of lst1 is preserved; lst2 is only
    used for membership lookups.
    """
    excluded = set(lst2)
    remaining = []
    for item in lst1:
        if item in excluded:
            continue
        remaining.append(item)
    return remaining

In [68]:
words_in_nltk_not_news = difference(stop_words, most_freq_clean_text)

In [69]:
words_in_nltk_not_news

['me',
 'my',
 'myself',
 'ours',
 'ourselves',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'himself',
 "she's",
 'hers',
 'herself',
 "it's",
 'itself',
 'theirs',
 'themselves',
 'whom',
 "that'll",
 'these',
 'am',
 'having',
 'does',
 'doing',
 'a',
 'until',
 'between',
 'through',
 'above',
 'below',
 'down',
 'off',
 'again',
 'further',
 'then',
 'once',
 'here',
 'why',
 'both',
 'each',
 'few',
 'such',
 'nor',
 'own',
 'same',
 'too',
 's',
 't',
 'don',
 "don't",
 "should've",
 'd',
 'll',
 'm',
 'o',
 've',
 'y',
 'ain',
 'aren',
 "aren't",
 'couldn',
 "couldn't",
 'didn',
 "didn't",
 'doesn',
 "doesn't",
 'hadn',
 "hadn't",
 'hasn',
 "hasn't",
 'haven',
 "haven't",
 'isn',
 "isn't",
 'ma',
 'mightn',
 "mightn't",
 'mustn',
 "mustn't",
 'needn',
 "needn't",
 'shan',
 "shan't",
 'shouldn',
 "shouldn't",
 'wasn',
 "wasn't",
 'weren',
 "weren't",
 'won',
 "won't",
 'wouldn',
 "wouldn't"]

In [70]:
words_in_news_not_nltk = difference(most_freq_clean_text, stop_words)

In [71]:
words_in_news_not_nltk

['said',
 'trump',
 'would',
 'president',
 'U.S.',
 'people',
 'one',
 'state',
 'also',
 'new',
 'donald',
 'house',
 'government',
 'states',
 'republican',
 'clinton',
 'twitter-handle',
 'obama',
 'could',
 'united',
 'told',
 'white',
 'campaign',
 'like',
 'last',
 'party',
 'two',
 'election',
 'time',
 'first',
 'former',
 'year',
 'country',
 'news',
 'even',
 '!',
 'many',
 'years',
 'security',
 'may',
 'say',
 'percent',
 'republicans',
 'national',
 'court',
 'since',
 'political',
 'made',
 'law',
 'get',
 'make',
 'presidential',
 'hillary',
 'going',
 'administration',
 'senate',
 'media',
 'police',
 'russia',
 'democratic',
 'north',
 'american',
 'support',
 'week',
 'bill',
 'back',
 'including',
 'vote']

Looking at the remaining frequent words from the news text, they are all very concentrated on political news.

Looking at the remaining words from the NLTK stopwords list, the most notable thing is that the news text must be very male because *she's, hers and herself* weren't common to both lists.

If I were doing sentiment analysis, I would want to remove words indicating sentiment from a stop words list, but Fake News isn't necessarily negative or positive.

The use of stopwords may be indicative of Fake or True news, so I will not take them out at this point and then compare results of models with or without stopwords removed.

# Save Data

In [72]:
# Write the pre-processed dataframe to DATA_PATH only when SAVE_FILE is enabled.
# NOTE(review): the token / URL / handle columns hold Python lists, which CSV
# stores as their string representation -- confirm the downstream loader
# parses them back into lists.
if SAVE_FILE:
    df.to_csv(DATA_PATH + PRE_PROCESSED_DATA_FILE_NAME, index=False)