# Preliminary Steps

In [None]:

!pip install krovetzstemmer

Collecting krovetzstemmer
  Downloading KrovetzStemmer-0.8.tar.gz (112 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.9/112.9 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: krovetzstemmer
  Building wheel for krovetzstemmer (setup.py) ... [?25ldone
[?25h  Created wheel for krovetzstemmer: filename=KrovetzStemmer-0.8-cp311-cp311-linux_x86_64.whl size=377862 sha256=735a7f27274581f580e1955254fe63a009dbef34e89c7580e6e256c434817ccd
  Stored in directory: /root/.cache/pip/wheels/3a/c1/dd/0200a30b35de8aa1a7e25a5f59c75eb144058e23229597cade
Successfully built krovetzstemmer
Installing collected packages: krovetzstemmer
Successfully installed krovetzstemmer-0.8

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[3

## Import required libraries 

In [None]:
import numpy as np
import pandas as pd

# For text processing, cleaning
import contractions

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.stem import SnowballStemmer, LancasterStemmer
from nltk.corpus import wordnet
from nltk.sentiment import SentimentIntensityAnalyzer

import spacy

from gensim.parsing.preprocessing import PorterStemmer

from krovetzstemmer import Stemmer

import re

from collections import Counter

from textblob import TextBlob 

from tqdm import tqdm
tqdm.pandas()

## Import Dataset

In [None]:
true = pd.read_csv('/work/20240407-150440/News _dataset/True.csv')
false = pd.read_csv('/work/20240407-150440/News _dataset/Fake.csv')

## View each dataset 

In [None]:
# view true dataset
true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [None]:
# view false dataset
false.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [None]:
print(true.shape)
print(false.shape)

(21417, 6)
(23481, 6)


There will be some class imbalance, hence we will utilise tree-based algorithms like Decision Trees, Random Forests, and Gradient Boosting Machines when training the model, as all of these can handle class imbalance well.

## Combine both datasets into a single Dataframe for easy access

In [None]:
# Add a 'label' column indicating the truth value of each dataset 
# with '1' representing true 
# and '0' representing false 
true['label'] = 1
false['label'] = 0

# add another 'label_meaning' column to explain what the above boolean values mean
# this allows for better presentation, when performing visualisation latter 
true['label_meaning'] = True 
false['label_meaning'] = False

# Concatenate the two dataframes into a single dataframe
# easier to perform algorithmic analysis on 
combined_news = pd.concat([true, false], ignore_index=True)

# shuffle the rows 
# this prevents the first half of the dataset being all true values, and second half being false
# this allows for better presentation 
news = combined_news.sample(frac=1).reset_index(drop=True)

# view the combined dataset
news.head()

Unnamed: 0,title,text,subject,date,label,label_meaning
0,OBAMA LIED To Protect Hillary..New Wikileaks E...,Remember Combetta is Hillary s Oh Sh*t IT guy:...,politics,"Oct 25, 2016",0,False
1,This Cop Sees Black Lives Matter In A Way Tha...,The battle between supporters of Black Lives M...,News,"July 21, 2016",0,False
2,WHOA! DNC Releases Statement Suggesting Dallas...,"Of course tomorrow morning, the Democrat Party...",left-news,"Jul 8, 2016",0,False
3,Venezuela systematically abused foes in 2017 p...,CARACAS (Reuters) - Venezuela systematically ...,worldnews,"November 29, 2017",1,True
4,Democrats in Congress brace for new Iran nucle...,WASHINGTON (Reuters) - As Congress faces a pos...,worldnews,"September 20, 2017",1,True


## Displaying additional information about dataset

In [None]:
# Displaying all columns and their data types: 
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   title          44898 non-null  object
 1   text           44898 non-null  object
 2   subject        44898 non-null  object
 3   date           44898 non-null  object
 4   label          44898 non-null  int64 
 5   label_meaning  44898 non-null  bool  
dtypes: bool(1), int64(1), object(4)
memory usage: 1.8+ MB


In [None]:
# Size of the dataframe 
news.shape

(44898, 6)

# Data Cleaning

## Check for null rows

In [None]:
# Check if any of the columns contains NULL value
news.isnull().sum(axis=0)

title            0
text             0
subject          0
date             0
label            0
label_meaning    0
dtype: int64

None of the columns had any NULL values, so there is no need to replace NULL with median text or title wordcount, or drop any rows with NULL values. 

# Text Preprocessing 

Relevant columns with textual content are 'title' and 'text'

('subject' is excluded, since it is a categorical variable)

The following steps will be thus be applied to both 'title' and text' columns

## 1. Convert all text to lowercase

Ensures that words with the same characters are treated as the same word, despite having different cases

E.g. "Apple" and "apple" are now considered the same
This makes it easier to compare words and identify patterns

In [None]:
news['title'] = news['title'].str.lower()
news['text'] = news['text'].str.lower()

# view the dataframe
news.head()

Unnamed: 0,title,text,subject,date,label,label_meaning
0,obama lied to protect hillary..new wikileaks e...,remember combetta is hillary s oh sh*t it guy:...,politics,"Oct 25, 2016",0,False
1,this cop sees black lives matter in a way tha...,the battle between supporters of black lives m...,News,"July 21, 2016",0,False
2,whoa! dnc releases statement suggesting dallas...,"of course tomorrow morning, the democrat party...",left-news,"Jul 8, 2016",0,False
3,venezuela systematically abused foes in 2017 p...,caracas (reuters) - venezuela systematically ...,worldnews,"November 29, 2017",1,True
4,democrats in congress brace for new iran nucle...,washington (reuters) - as congress faces a pos...,worldnews,"September 20, 2017",1,True


## 2. Contraction Splitting 

Breaks down contractions into their constituent parts
E.g.:
Original Text: "I can't believe it's raining." 
After Contraction Splitting: "I can not believe it is raining."  

By doing this, text becomes more explicit and easier for machines to understand

In [None]:
# define a function to remove contractions from text
def remove_contractions(text):
    return ' '.join([contractions.fix(word) for word in text.split()])

# for every row in each column 'title' and 'text'
# apply remove_contractions() to remove contractions
news['title'] = news['title'].progress_map(lambda x: remove_contractions(x))
news['text'] = news['text'].progress_map(lambda x: remove_contractions(x))

# view the dataframe
news.head()

100%|██████████| 44898/44898 [00:01<00:00, 36697.13it/s]
100%|██████████| 44898/44898 [00:35<00:00, 1248.01it/s]


Unnamed: 0,title,text,subject,date,label,label_meaning
0,obama lied to protect hillary..new wikileaks e...,remember combetta is hillary s oh sh*t it guy:...,politics,"Oct 25, 2016",0,False
1,this cop sees black lives matter in a way that...,the battle between supporters of black lives m...,News,"July 21, 2016",0,False
2,whoa! dnc releases statement suggesting dallas...,"of course tomorrow morning, the democrat party...",left-news,"Jul 8, 2016",0,False
3,venezuela systematically abused foes in 2017 p...,caracas (reuters) - venezuela systematically a...,worldnews,"November 29, 2017",1,True
4,democrats in congress brace for new iran nucle...,washington (reuters) - as congress faces a pos...,worldnews,"September 20, 2017",1,True


## 3. Remove punctuation 

Ensures that the text is cleaned and standardised

Punctuations are unnecessary characters that don't contribute significantly to the meaning of text 
Helps in creating cleaner and meaningful tokens

In [None]:
# define a function to remove punctuation 
def remove_punctuations(text):
    # Regular expression pattern to remove punctuation
    pattern = re.compile(r'[^\w\s]')
    cleaned_text = pattern.sub('', text)
    
    return cleaned_text

# for every row in each column 'title' and 'text'
# apply remove_punctuations() to remove contractions
news['title'] = news['title'].progress_map(lambda x: remove_punctuations(x))
news['text'] = news['text'].progress_map(lambda x: remove_punctuations(x))


# view dataframe
news.head()

100%|██████████| 44898/44898 [00:00<00:00, 254777.67it/s]
100%|██████████| 44898/44898 [00:02<00:00, 15452.49it/s]


Unnamed: 0,title,text,subject,date,label,label_meaning
0,obama lied to protect hillarynew wikileaks ema...,remember combetta is hillary s oh sht it guybr...,politics,"Oct 25, 2016",0,False
1,this cop sees black lives matter in a way that...,the battle between supporters of black lives m...,News,"July 21, 2016",0,False
2,whoa dnc releases statement suggesting dallas ...,of course tomorrow morning the democrat party ...,left-news,"Jul 8, 2016",0,False
3,venezuela systematically abused foes in 2017 p...,caracas reuters venezuela systematically abus...,worldnews,"November 29, 2017",1,True
4,democrats in congress brace for new iran nucle...,washington reuters as congress faces a possib...,worldnews,"September 20, 2017",1,True


## 4. Stemming 

In [None]:
news2 = news
news3 = news
news4 = news
news5 = news
news6 = news
news7 = news 

In [None]:
# Initialize NLTK Snowball Stemmer
snowball_stemmer = SnowballStemmer("english")

news2['title'] = news2['title'].apply(lambda x: ' '.join([snowball_stemmer.stem(word) for word in x.split()]))

# view the dataframe
news2.head()

Unnamed: 0,title,text,subject,date,label,label_meaning
0,obama lie to protect hillarynew wikileak email...,remember combetta is hillary s oh sht it guybr...,politics,"Oct 25, 2016",0,False
1,this cop see black live matter in a way that w...,the battle between supporters of black lives m...,News,"July 21, 2016",0,False
2,whoa dnc releas statement suggest dalla sniper...,of course tomorrow morning the democrat party ...,left-news,"Jul 8, 2016",0,False
3,venezuela systemat abus foe in 2017 protest ri...,caracas reuters venezuela systematically abus...,worldnews,"November 29, 2017",1,True
4,democrat in congress brace for new iran nuclea...,washington reuters as congress faces a possib...,worldnews,"September 20, 2017",1,True


In [None]:
lancaster_stemmer = LancasterStemmer()

news3['title'] = news3['title'].apply(lambda x: ' '.join([lancaster_stemmer.stem(word) for word in x.split()]))

news3.head()

Unnamed: 0,title,text,subject,date,label,label_meaning
0,obam lie to protect hillarynew wikileak email ...,remember combetta is hillary s oh sht it guybr...,politics,"Oct 25, 2016",0,False
1,thi cop see black liv mat in a way that wil ma...,the battle between supporters of black lives m...,News,"July 21, 2016",0,False
2,who dnc relea stat suggest dall snip and black...,of course tomorrow morning the democrat party ...,left-news,"Jul 8, 2016",0,False
3,venezuel system ab foe in 2017 protest right g...,caracas reuters venezuela systematically abus...,worldnews,"November 29, 2017",1,True
4,democr in congress brac for new ir nuclear fight,washington reuters as congress faces a possib...,worldnews,"September 20, 2017",1,True


In [None]:
!python -m spacy download en_core_web_sm

nlp = spacy.load("en_core_web_sm")


# spacy lemmatisation
news5['title'] = news5['title'].progress_apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)]))

news5.head()


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m65.0 MB/s[0m eta [36m0:00:00[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
100%|██████████| 44898/44898 [07:02<00:00, 106.22it/s]


Unnamed: 0,title,text,subject,date,label,label_meaning
0,obam lie to protect hillarynew wikileak email ...,remember combetta is hillary s oh sht it guybr...,politics,"Oct 25, 2016",0,False
1,thi cop see black liv mat in a way that wil ma...,the battle between supporters of black lives m...,News,"July 21, 2016",0,False
2,who dnc relea stat suggest dall snip and black...,of course tomorrow morning the democrat party ...,left-news,"Jul 8, 2016",0,False
3,venezuel system ab foe in 2017 protest right g...,caracas reuters venezuela systematically abus...,worldnews,"November 29, 2017",1,True
4,democr in congress brac for new ir nuclear fight,washington reuters as congress faces a possib...,worldnews,"September 20, 2017",1,True


In [None]:

!python -m textblob.download_corpora

# Initialize TextBlob's Porter stemmer
def textblob_porter_stemming(text):
    blob = TextBlob(text)
    return ' '.join([word.stem() for word in blob.words])

# Apply TextBlob's Porter Stemmer
news6['title'] = news6['title'].progress_map(textblob_porter_stemming)

news6.head()

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.
100%|██████████| 44898/44898 [00:17<00:00, 2517.66it/s]


Unnamed: 0,title,text,subject,date,label,label_meaning
0,obam lie to protect hillarynew wikileak email ...,remember combetta is hillary s oh sht it guybr...,politics,"Oct 25, 2016",0,False
1,thi cop see black liv mat in a way that wil ma...,the battle between supporters of black lives m...,News,"July 21, 2016",0,False
2,who dnc relea stat suggest dall snip and black...,of course tomorrow morning the democrat party ...,left-news,"Jul 8, 2016",0,False
3,venezuel system ab foe in 2017 protest right g...,caracas reuters venezuela systematically abus...,worldnews,"November 29, 2017",1,True
4,democr in congress brac for new ir nuclear fight,washington reuters as congress faces a possib...,worldnews,"September 20, 2017",1,True


In [None]:
porter_stemmer = PorterStemmer()

# Apply Gensim's Porter Stemmer
news7['title'] = news7['title'].progress_map(lambda x: ' '.join([porter_stemmer.stem(word) for word in x.split()]))

news7.head()

100%|██████████| 44898/44898 [00:01<00:00, 32823.40it/s]


Unnamed: 0,title,text,subject,date,label,label_meaning
0,obam lie to protect hillarynew wikileak email ...,remember combetta is hillary s oh sht it guybr...,politics,"Oct 25, 2016",0,False
1,thi cop see black liv mat in a wai that wil ma...,the battle between supporters of black lives m...,News,"July 21, 2016",0,False
2,who dnc relea stat suggest dall snip and black...,of course tomorrow morning the democrat party ...,left-news,"Jul 8, 2016",0,False
3,venezuel system ab foe in 2017 protest right g...,caracas reuters venezuela systematically abus...,worldnews,"November 29, 2017",1,True
4,democr in congress brac for new ir nuclear fight,washington reuters as congress faces a possib...,worldnews,"September 20, 2017",1,True


In [None]:
krovetz_stemmer = Stemmer()

news4['title'] = news4['title'].apply(lambda x: ' '.join([krovetz_stemmer.stem(word) for word in x.split()]))

news4.head()


Unnamed: 0,title,text,subject,date,label,label_meaning
0,obam lie to protect hillarynew wikileak email ...,remember combetta is hillary s oh sht it guybr...,politics,"Oct 25, 2016",0,False
1,thi cop see black liv mat in a wai that wil ma...,the battle between supporters of black lives m...,News,"July 21, 2016",0,False
2,who dnc relea stat suggest dall snip and black...,of course tomorrow morning the democrat party ...,left-news,"Jul 8, 2016",0,False
3,venezuel system ab foe in 2017 protest right g...,caracas reuters venezuela systematically abus...,worldnews,"November 29, 2017",1,True
4,democr in congress brac for new ir nuclear fight,washington reuters as congress faces a possib...,worldnews,"September 20, 2017",1,True


## 5. Lemmatisation 

Lemmatization is the process of converting a word to its base form.

Another process commonly used for this purpose is stemming.
The difference between stemming and lemmatization is, lemmatization considers the context and converts the word to its meaningful base form, whereas stemming just removes the last few characters, often leading to incorrect meanings and spelling errors.

We choose lemmatisation in our case, because we believe that the context of the words is important in detecting fake news.

In [None]:
# initiate instance of WordNetLemmatizer class 
lemmatizer = WordNetLemmatizer()

# download necessary resources
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# input: nltk POS (part-of-speech) tag 
# output: returns the corresponding WordNet POS tag for input tag 
def nltkToWordnet(nltk_tag):
    
    # functionality:
    # checks starting character of nltk POS tag to determine its category 
    # (Adjective, Verb, Noun, Adverb) 
    
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
        
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
        
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
        
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
        
    else:  
        # if tag does not match any of specified categories 
        return None
    
# input: list of tokens 
# output: returns a list of lemmatised tokens 
def lemmatise(tokens):

    # uses nltk.pos_tag to tag each token with its corresponding POS tag 
    pos_tags = nltk.pos_tag(tokens)    
    # create a list to store lemmatised tokens 
    res_words = []

    # iterate through each token and its tag 
    for word, tag in pos_tags:

        # convert nltk POS tag to a WordNet POS tag 
        tag = nltkToWordnet(tag)  

        # if token couldn't be mapped to any WordNet POS tag 
        if tag is None:          
            # original token is appended to 'res_words'
            res_words.append(word)

        # if a valid WordNet POS tag is obtained 
        else:
            # lemmatise the token 
            # append the lematised word to "res_words' 
            res_words.append(lemmatizer.lemmatize(word, tag))

    # return list of lemmatised tokens 
    return res_words


# for every row in each column 'title' and 'text',
# we apply lemmatise() to lemmatise tokens in the row
news['title'] = news['title'].progress_map(lambda x: lemmatise(x))
news['text'] = news['text'].progress_map(lambda x: lemmatise(x))

# view the dataframe
news.head()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
100%|██████████| 44898/44898 [00:22<00:00, 1999.19it/s]
100%|██████████| 44898/44898 [07:47<00:00, 95.99it/s] 


Unnamed: 0,title,text,subject,date,label,label_meaning
0,"[donald, trump, be, right, again, yous, fund, ...","[arkansas, senator, tom, cotton, come, out, in...",politics,"Mar 26, 2016",0,False
1,"[a, hard, knock, life, for, london, rough, sle...","[london, reuters, outside, london, s, piccadil...",worldnews,"December 21, 2017",1,True
2,"[powerful, men, read, hate, tweet, to, female,...","[in, a, powerful, new, web, psa, video, female...",News,"April 26, 2016",0,False
3,"[stand, up, and, cheer, ukip, party, leader, s...","[he, s, be, europe, s, version, of, the, outsp...",politics,"Mar 8, 2016",0,False
4,"[syrian, observatory, islamic, state, capture,...","[beirut, reuters, the, syrian, observatory, fo...",worldnews,"October 1, 2017",1,True


## 6. Remove stopwords

> Stop words are a set of commonly used words in a language. Examples include “a,” “the,” “is,” “are,” etc. They will be filtered out in Natural Language Processing to focus more on the meaningful and informative words.

In [None]:
nltk.download('words') #download list of english words
nltk.download('stopwords') #download list of stopwords

stopWords = stopwords.words('english')
englishWords = set(nltk.corpus.words.words())

# define a function to return tokens that are English but aren't stop words 
# as well as the number of stopwords removed (used for visualisation) 
def remove_stopWords(tokens):
    return [w for w in tokens if (w in englishWords and w not in stopWords)]
    
# for every row in each column 'title' and 'text',
# we apply remove_stopWords() to remove stopwords from the row's contents
# and generate number of stopwords removed 

news['title'] = news['title'].progress_apply(lambda x: remove_stopWords(x))
news['text'] = news['text'].progress_apply(lambda x: remove_stopWords(x))

# view the dataframe
news.head()

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
100%|██████████| 44898/44898 [00:01<00:00, 32499.28it/s]
100%|██████████| 44898/44898 [00:19<00:00, 2263.01it/s]


Unnamed: 0,title,text,subject,date,label,label_meaning
0,"[trump, right, fund, hundred, billion]","[senator, cotton, come, agreement, trump, last...",politics,"Mar 26, 2016",0,False
1,"[hard, knock, life, rough, sleeper]","[outside, piccadilly, theater, ticket, cost, p...",worldnews,"December 21, 2017",1,True
2,"[powerful, men, read, hate, tweet, female, spo...","[powerful, new, web, video, female, sport, wri...",News,"April 26, 2016",0,False
3,"[stand, cheer, party, leader, slam, eu, invasi...","[version, outspoken, ted, time, leader, indepe...",politics,"Mar 8, 2016",0,False
4,"[observatory, state, capture, town, government]","[observatory, human, right, report, state, fig...",worldnews,"October 1, 2017",1,True


# Feature Engineering 

As text data is hard to visualise in itself, we create additional features which we can use to generate more insights about the text data

## 1. Generate Sentiment Polarity

> Polarity refers to the overall sentiment conveyed by a particular text, phrase or word. This polarity is expressed as a numerical rating known as a “sentiment score”.

### Hypothesis: Fake news will have higher sentiment polarity, to provoke extreme emotional reactions from users.  

In [None]:
# for every row in each column 'title' and 'text',
# we apply TextBlob() to generate sentiment polarity score 
news['title_polarity_score'] = news['title'].progress_map(lambda text: TextBlob(str(text)).sentiment.polarity)
news['text_polarity_score'] = news['text'].progress_map(lambda text: TextBlob(str(text)).sentiment.polarity)

# view the dataframe 
news.head()

100%|██████████| 44898/44898 [00:06<00:00, 6824.09it/s]
100%|██████████| 44898/44898 [01:25<00:00, 527.10it/s]


Unnamed: 0,title,text,subject,date,label,label_meaning,title_polarity_score,text_polarity_score
0,"[trump, right, fund, hundred, billion]","[senator, cotton, come, agreement, trump, last...",politics,"Mar 26, 2016",0,False,0.285714,0.106944
1,"[hard, knock, life, rough, sleeper]","[outside, piccadilly, theater, ticket, cost, p...",worldnews,"December 21, 2017",1,True,-0.195833,-0.049499
2,"[powerful, men, read, hate, tweet, female, spo...","[powerful, new, web, video, female, sport, wri...",News,"April 26, 2016",0,False,-0.166667,0.059101
3,"[stand, cheer, party, leader, slam, eu, invasi...","[version, outspoken, ted, time, leader, indepe...",politics,"Mar 8, 2016",0,False,0.0,0.005556
4,"[observatory, state, capture, town, government]","[observatory, human, right, report, state, fig...",worldnews,"October 1, 2017",1,True,0.0,0.055898


## 2. Word and Character Length 

### Hypothesis: Fake news will have shorter word and character lengths, so that people can read it faster, which increases outreach. 

In [None]:
# find the number of words and characters
# for every row in each column 'title' and 'text' 
news['title_char_len'] = news['title'].astype(str).progress_map(len) 
news['title_word_len'] = news['title'].progress_map(lambda x: len(str(x).split()))
news['text_char_len'] = news['text'].astype(str).progress_map(len) 
news['text_word_len'] = news['text'].progress_map(lambda x: len(str(x).split()))

# view dataframe
news.head()

100%|██████████| 44898/44898 [00:00<00:00, 1554646.30it/s]
100%|██████████| 44898/44898 [00:00<00:00, 493223.94it/s]
100%|██████████| 44898/44898 [00:00<00:00, 1161869.82it/s]
100%|██████████| 44898/44898 [00:01<00:00, 41783.14it/s]


Unnamed: 0,title,text,subject,date,label,label_meaning,title_polarity_score,text_polarity_score,title_char_len,title_word_len,text_char_len,text_word_len
0,"[trump, right, fund, hundred, billion]","[senator, cotton, come, agreement, trump, last...",politics,"Mar 26, 2016",0,False,0.285714,0.106944,48,5,2197,222
1,"[hard, knock, life, rough, sleeper]","[outside, piccadilly, theater, ticket, cost, p...",worldnews,"December 21, 2017",1,True,-0.195833,-0.049499,45,5,3547,362
2,"[powerful, men, read, hate, tweet, female, spo...","[powerful, new, web, video, female, sport, wri...",News,"April 26, 2016",0,False,-0.166667,0.059101,99,11,1452,154
3,"[stand, cheer, party, leader, slam, eu, invasi...","[version, outspoken, ted, time, leader, indepe...",politics,"Mar 8, 2016",0,False,0.0,0.005556,92,10,1069,113
4,"[observatory, state, capture, town, government]","[observatory, human, right, report, state, fig...",worldnews,"October 1, 2017",1,True,0.0,0.055898,57,5,1347,131


## 3. Parts-of-speech tagging (POS)

> POS tagging is process of labeling words in a text with their corresponding parts of speech (e.g., noun, verb, adjective), to better understand the grammatical structure of news articles' titles and texts.

### Hypothesis: Fake news will have more adjectives and adverbs than real news, to provide a vivid fake description to provoke extreme emotional reaction among users. 

In [None]:
# Download required resources for POS
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

# Function to return new columns for POS count for any sentence
def pos_tagging(sentence):
    # Create temp dict
    pos_dict = {}
    # Get pos tagging in list
    pos_list = nltk.pos_tag(sentence)
    # Simplify
    pos_list = [nltk.map_tag('en-ptb', 'universal', tag) for _, tag in pos_list]
    pos_dict = Counter(pos_list)
    return [pos_dict['ADJ'] if 'ADJ' in pos_dict else 0, 
            pos_dict['ADV'] if 'ADV' in pos_dict else 0, 
            pos_dict['NOUN'] if 'NOUN' in pos_dict else 0,
            pos_dict['NUM'] if 'NUM' in pos_dict else 0,
            pos_dict['PRON'] if 'PRON' in pos_dict else 0,
            pos_dict['VERB'] if 'VERB' in pos_dict else 0]

# Add 1 column per pos type in Title
news['title_pos_adj'], news['title_pos_adv'], news['title_pos_noun'], news['title_pos_num'], news['title_pos_pron'], news['title_pos_verb'] = zip(*news['title'].progress_map(pos_tagging))
                                                       
# Add 1 column per pos type in Text
news['text_pos_adj'], news['text_pos_adv'], news['text_pos_noun'], news['text_pos_num'], news['text_pos_pron'], news['text_pos_verb'] = zip(*news['text'].progress_map(pos_tagging))

# view dataframe
news.head()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
100%|██████████| 44898/44898 [00:14<00:00, 2994.56it/s]
100%|██████████| 44898/44898 [04:47<00:00, 156.41it/s]


Unnamed: 0,title,text,subject,date,label,label_meaning,title_polarity_score,text_polarity_score,title_char_len,title_word_len,...,title_pos_noun,title_pos_num,title_pos_pron,title_pos_verb,text_pos_adj,text_pos_adv,text_pos_noun,text_pos_num,text_pos_pron,text_pos_verb
0,"[trump, right, fund, hundred, billion]","[senator, cotton, come, agreement, trump, last...",politics,"Mar 26, 2016",0,False,0.285714,0.106944,48,5,...,2,1,0,1,37,10,117,15,2,33
1,"[hard, knock, life, rough, sleeper]","[outside, piccadilly, theater, ticket, cost, p...",worldnews,"December 21, 2017",1,True,-0.195833,-0.049499,45,5,...,3,0,0,0,69,15,186,9,1,65
2,"[powerful, men, read, hate, tweet, female, spo...","[powerful, new, web, video, female, sport, wri...",News,"April 26, 2016",0,False,-0.166667,0.059101,99,11,...,4,0,0,3,28,11,96,0,0,17
3,"[stand, cheer, party, leader, slam, eu, invasi...","[version, outspoken, ted, time, leader, indepe...",politics,"Mar 8, 2016",0,False,0.0,0.005556,92,10,...,8,0,0,1,24,12,48,0,1,26
4,"[observatory, state, capture, town, government]","[observatory, human, right, report, state, fig...",worldnews,"October 1, 2017",1,True,0.0,0.055898,57,5,...,4,0,0,0,30,2,82,1,1,12


Since no columns are empty strings, no empty string will have to be replaced by NA values.

# Save cleaned data

In [None]:
news.to_csv('cleaned_news.csv', index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=9f23536d-6e62-418a-b94b-ca9356b2599b' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>