In [64]:
# import dependencies
import pandas as pd
import numpy as np
import csv
import os

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [65]:
fake_news_df = pd.read_csv(r"C:\Users\Mohammad\Downloads\archive (8)\fake_news.csv")
#fake_news_df = pd.read_csv("Input Resource/fake_news.csv")
display(fake_news_df)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


## Data Exploring and Cleaning

In [66]:
# check missing values in dataset
fake_news_df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [67]:
# replace NaN values in 'title' and 'text'. 
fake_news_df['title'].fillna("", inplace=True)
fake_news_df['text'].fillna("", inplace=True)

In [68]:
# combine 'title' and 'text' into a one single variable
fake_news_df['content'] = fake_news_df['title'] + ' ' + fake_news_df['text']
fake_news_df.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired Why the Trut...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...


In [69]:
# drop unnecessary column
fake_news_df.drop(["id", "author"],axis=1,inplace=True)

- [ ] Handling Missing Values: Drop or impute missing values depending on their importance and quantity.</br>
- [ ] Consistency Check: Ensure that all the data is consistent. </br>

## Data-preprocessing

In [70]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('wordnet')

# Create global instances and sets for efficiency
lemmatizer = WordNetLemmatizer()
stopwords_set = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mohammad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mohammad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [71]:
class NewsPreprocessor:
    
    def __init__(self):
        nltk.download('stopwords')
        nltk.download('wordnet')
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords_set = set(stopwords.words('english'))

    def preprocess(self, content):
        content = self._remove_non_alpha(content)
        content = self._lowercase(content)
        content = self._remove_stopwords_and_lemmatize(content)
        return content

    def _remove_non_alpha(self, text):
        return re.sub('[^a-zA-Z]+', ' ', text)

    def _lowercase(self, text):
        return text.lower()

    def _remove_stopwords_and_lemmatize(self, text):
        words = text.split()
        lemmatized_words = [self.lemmatizer.lemmatize(word) for word in words if word not in self.stopwords_set]
        return ' '.join(lemmatized_words)

In [72]:
import dask.dataframe as dd

preprocessor = NewsPreprocessor()

dask_df = dd.from_pandas(fake_news_df, npartitions=4)  

dask_df['content'] = dask_df['content'].map_partitions(
    lambda df: df.apply(lambda x: preprocessor.preprocess(str(x)))
).compute(scheduler='multiprocessing')

# convert back to pandas dataframe 
fake_news_df = dask_df.compute()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mohammad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mohammad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Descriptive Statistics

In [73]:
def descriptive_stats(tokens, num_tokens = 5, verbose=True) :
    """
        Given a list of tokens, print number of tokens, number of unique tokens, 
        number of characters, lexical diversity (https://en.wikipedia.org/wiki/Lexical_diversity), 
        and num_tokens most common tokens. Return a list with the number of tokens, number
        of unique tokens, lexical diversity, and number of characters. 
    
    """

    # Fill in the correct values here. 
    num_tokens = len(tokens)
    num_unique_tokens = len(set(tokens))
    if num_tokens ==0:
        lexical_diversity = 0
    else:
        lexical_diversity = num_unique_tokens / num_tokens
    num_characters = len("".join(tokens))
    
    
    if verbose :        
        print(f"There are {num_tokens} tokens in the data.")
        print(f"There are {num_unique_tokens} unique tokens in the data.")
        print(f"There are {num_characters} characters in the data.")
        print(f"The lexical diversity is {lexical_diversity:.3f} in the data.")
    
        
    return([num_tokens, num_unique_tokens,
            lexical_diversity,
            num_characters])



In [74]:
results = dask_df['content'].map_partitions(lambda fake_news_df: fake_news_df.apply\
                                            (lambda x: descriptive_stats(str(x).split(), verbose=False))).compute()

In [75]:
import ast

# Convert the string representation of list to actual list
results = results.apply(ast.literal_eval)


In [76]:
# Convert the Series to a DataFrame
results = pd.DataFrame(results.tolist(), columns=['num_tokens', 'num_unique_tokens', 'lexical_diversity', 'num_characters'])


In [77]:
results

Unnamed: 0,num_tokens,num_unique_tokens,lexical_diversity,num_characters
0,443,269,0.607223,2815
1,374,284,0.759358,2361
2,696,458,0.658046,4428
3,311,183,0.588424,1891
4,96,67,0.697917,635
...,...,...,...,...
20795,181,143,0.790055,1086
20796,585,304,0.519658,3122
20797,460,302,0.656522,2820
20798,174,113,0.649425,1158


In [78]:
if isinstance(fake_news_df, dd.DataFrame):
    fake_news_df = fake_news_df.compute()  # This brings the Dask DataFrame into memory as a Pandas DataFrame


In [79]:
fake_news_df.reset_index(drop=True, inplace=True)
results.reset_index(drop=True, inplace=True)


In [80]:
fake_news_df = pd.concat([fake_news_df, results], axis=1)


In [81]:
fake_news_df

Unnamed: 0,title,text,label,content,num_tokens,num_unique_tokens,lexical_diversity,num_characters
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1,house dem aide even see comey letter jason cha...,443,269,0.607223,2815
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0,flynn hillary clinton big woman campus breitba...,374,284,0.759358,2361
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1,truth might get fired truth might get fired oc...,696,458,0.658046,4428
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1,civilian killed single u airstrike identified ...,311,183,0.588424,1891
4,Iranian woman jailed for fictional unpublished...,Print An Iranian woman has been sentenced to ...,1,iranian woman jailed fictional unpublished sto...,96,67,0.697917,635
...,...,...,...,...,...,...,...,...
20795,Rapper T.I.: Trump a ’Poster Child For White S...,Rapper T. I. unloaded on black celebrities who...,0,rapper trump poster child white supremacy rapp...,181,143,0.790055,1086
20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",When the Green Bay Packers lost to the Washing...,0,n f l playoff schedule matchup odds new york t...,585,304,0.519658,3122
20797,Macy’s Is Said to Receive Takeover Approach by...,The Macy’s of today grew from the union of sev...,0,macy said receive takeover approach hudson bay...,460,302,0.656522,2820
20798,"NATO, Russia To Hold Parallel Exercises In Bal...","NATO, Russia To Hold Parallel Exercises In Bal...",1,nato russia hold parallel exercise balkan nato...,174,113,0.649425,1158


In [82]:
def aggregate_stats(dataframe):
    """
    Given a dataframe with columns 'num_tokens', 'num_unique_tokens', 'lexical_diversity', 
    and 'num_characters', calculate the min, max, mean, and median for each column.

    Returns a dictionary with keys as the column names and values as another dictionary
    containing the min, max, mean, and median.
    """

    agg_funcs = ['min', 'max', 'mean', 'median']
    
    results = dataframe[['num_tokens', 'num_unique_tokens', 'lexical_diversity', 'num_characters']].agg(agg_funcs)

    stats_dict = results.to_dict()

    return stats_dict


In [83]:
stats_results = aggregate_stats(fake_news_df)
stats_results


{'num_tokens': {'min': 0.0,
  'max': 12065.0,
  'mean': 426.1338942307692,
  'median': 319.0},
 'num_unique_tokens': {'min': 0.0,
  'max': 3366.0,
  'mean': 260.86572115384615,
  'median': 209.0},
 'lexical_diversity': {'min': 0.0,
  'max': 1.0,
  'mean': 0.6746853211104239,
  'median': 0.6652849607395062},
 'num_characters': {'min': 0.0,
  'max': 67861.0,
  'mean': 2629.1567307692308,
  'median': 1971.0}}

## Descriptive Analysis 

### AlJazeera News

In [107]:
#import Aljazeera news
news_AlJazeera_01 = pd.read_csv("C:\\Users\\Mohammad\\Desktop\\AlJazeera\\news_articles_01.csv")
news_AlJazeera_02 = pd.read_csv("C:\\Users\\Mohammad\\Desktop\\AlJazeera\\news_articles_02.csv")
news_AlJazeera_03 = pd.read_csv("C:\\Users\\Mohammad\\Desktop\\AlJazeera\\news_articles_03.csv")
news_AlJazeera_04 = pd.read_csv("C:\\Users\\Mohammad\\Desktop\\AlJazeera\\news_articles_04.csv")
news_AlJazeera_05 = pd.read_csv("C:\\Users\\Mohammad\\Desktop\\AlJazeera\\news_AlJazeera.xls")

In [108]:
frames = [news_AlJazeera_01,news_AlJazeera_02,news_AlJazeera_03,news_AlJazeera_04,news_AlJazeera_05]
AlJazeera_df = pd.concat(frames, ignore_index = True)
AlJazeera_df

Unnamed: 0,title,link,description,content
0,Trump and ‘MAGA Re­pub­li­can ex­trem­ists’ th...,https://www.aljazeera.com/news/2023/9/28/trump...,Biden has un­der­scored the risk to democ­ra­c...,United States President Joe Biden has lashed o...
1,East Libya com­man­der Haf­tar in Moscow for t...,https://www.aljazeera.com/news/2023/9/28/east-...,"Krem­lin spokesper­son con­firms meet­ing, say...","Renegade general Khalifa Haftar, whose forces ..."
2,Be­larus says Pol­ish he­li­copter vi­o­lat­ed...,https://www.aljazeera.com/news/2023/9/28/belar...,Be­larus has re­peat­ed­ly ac­cused the EU mem...,Belarus has claimed that a Polish helicopter h...
3,"Pho­tos: In Brazil’s Ama­zon, drought af­fects...",https://www.aljazeera.com/gallery/2023/9/28/ph...,"With riv­er lev­els drop­ping, wa­ter trans­po...",
4,Blinken meet­ing In­di­an coun­ter­part amid f...,https://www.aljazeera.com/news/2023/9/28/blink...,Cana­da PM says he re­ceived as­sur­ances that...,US Secretary of State Antony Blinken is meetin...
...,...,...,...,...
421,At least 100 killed as Syr­i­an mil­i­tary col...,https://www.aljazeera.com/news/2023/10/5/syria...,At least 100 peo­ple killed and 240 oth­ers wo...,A drone attack on a military college in Syria’...
422,"What’s fen­tanyl, and why have deaths due to d...",https://www.aljazeera.com/news/2023/10/5/whats...,"In 2021, two-thirds of drug-re­lat­ed over­dos...",The United States is imposing sanctions and la...
423,How is crick­et played? A sim­ple il­lus­trat­...,https://www.aljazeera.com/sports/longform/2023...,Al Jazeera breaks down crick­et terms that wil...,
424,‘My son was afraid. He asked me for a kiss. It...,https://www.aljazeera.com/news/2023/10/5/syria...,The 2013 Lampe­dusa tragedy known as the chil­...,"Lampedusa, Italy –Every year, Refaat Hazima vi..."


## Data Exploring and Cleaning

In [109]:
# check missing values in dataset
AlJazeera_df.isnull().sum()

title           0
link            0
description     0
content        46
dtype: int64

In [110]:
AlJazeera_df['content'] = AlJazeera_df['content'].dropna()

In [111]:
#remove duplicates
AlJazeera_df = AlJazeera_df.drop_duplicates(subset=['content'])

## Data-preprocessing

In [112]:
AlJazeera_dask_df = dd.from_pandas(AlJazeera_df, npartitions=4)  

AlJazeera_dask_df['content'] = AlJazeera_dask_df['content'].map_partitions(
    lambda df: df.apply(lambda x: preprocessor.preprocess(str(x)))
).compute(scheduler='multiprocessing')

In [113]:
# convert back to pandas dataframe 
AlJazeera_df = AlJazeera_dask_df.compute()

In [114]:
AlJazeera_df

Unnamed: 0,title,link,description,content
0,Trump and ‘MAGA Re­pub­li­can ex­trem­ists’ th...,https://www.aljazeera.com/news/2023/9/28/trump...,Biden has un­der­scored the risk to democ­ra­c...,united state president joe biden lashed republ...
1,East Libya com­man­der Haf­tar in Moscow for t...,https://www.aljazeera.com/news/2023/9/28/east-...,"Krem­lin spokesper­son con­firms meet­ing, say...",renegade general khalifa haftar whose force do...
2,Be­larus says Pol­ish he­li­copter vi­o­lat­ed...,https://www.aljazeera.com/news/2023/9/28/belar...,Be­larus has re­peat­ed­ly ac­cused the EU mem...,belarus claimed polish helicopter violated air...
3,"Pho­tos: In Brazil’s Ama­zon, drought af­fects...",https://www.aljazeera.com/gallery/2023/9/28/ph...,"With riv­er lev­els drop­ping, wa­ter trans­po...",na
4,Blinken meet­ing In­di­an coun­ter­part amid f...,https://www.aljazeera.com/news/2023/9/28/blink...,Cana­da PM says he re­ceived as­sur­ances that...,u secretary state antony blinken meeting india...
...,...,...,...,...
420,Kevin Mc­Carthy is out as US House speak­er. W...,https://www.aljazeera.com/podcasts/2023/10/5/k...,Cal­i­for­nia con­gress­man from Cal­i­for­nia...,took round voting u house representative elect...
421,At least 100 killed as Syr­i­an mil­i­tary col...,https://www.aljazeera.com/news/2023/10/5/syria...,At least 100 peo­ple killed and 240 oth­ers wo...,drone attack military college syria homs provi...
422,"What’s fen­tanyl, and why have deaths due to d...",https://www.aljazeera.com/news/2023/10/5/whats...,"In 2021, two-thirds of drug-re­lat­ed over­dos...",united state imposing sanction launching indic...
424,‘My son was afraid. He asked me for a kiss. It...,https://www.aljazeera.com/news/2023/10/5/syria...,The 2013 Lampe­dusa tragedy known as the chil­...,lampedusa italy every year refaat hazima visit...


## Descriptive Statistics for Aljazeera news

In [115]:
AlJazeera_results = AlJazeera_dask_df['content'].map_partitions(lambda AlJazeera_df: AlJazeera_df.apply\
                                            (lambda x: descriptive_stats(str(x).split(), verbose=False))).compute()

In [116]:
# Convert the string representation of list to actual list
AlJazeera_results = AlJazeera_results.apply(ast.literal_eval)

In [117]:
# Convert the Series to a DataFrame
AlJazeera_results = pd.DataFrame(AlJazeera_results.tolist(), columns=['num_tokens', 'num_unique_tokens', 'lexical_diversity', 'num_characters'])

In [118]:
AlJazeera_results

Unnamed: 0,num_tokens,num_unique_tokens,lexical_diversity,num_characters
0,476,298,0.626050,3039
1,217,160,0.737327,1381
2,127,95,0.748031,844
3,1,1,1.000000,2
4,507,285,0.562130,3375
...,...,...,...,...
294,102,91,0.892157,611
295,466,284,0.609442,2912
296,473,286,0.604651,3136
297,1079,631,0.584801,6650


In [119]:
if isinstance(AlJazeera_df, dd.DataFrame):
    AlJazeera_df = AlJazeera_df.compute()  # This brings the Dask DataFrame into memory as a Pandas DataFrame

In [120]:
AlJazeera_df.reset_index(drop=True, inplace=True)
AlJazeera_results.reset_index(drop=True, inplace=True)

In [121]:
AlJazeera_df = pd.concat([AlJazeera_df, AlJazeera_results], axis=1)

In [122]:
AlJazeera_df

Unnamed: 0,title,link,description,content,num_tokens,num_unique_tokens,lexical_diversity,num_characters
0,Trump and ‘MAGA Re­pub­li­can ex­trem­ists’ th...,https://www.aljazeera.com/news/2023/9/28/trump...,Biden has un­der­scored the risk to democ­ra­c...,united state president joe biden lashed republ...,476,298,0.626050,3039
1,East Libya com­man­der Haf­tar in Moscow for t...,https://www.aljazeera.com/news/2023/9/28/east-...,"Krem­lin spokesper­son con­firms meet­ing, say...",renegade general khalifa haftar whose force do...,217,160,0.737327,1381
2,Be­larus says Pol­ish he­li­copter vi­o­lat­ed...,https://www.aljazeera.com/news/2023/9/28/belar...,Be­larus has re­peat­ed­ly ac­cused the EU mem...,belarus claimed polish helicopter violated air...,127,95,0.748031,844
3,"Pho­tos: In Brazil’s Ama­zon, drought af­fects...",https://www.aljazeera.com/gallery/2023/9/28/ph...,"With riv­er lev­els drop­ping, wa­ter trans­po...",na,1,1,1.000000,2
4,Blinken meet­ing In­di­an coun­ter­part amid f...,https://www.aljazeera.com/news/2023/9/28/blink...,Cana­da PM says he re­ceived as­sur­ances that...,u secretary state antony blinken meeting india...,507,285,0.562130,3375
...,...,...,...,...,...,...,...,...
294,Kevin Mc­Carthy is out as US House speak­er. W...,https://www.aljazeera.com/podcasts/2023/10/5/k...,Cal­i­for­nia con­gress­man from Cal­i­for­nia...,took round voting u house representative elect...,102,91,0.892157,611
295,At least 100 killed as Syr­i­an mil­i­tary col...,https://www.aljazeera.com/news/2023/10/5/syria...,At least 100 peo­ple killed and 240 oth­ers wo...,drone attack military college syria homs provi...,466,284,0.609442,2912
296,"What’s fen­tanyl, and why have deaths due to d...",https://www.aljazeera.com/news/2023/10/5/whats...,"In 2021, two-thirds of drug-re­lat­ed over­dos...",united state imposing sanction launching indic...,473,286,0.604651,3136
297,‘My son was afraid. He asked me for a kiss. It...,https://www.aljazeera.com/news/2023/10/5/syria...,The 2013 Lampe­dusa tragedy known as the chil­...,lampedusa italy every year refaat hazima visit...,1079,631,0.584801,6650


In [124]:
AlJazeera_stats_results = aggregate_stats(AlJazeera_df)
AlJazeera_stats_results

{'num_tokens': {'min': 1.0,
  'max': 1269.0,
  'mean': 340.5083612040134,
  'median': 303.0},
 'num_unique_tokens': {'min': 1.0,
  'max': 631.0,
  'mean': 222.5351170568562,
  'median': 208.0},
 'lexical_diversity': {'min': 0.29009433962264153,
  'max': 1.0,
  'mean': 0.6955307652494974,
  'median': 0.6830065359477124},
 'num_characters': {'min': 2.0,
  'max': 8556.0,
  'mean': 2176.7959866220735,
  'median': 1947.0}}