In [1]:
# import dependencies
import csv
import os
from collections import Counter

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer



In [2]:
# Load the fake-news dataset; the path is relative to the notebook's working directory.
fake_news_df = pd.read_csv("Input Resource/fake_news.csv")
# Render the loaded frame for a first look at its columns and rows.
display(fake_news_df)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


## Data Exploration and Cleaning

In [3]:
# Count the missing entries in each column of the dataset
fake_news_df.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [4]:
# Replace NaN values in 'title' and 'text' with empty strings.
# The result is assigned back instead of calling `fillna(..., inplace=True)` on a
# selected column: that chained in-place form is deprecated in pandas 2.x and no
# longer updates the parent DataFrame once Copy-on-Write is enabled.
fake_news_df = fake_news_df.fillna({'title': "", 'text': ""})

In [5]:
# Build a single 'content' column: the title, one space, then the article text
fake_news_df['content'] = fake_news_df['title'].str.cat(fake_news_df['text'], sep=' ')
fake_news_df.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired Why the Trut...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...


In [6]:
# Remove the 'id' and 'author' columns from the frame (in place)
fake_news_df.drop(columns=["id", "author"], inplace=True)

- [ ] Handling Missing Values: Drop or impute missing values depending on their importance and quantity.
- [ ] Consistency Check: Ensure that all the data is consistent.

## Data Preprocessing

In [14]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK resources used for stop-word removal and lemmatization
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Module-level stop-word set and lemmatizer, created once for efficiency
stopwords_set = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ruddysimonpour/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ruddysimonpour/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
class NewsPreprocessor:
    """
    Normalize raw news text for analysis.

    The pipeline replaces every run of non-letter characters with a space,
    lowercases the text, drops English stop words and lemmatizes the
    remaining words.
    """

    # Runs of anything other than ASCII letters; compiled once and shared by all instances.
    _NON_ALPHA_PATTERN = re.compile('[^a-zA-Z]+')

    def __init__(self):
        # Make sure the NLTK resources needed below are available before they are loaded.
        nltk.download('stopwords')
        nltk.download('wordnet')
        self.lemmatizer = WordNetLemmatizer()
        # A set gives constant-time membership checks while filtering words.
        self.stopwords_set = set(stopwords.words('english'))

    def preprocess(self, content):
        """
        Return the cleaned, space-separated version of `content`.

        Any non-string input (for example None, or the float NaN that pandas
        uses for a missing cell) yields an empty string instead of raising a
        TypeError inside the regular-expression step.
        """
        if not isinstance(content, str):
            return ''
        content = self._remove_non_alpha(content)
        content = self._lowercase(content)
        content = self._remove_stopwords_and_lemmatize(content)
        return content

    def _remove_non_alpha(self, text):
        """Replace each run of non-letter characters with a single space."""
        return self._NON_ALPHA_PATTERN.sub(' ', text)

    def _lowercase(self, text):
        """Lowercase the text so stop-word matching is case-insensitive."""
        return text.lower()

    def _remove_stopwords_and_lemmatize(self, text):
        """Drop stop words, lemmatize the remaining words and re-join them with spaces."""
        words = text.split()
        lemmatized_words = [
            self.lemmatizer.lemmatize(word)
            for word in words
            if word not in self.stopwords_set
        ]
        return ' '.join(lemmatized_words)

In [16]:
import dask.dataframe as dd

preprocessor = NewsPreprocessor()

# Split the pandas frame into 4 Dask partitions
dask_df = dd.from_pandas(fake_news_df, npartitions=4)

# Clean the 'content' column partition by partition using the multiprocessing
# scheduler, then store the computed result back on the Dask frame.
dask_df['content'] = (
    dask_df['content']
    .map_partitions(lambda partition: partition.apply(preprocessor.preprocess))
    .compute(scheduler='multiprocessing')
)

# Materialize the Dask frame as a pandas DataFrame again
fake_news_df = dask_df.compute()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ruddysimonpour/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ruddysimonpour/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Descriptive Statistics

In [23]:
def descriptive_stats(tokens, num_tokens=5, verbose=True):
    """
        Summarize a list of tokens.

        Given a list of tokens, print number of tokens, number of unique tokens,
        number of characters, lexical diversity (https://en.wikipedia.org/wiki/Lexical_diversity),
        and the `num_tokens` most common tokens.

        Parameters:
            tokens: list of string tokens to summarize.
            num_tokens: how many of the most common tokens to print (default 5).
            verbose: when True, print the summary; when False, only return it.

        Returns:
            A list with the number of tokens, number of unique tokens,
            lexical diversity, and number of characters (in that order).
    """

    # Use separate names for the totals so the `num_tokens` argument (how many
    # common tokens to show) is not overwritten by the token count.
    token_count = len(tokens)
    unique_token_count = len(set(tokens))
    # An empty token list has no meaningful ratio; report 0 instead of dividing by zero.
    lexical_diversity = unique_token_count / token_count if token_count else 0.0
    num_characters = sum(len(token) for token in tokens)

    if verbose:
        print(f"There are {token_count} tokens in the data.")
        print(f"There are {unique_token_count} unique tokens in the data.")
        print(f"There are {num_characters} characters in the data.")
        print(f"The lexical diversity is {lexical_diversity:.3f} in the data.")
        most_common = Counter(tokens).most_common(num_tokens)
        print(f"The {num_tokens} most common tokens are: {most_common}")

    return [token_count, unique_token_count,
            lexical_diversity,
            num_characters]



In [27]:
# Run descriptive_stats on the whitespace-split content of every row, one
# partition at a time; each element of `results` is the returned 4-item list.
results = dask_df['content'].map_partitions(
    lambda partition: partition.apply(
        lambda text: descriptive_stats(text.split(), verbose=False)
    )
).compute()

# Unpack each position of the statistics list into its own column
dask_df = dask_df.assign(
    num_tokens=results.apply(lambda stats: stats[0]),
    num_unique_tokens=results.apply(lambda stats: stats[1]),
    lexical_diversity=results.apply(lambda stats: stats[2]),
    num_characters=results.apply(lambda stats: stats[3]),
)


In [25]:
# Materialize the Dask DataFrame as a pandas DataFrame
fake_news_df = dask_df.compute()

In [26]:
# Display the frame as the cell output (pandas truncates the middle rows of long frames)
fake_news_df

Unnamed: 0,title,text,label,content,num_tokens,num_unique_tokens,lexical_diversity,num_characters
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1,house dem aide even see comey letter jason cha...,443,269,0.607223,2815
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0,flynn hillary clinton big woman campus breitba...,374,284,0.759358,2361
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1,truth might get fired truth might get fired oc...,696,458,0.658046,4428
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1,civilian killed single u airstrike identified ...,311,183,0.588424,1891
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1,iranian woman jailed fictional unpublished sto...,96,67,0.697917,635
...,...,...,...,...,...,...,...,...
20795,Rapper T.I.: Trump a ’Poster Child For White S...,Rapper T. I. unloaded on black celebrities who...,0,rapper trump poster child white supremacy rapp...,181,143,0.790055,1086
20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",When the Green Bay Packers lost to the Washing...,0,n f l playoff schedule matchup odds new york t...,585,304,0.519658,3122
20797,Macy’s Is Said to Receive Takeover Approach by...,The Macy’s of today grew from the union of sev...,0,macy said receive takeover approach hudson bay...,460,302,0.656522,2820
20798,"NATO, Russia To Hold Parallel Exercises In Bal...","NATO, Russia To Hold Parallel Exercises In Bal...",1,nato russia hold parallel exercise balkan nato...,174,113,0.649425,1158


In [28]:
def aggregate_stats(dataframe):
    """
    Summarize the per-document statistics columns of a dataframe.

    The columns 'num_tokens', 'num_unique_tokens', 'lexical_diversity' and
    'num_characters' are each reduced to their min, max, mean and median.

    Returns a dictionary keyed by column name; each value is a dictionary
    mapping 'min', 'max', 'mean' and 'median' to the computed number.
    """
    stat_columns = ['num_tokens', 'num_unique_tokens', 'lexical_diversity', 'num_characters']

    # One aggregation call yields a small frame: one row per statistic, one column per metric.
    summary = dataframe[stat_columns].agg(['min', 'max', 'mean', 'median'])

    return summary.to_dict()


In [29]:
# Compute min/max/mean/median of the per-document statistics columns
stats_results = aggregate_stats(fake_news_df)
# Show the resulting nested dictionary as the cell output
stats_results


{'num_tokens': {'min': 0.0,
  'max': 12065.0,
  'mean': 426.1338942307692,
  'median': 319.0},
 'num_unique_tokens': {'min': 0.0,
  'max': 3366.0,
  'mean': 260.86572115384615,
  'median': 209.0},
 'lexical_diversity': {'min': 0.0,
  'max': 1.0,
  'mean': 0.6746853211104239,
  'median': 0.6652849607395062},
 'num_characters': {'min': 0.0,
  'max': 67861.0,
  'mean': 2629.1567307692308,
  'median': 1971.0}}

## Descriptive Analysis 

### Al Jazeera News

- [ ] Import all 5 Al Jazeera news datasets.
- [ ] Apply the `NewsPreprocessor` class to the datasets to preprocess and normalize the data.
- [ ] Then apply Dask partitioning to the DataFrame, following the fake-news example above.
- [ ] Last, apply the `aggregate_stats` function.