In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Preprocessing

In [2]:
class Preprocessing:
    """Cleaning pipeline for a DataFrame of labelled news articles.

    :meth:`preprocess` runs the steps in a fixed order: cast the text columns
    to ``str``, drop duplicate rows, turn ``'nan'`` placeholder strings back
    into real NaN, drop rows without a usable ``type`` label, drop columns
    whose NaN ratio exceeds ``nan_threshold`` and finally drop rows whose
    ``content`` is an empty string.

    The frame passed in is never modified: every step rebinds ``self._df`` to
    a newly derived frame instead of assigning into the caller's object.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. A ``type`` column is required; the text columns listed
        in ``_STRING_COLUMNS`` are processed only when present.
    debug : bool, default False
        When true, print which columns were dropped and the value counts of
        ``type`` after cleaning.
    nan_threshold : float, default 0.5
        Columns whose fraction of NaN values is strictly greater than this
        value are dropped.
    """

    # Columns cast to ``str`` by ``_set_dtypes``; absent columns are skipped so
    # the pipeline also works on frames that were already reduced.
    _STRING_COLUMNS = (
        'id',
        'domain',
        'content',
        'title',
        'authors',           # maybe list of authors
        'keywords',          # maybe list of keywords
        'meta_keywords',     # maybe list of meta_keywords
        'meta_description',
        'tags',              # maybe list of tags
        'summary',
    )

    def __init__(self, df, debug: bool = False, nan_threshold: float = 0.5):
        self._df = df
        self.debug = debug
        self.nan_threshold = nan_threshold

    def _set_dtypes(self):
        """Cast every known text column that exists in the frame to ``str``."""
        present = [col for col in self._STRING_COLUMNS if col in self._df.columns]
        # ``astype`` returns a new frame, so the caller's DataFrame is left untouched.
        self._df = self._df.astype({col: 'str' for col in present})

    def _remove_duplicates(self):
        """Drop rows that are exact duplicates of an earlier row."""
        self._df = self._df.drop_duplicates()

    def _remove_na(self):
        """Drop unlabelled rows, then drop columns that are mostly NaN.

        Raises
        ------
        KeyError
            If the frame has no ``type`` column.
        """
        # Remove nulls (and the 'unknown' placeholder) from the type column
        labelled = self._df['type'].notna() & (self._df['type'] != 'unknown')
        self._df = self._df[labelled]

        # Fraction of NaN per column; an empty frame yields NaN ratios, which
        # never compare greater than the threshold, so nothing is dropped.
        na_ratios = self._df.isna().mean()
        sparse = na_ratios[na_ratios > self.nan_threshold]

        # Remove columns with more than self.nan_threshold NaN values
        if self.debug:
            for col, na_percentage in sparse.items():
                print(f'Removed column [{col}] due to NaN percentage of {na_percentage:.2f}')
        self._df = self._df.drop(columns=sparse.index)

    def _remove_empty_strings(self):
        """Drop rows whose ``content`` is the empty string."""
        # ``content`` may have been dropped by ``_remove_na`` if it was too sparse.
        if 'content' in self._df.columns:
            self._df = self._df[self._df['content'] != '']

    def _replace_nan(self):
        """Turn placeholder strings back into real NaN values.

        The ``str`` cast in ``_set_dtypes`` can turn missing values into the
        literal text ``'nan'``; in ``meta_keywords`` the text ``"['']"`` is
        treated as missing as well.
        """
        self._df = self._df.replace('nan', np.nan)
        if 'meta_keywords' in self._df.columns:
            self._df['meta_keywords'] = self._df['meta_keywords'].replace("['']", np.nan)

    def preprocess(self):
        """Run the full cleaning pipeline and return the cleaned DataFrame.

        Returns
        -------
        pandas.DataFrame
            The cleaned frame (also kept on the instance as ``self._df``).

        Raises
        ------
        KeyError
            If the frame has no ``type`` column.
        """
        self._set_dtypes()
        self._remove_duplicates()
        # Must run before ``_remove_na`` so placeholder strings count as NaN.
        self._replace_nan()
        self._remove_na()
        self._remove_empty_strings()

        if self.debug: self.print_unique_values(['type'])

        return self._df

    def print_unique_values(self, columns: list = ['type']):
        """Print the row count and the value counts for each column in ``columns``."""
        for col in columns:
            print(f'Unique values for [{col}] out of {len(self._df[col])}: \n {self._df[col].value_counts()}')


In [3]:
def preprocess(df):
    """Run the ``Preprocessing`` pipeline on ``df`` with debug output enabled.

    Returns whatever ``Preprocessing.preprocess`` returns for that frame.
    """
    return Preprocessing(df, debug=True).preprocess()

In [4]:
def df_value_counts(df, columns: list = ['type']):
    """Print, for each requested column, its row count followed by its value counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    columns : list, default ['type']
        Names of the columns to report on.
    """
    for name in columns:
        series = df[name]
        counts = series.value_counts()
        print(f'Unique values for [{name}] out of {len(series)}: \n {counts}')

# Loading the dataset

In [5]:
# Open the sample CSV as a chunked reader: iterating over `reader` yields
# DataFrames of up to 10,000 rows each instead of loading the file at once.
reader = pd.read_csv(
    '../data/news/news_sample.csv',
    # Columns named in this set are not loaded.
    usecols=lambda name: name not in {
        'Unnamed: 0', 'source', 'scraped_at', 'inserted_at', 'updated_at', 'url',
    },
    chunksize=10000,
    encoding='utf-8',
    lineterminator='\n',
    # Report malformed lines with a warning rather than raising.
    on_bad_lines='warn',
)

In [6]:


for chunk_number, chunk in enumerate(reader):
    # Banner lines make each chunk's debug output easy to tell apart.
    print('')
    print(f'==================== {chunk_number} ====================')

    # `df` is rebound on every iteration, so after the loop it holds only the
    # last chunk that was preprocessed.
    df = preprocess(chunk)

    print('========================================================')
    print('')

    # Preprocess only the first N chunks: the check runs after the chunk was
    # handled, so chunks 0 through 3 are processed before stopping.
    if chunk_number == 3:
        break



Removed column [keywords] due to NaN percentage of 1.00
Removed column [meta_keywords] due to NaN percentage of 0.90
Removed column [meta_description] due to NaN percentage of 0.78
Removed column [tags] due to NaN percentage of 0.85
Removed column [summary] due to NaN percentage of 1.00
Unique values for [type] out of 9891: 
 type
fake          3663
conspiracy    3161
political      935
junksci        567
unreliable     497
hate           256
reliable       202
satire         200
clickbait      190
bias           157
rumor           63
Name: count, dtype: int64


Removed column [keywords] due to NaN percentage of 1.00
Removed column [meta_keywords] due to NaN percentage of 0.87
Removed column [meta_description] due to NaN percentage of 0.86
Removed column [tags] due to NaN percentage of 0.92
Removed column [summary] due to NaN percentage of 1.00
Unique values for [type] out of 9478: 
 type
fake          6839
political     1788
conspiracy     563
bias           249
junksci         20
u

# EDA

In [7]:
# Preview the first rows of `df`, which at this point is the result of the
# last `preprocess` call made in the chunk loop.
df.head()

Unnamed: 0,id,domain,type,content,title,authors
30000,33694,beforeitsnews.com,fake,Sacramento Seeing E. coli Cluster\n\n% of read...,Sacramento Seeing E. coli Cluster,Marler Blog
30001,33695,beforeitsnews.com,fake,San Diego Bali Hai Restaurant and Arrowhead Lo...,San Diego Bali Hai Restaurant and Arrowhead Lo...,Marler Blog
30002,33696,awarenessact.com,conspiracy,"The Mayan Calendar has made news before, shock...",san bushmen – Awareness Act,"Gerald Sinclair, Elizabeth Deville"
30003,33697,awarenessact.com,conspiracy,Much of the ancient history taught in our scho...,Elizabeth DeVille,"Gerald Sinclair, Elizabeth Deville"
30004,33698,beforeitsnews.com,fake,‘The Team’ via Peggy Black: Consciousness is U...,‘The Team’ via Peggy Black: Consciousness is U...,The Scenario


In [8]:
# Show every row of `df` that has a missing value in at least one column.
df.loc[df.isna().any(axis=1)]

Unnamed: 0,id,domain,type,content,title,authors
30017,33711,beforeitsnews.com,fake,29\n\nBy The 2012 Scenario on Sunday Dec 31 20...,"Stories in the ""2012"" category",
30029,33724,beforeitsnews.com,fake,Is There Something Else Going-On Many of Us ha...,"David Icke talks about Grenfell Fire, a “Hunge...",
30032,33727,beforeitsnews.com,fake,Climate Change Global Temperatures Plummet as ...,Climate Change Global Temperatures Plummet as ...,
30033,33728,beforeitsnews.com,fake,The Beginning of Big Pharma and its Connection...,The Beginning of Big Pharma and its Connection...,
30034,33729,beforeitsnews.com,fake,"Temperature Trend Stalls, Clouds and Volcanic ...","Temperature Trend Stalls, Clouds and Volcanic ...",
...,...,...,...,...,...,...
39990,44852,frontpagemag.com,bias,If you wonder why Hollywood stayed so quiet so...,The Leftist Enablers of Perv Photog Terry Rich...,
39991,44853,frontpagemag.com,bias,"""Betrayed.""\n\nVictor Avila is a survivor. Sof...",An ICE Agent's Quest for Justice,
39995,44857,beforeitsnews.com,fake,More Defending of The Indefensible Ketogenic D...,More Defending of The Indefensible Ketogenic Diet,
39996,44858,beforeitsnews.com,fake,Brilliant Light Power September Update\n\n% of...,Brilliant Light Power September Update,
