# Data Cleaning
- Prepares a cleaned copy of the dataset so that the file is smaller and repeated operations run faster.
- See EDA.ipynb for an explanation of each cleaning step.

In [1]:
import time
import pandas as pd

# Columns that hold free text; all substitutions below are applied to these.
_TEXT_COLUMNS = ('title', 'text')

# Boilerplate removed as plain substrings (not regex).
_BOILERPLATE_PHRASES = (
    "- The Onion - America's Finest News Source",  # real news only
    '- The New York Times',  # real news only
    '- Breitbart',  # real news only
    'Factbox:',  # real news only
    'Breaking:',  # fake news only
)

# (regex, replacement) pairs, applied in this order.
_REGEX_SUBSTITUTIONS = (
    # clickbait tags in fake news, e.g. "(VIDEO)" or "[Watch]": the keyword
    # wrapped by one non-space, non-word character on each side
    (r'[^\s\w][Vv][Ii][Dd][Ee][Oo][^\s\w]', ''),
    (r'[^\s\w][Ww][Aa][Tt][Cc][Hh][^\s\w]', ''),
    (r'[^\s\w][Ii][Mm][Aa][Gg][Ee][^\s\w]', ''),
    (r'[^\s\w][Dd][Ee][Tt][Aa][Ii][Ll][Ss][^\s\w]', ''),
    # fake news is likely to have these
    # NOTE: a 'fuss' feature flagging these in the title was considered instead
    (r'[Ww][Oo][Ww]!', ''),
    (r'[Ll][Oo][Ll]!', ''),
    # NOTE: a 'slang' feature for censored words (r'[\w]+[\*]+[\w]+') was
    # considered but is not produced here
    # social media handle or e-mail-like token -> placeholder
    (r'[^\s]*[\@]+[^\s]*', '@SocialMediaAccount'),
    # website address -> placeholder
    (r'[^\s]*//[^\s]+[.][^\s]+', '__httpAddr__'),
)

# Title keywords whose rows are dropped to avoid topic bias.
_REAL_ONLY_TOPICS = 'Korea|China|Syria|EU|Iran|Brexit|Mexico|Turkey|Saudi'  # real only
_FAKE_ONLY_TOPICS = 'Wikileaks'  # fake only


def _filter_rows(df):
    """Return a copy of *df* without too-short or topic-biased rows."""
    label_is_zero = df.label == 0

    # avoid empty or website address: title needs at least 2 words
    title_ok = df.title.str.split().str.len() >= 2
    # you need at least 5 words to describe who, when, where, what, how
    # (no why for breaking news)
    text_ok = df.text.str.split().str.len() >= 5

    # rows with label 0 are exempt from both length checks
    keep = (label_is_zero | title_ok) & (label_is_zero | text_ok)

    # topic bias (case-sensitive substring match on the title)
    keep &= ~df.title.str.contains(_REAL_ONLY_TOPICS)
    keep &= ~df.title.str.contains(_FAKE_ONLY_TOPICS)

    # copy so later column assignments do not write into a view of the input
    return df[keep].copy()


def _scrub_text(df):
    """Strip boilerplate and mask handles/URLs in the text columns of *df*."""
    for col in _TEXT_COLUMNS:
        cleaned = df[col]
        for phrase in _BOILERPLATE_PHRASES:
            cleaned = cleaned.str.replace(phrase, '', regex=False)
        for pattern, replacement in _REGEX_SUBSTITUTIONS:
            cleaned = cleaned.str.replace(pattern, replacement, regex=True)
        df[col] = cleaned
    return df


def clean_dataset(fo: str, fi: str = 'data/WELFake_Dataset.csv') -> None:
    """Clean the raw news CSV at *fi* and write the result to *fo*.

    Steps, in order:
      1. Drop rows with missing values, the unused ``Unnamed: 0`` index
         column (if present), and exact duplicate rows.
      2. Store ``label`` as a categorical to reduce memory use.
      3. Drop rows whose title has fewer than 2 words or whose text has fewer
         than 5 words (rows with ``label == 0`` are exempt), and rows whose
         title mentions a topic seen in only one class.
      4. In ``title`` and ``text``, remove journal names, clickbait tags and
         "WOW!"/"LOL!", and replace handles and URLs with placeholders.

    The input must provide ``title``, ``text`` and ``label`` columns.
    The elapsed time in seconds is printed; nothing is returned.
    """
    begin = time.time()

    df = pd.read_csv(fi)

    ##------ Drop data we don't use
    df = df.dropna()
    # The index column is unique per row, so it must go BEFORE de-duplicating;
    # otherwise no two rows ever compare equal.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')  # Unused column
    df = df.drop_duplicates()  # returns a new frame; result must be kept

    ##------ Reduce memory size
    df = df.astype({'label': 'category'})

    ##------ Drop rows with too short title or text, and avoid topic bias
    ##------ For deployment, entry with this condition will give a warning message
    df = _filter_rows(df)

    ##------ Remove journal names / clickbait, mask handles and addresses
    df = _scrub_text(df)

    df.to_csv(fo, index=False)

    end = time.time()

    print('Took', end - begin)

# Run the cleaning once on the default input and store the result.
clean_dataset(fo='data/cleaned.csv')

Took 14.811845779418945
