In [None]:
import os

import cleantext as ct
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

## Define the paths of the raw datasets and the model below

In [None]:
# Locations of the raw input files, given relative to the working directory.
# The paths are assembled with os.path.join so the separator matches the host
# OS; on Windows the resulting strings are identical to the previous
# hard-coded backslash versions.
fake_news_path = os.path.join('..', 'data', '995,000_rows.csv')

liar_path = os.path.join('liar_dataset', 'valid.tsv')

## Sort label types:
https://github.com/several27/FakeNewsCorpus/blob/v1.0/README.md \
https://www.rand.org/research/projects/truth-decay/fighting-disinformation/search/items/opensources.html
- Fake:                    
    - Sources that entirely fabricate information, disseminate deceptive content, or grossly distort actual news reports
- Satire:   
    - Sources that use humor, irony, exaggeration, ridicule, and false information to comment on current events.
- Bias:       
    - Sources that come from a particular point of view and may rely on propaganda, decontextualized information, and opinions distorted as facts.
- Conspiracy:  
    - Sources that are well-known promoters of kooky conspiracy theories.
- Rumor:
    - Sources that traffic in rumors, gossip, innuendo, and unverified claims.
- State News (type `state`):
    - Sources in repressive states operating under government sanction.
- Junksci:	    
    - Sources that promote pseudoscience, metaphysics, naturalistic fallacies, and other scientifically dubious claims.
- Hate:        
    - Sources that actively promote racism, misogyny, homophobia, and other forms of discrimination.
- Clickbait:   
    - Sources that provide generally credible content, but use exaggerated, misleading, or questionable headlines, social media descriptions, and/or images.
- Unreliable:  
    - Sources that may be reliable but whose contents require further verification.
- Political:   
    - Sources that provide generally verifiable information in support of certain points of view or political orientations.
- Reliable:    
    - Sources that circulate news and information in a manner consistent with traditional and ethical practices in journalism (Remember: even credible sources sometimes rely on clickbait-style headlines or occasionally make mistakes. No news organization is perfect, which is why a healthy news diet consists of multiple sources of information).

### Based on the label explanations above from the dataset Readme.md, the types are divided into two main groups:
- 1 = fake
- 0 = not fake

In [None]:
# Binary code for every FakeNewsCorpus article type: '1' = fake, '0' = not fake.
# The codes are stored as one-character strings.
fake_dict = dict(
    fake='1',
    satire='1',
    bias='1',
    conspiracy='1',
    rumor='1',
    state='0',
    junksci='1',
    hate='1',
    clickbait='0',
    unreliable='0',
    political='0',
    reliable='0',
)

## Load the LIAR data in the same format as FakeNews
https://datasets.activeloop.ai/docs/ml/datasets/liar-dataset/

In [None]:
# Binary code for each of the six LIAR truthfulness ratings: 1 = fake, 0 = not fake.
# The ratings form an ordered scale
#   pants-fire < false < barely-true | half-true < mostly-true < true
# and the cut is placed in the middle of it, so a less truthful rating is never
# coded "not fake" while a more truthful one is coded "fake".
# (Previously 'barely-true' was 0 and 'half-true' was 1, which inverted that order.)
liar_dict = {'pants-fire'   : 1,
             'false'        : 1,
             'barely-true'  : 1,
             'half-true'    : 0,
             'mostly-true'  : 0,
             'true'         : 0}

In [None]:
def Clean(df):
    """Run ``cleantext.clean`` over every entry of ``df['content']``.

    The options passed to ``ct.clean`` request unicode fixing, ASCII
    transliteration, lower-casing, whitespace normalisation and removal of
    line breaks, and ask for URLs, e-mail addresses and numbers to be replaced
    by ``<URL>``, ``<EMAIL>`` and ``<NUM>``.  Replacement of phone numbers,
    digits, currency symbols, punctuation and emoji is switched off.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``content`` column.  The column is overwritten on this
        very object.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``content`` cleaned.
    """
    print('Clean')
    options = dict(fix_unicode = True,
                   to_ascii = True,
                   lower = True,
                   normalize_whitespace = True,
                   no_line_breaks = True,
                   strip_lines = True,
                   keep_two_line_breaks = False,
                   no_urls = True,
                   no_emails = True,
                   no_phone_numbers = False,
                   no_numbers = True,
                   no_digits = False,
                   no_currency_symbols = False,
                   no_punct = False,
                   no_emoji = False,
                   replace_with_url = "<URL>",
                   replace_with_email = "<EMAIL>",
                   replace_with_phone_number = "<PHONE>",
                   replace_with_number = "<NUM>",
                   replace_with_digit = "<DIG>",
                   replace_with_currency_symbol = "<CUR>",
                   replace_with_punct = "",
                   lang = "en")
    # Map over the column rather than addressing rows as df.loc[i, ...] with
    # i in range(len(df)): label lookups by position only work when the index
    # is exactly 0..n-1 and raise KeyError on filtered or shuffled frames.
    df["content"] = df["content"].map(lambda text: ct.clean(text = text, **options))
    return df

In [None]:
def OpenRawLiar(path):
    """Read a raw LIAR ``.tsv`` file and keep only two of its columns.

    The file is parsed as header-less, tab-separated text.  Column 1 is
    returned under the name ``type`` and column 2 under the name ``content``;
    every other column is discarded.

    Parameters
    ----------
    path : str or path-like
        Location of the tab-separated file.

    Returns
    -------
    pandas.DataFrame
        Frame with exactly the columns ``type`` and ``content``.
    """
    print('OpenRawLiar')
    raw = pd.read_table(path, header=None)
    # Pick the two wanted columns directly instead of dropping a hard-coded
    # list of the twelve others, which fails whenever the file width differs.
    return raw[[1, 2]].rename(columns={1: 'type', 2: 'content'})
    
def OpenRawFakeNews(path):
    """Load only the ``type`` and ``content`` columns of the CSV file at ``path``.

    Returns the resulting ``pandas.DataFrame``; any other column in the file
    is not read.
    """
    print('OpenRawFakeNews')
    return pd.read_csv(path, usecols=['type', 'content'])

In [None]:
def RemoveUnknownTypes(df:pd.DataFrame, type_dict:dict) -> pd.DataFrame:
    """Drop every row whose ``type`` is not a key of ``type_dict``.

    Rows with a missing ``type`` are dropped as well, since a missing value is
    not a key.  The frame is modified in place and its index is renumbered
    from 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``type`` column; modified in place.
    type_dict : dict
        Mapping whose keys are the accepted type names.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, holding only rows with a known type.
    """
    print('RemoveUnknownTypes')
    # Renumber first so the labels are unique: the label-based drop below then
    # removes exactly the unknown rows whatever index the caller passed in
    # (the earlier df.loc[i, ...] loop required a 0..n-1 index).
    df.reset_index(drop=True, inplace=True)
    unknown = ~df['type'].isin(list(type_dict))  # vectorised membership test
    df.drop(index=df.index[unknown], inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

In [None]:
def XyData(df:pd.DataFrame, type_dict:dict):
    """Turn a raw ``type``/``content`` frame into features ``X`` and labels ``y``.

    Rows whose type is not a key of ``type_dict`` are removed through
    ``RemoveUnknownTypes``; rows lacking a ``type`` or a ``content`` value are
    removed next and the index is renumbered.  The ``type`` column is then
    taken out of the frame and translated with ``type_dict``.

    The frame handed back by ``RemoveUnknownTypes`` is edited in place and
    returned as ``X``.  Text cleaning with ``Clean`` is not applied here.

    Returns
    -------
    (X, y)
        ``X`` is the frame without its ``type`` column; ``y`` is a one-column
        DataFrame named ``type`` holding the translated labels.
    """
    frame = RemoveUnknownTypes(df, type_dict)

    # Discard incomplete rows, then renumber so X and y share a clean index.
    frame.dropna(subset=['type', 'content'], inplace=True)
    frame.reset_index(drop=True, inplace=True)

    # pop() removes the column from the frame and hands it over in one step.
    y = frame.pop('type').replace(type_dict).to_frame()

    return frame, y

In [None]:
def Split(X, y):
    """Split ``X`` and ``y`` into train, test and validation parts.

    A first ``train_test_split`` call keeps 80 % for training; a second call
    cuts the remaining 20 % in half, yielding the test part and the validation
    part.  Both calls shuffle with ``random_state=1``.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_val, y_train, y_test, y_val)``
    """
    # Options common to both splitting steps.
    shuffle_opts = dict(random_state = 1, shuffle = True)

    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, train_size = 0.8, **shuffle_opts)

    # Of the second call's outputs, the first half is used as the test set
    # and the second half as the validation set.
    X_test, X_val, y_test, y_val = train_test_split(
        X_rest, y_rest, test_size = 0.5, **shuffle_opts)

    return X_train, X_test, X_val, y_train, y_test, y_val

In [None]:
def MakeFakeNews():
    """Build the FakeNews train/test/validation sets and save them as CSV.

    The raw file at ``fake_news_path`` is loaded with ``OpenRawFakeNews``,
    labelled with ``fake_dict`` through ``XyData`` and divided by ``Split``.
    The six resulting frames are written, without their index, to the files
    ``X_train``, ``X_test``, ``X_val``, ``y_train``, ``y_test`` and ``y_val``
    inside the ``data`` directory.  Returns nothing.
    """
    parts = Split(*XyData(OpenRawFakeNews(fake_news_path), fake_dict))
    # Same order as the tuple returned by Split.
    names = ('X_train', 'X_test', 'X_val', 'y_train', 'y_test', 'y_val')

    print('Saving')
    # Create the output directory when missing; to_csv does not create it.
    os.makedirs('data', exist_ok=True)
    for name, frame in zip(names, parts):
        # os.path.join picks the host separator; a literal 'data\\name' is a
        # directory path only on Windows.
        frame.to_csv(os.path.join('data', name), index=False)

def MakeLiar():
    """Build the LIAR feature/label frames and save them as CSV.

    The raw file at ``liar_path`` is loaded with ``OpenRawLiar`` and labelled
    with ``liar_dict`` through ``XyData``.  The two resulting frames are
    written, without their index, to the files ``X_liar`` and ``y_liar``
    inside the ``data`` directory.  Returns nothing.
    """
    X_liar, y_liar = XyData(OpenRawLiar(liar_path), liar_dict)
    print('Saving')
    # Create the output directory when missing; to_csv does not create it.
    os.makedirs('data', exist_ok=True)
    # os.path.join picks the host separator; a literal 'data\\X_liar' is a
    # directory path only on Windows.
    X_liar.to_csv(os.path.join('data', 'X_liar'), index=False)
    y_liar.to_csv(os.path.join('data', 'y_liar'), index=False)

# Run both dataset builders when this cell executes: MakeFakeNews() saves the
# six FakeNews split files first, then MakeLiar() saves the two LIAR files.
MakeFakeNews()
MakeLiar()