In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

## Models

In [None]:
def SimpelModel():
    """Build the baseline classifier.

    Returns:
        A scikit-learn pipeline made of a StandardScaler step followed by a
        LogisticRegression (random_state=0, max_iter=1000).
    """
    # Centering and variance scaling are both switched off, so this step
    # hands the features through unchanged.
    scaler = StandardScaler(with_mean=False, with_std=False)
    classifier = LogisticRegression(random_state=0, max_iter=1000)
    return make_pipeline(scaler, classifier)

def AdvancedModel():
    """Build the linear support-vector classifier.

    Returns:
        An unfitted LinearSVC (max_iter=1000, dual='auto', random_state=0).
    """
    # Seeded like the logistic-regression baseline so repeated runs give the
    # same model whenever the solver shuffles the data (dual formulation).
    return LinearSVC(max_iter=1000, dual='auto', random_state=0)

## Define path of raw dataset and model below

In [None]:
# Raw FakeNewsCorpus CSV. Forward slashes are used so the path resolves on
# Windows, Linux and macOS alike (backslashes only work on Windows).
fake_news_data = '../data/995,000_rows.csv'

# Classifier used by the training and evaluation cells below.
# Uncomment the next line (and comment out the one after) for the baseline.
#model = SimpelModel()
model = AdvancedModel()

# LIAR validation split, read by LiarData further down.
liar_data = 'liar_dataset/valid.tsv'

## Sort label types:
https://github.com/several27/FakeNewsCorpus/blob/v1.0/README.md \
https://www.rand.org/research/projects/truth-decay/fighting-disinformation/search/items/opensources.html
- Fake:                    
    - Sources that entirely fabricate information, disseminate deceptive content, or grossly distort actual news reports
- Satire:   
    - Sources that use humor, irony, exaggeration, ridicule, and false information to comment on current events.
- Bias:       
    - Sources that come from a particular point of view and may rely on propaganda, decontextualized information, and opinions distorted as facts.
- Conspiracy:  
    - Sources that are well-known promoters of kooky conspiracy theories.
- Rumor:
    - Sources that traffic in rumors, gossip, innuendo, and unverified claims.
- State News (`state`):
    - Sources in repressive states operating under government sanction.
- Junksci:	    
    - Sources that promote pseudoscience, metaphysics, naturalistic fallacies, and other scientifically dubious claims.
- Hate:        
    - Sources that actively promote racism, misogyny, homophobia, and other forms of discrimination.
- Clickbait:   
    - Sources that provide generally credible content, but use exaggerated, misleading, or questionable headlines, social media descriptions, and/or images.
- Unreliable:  
    - Sources that may be reliable but whose contents require further verification.
- Political:   
    - Sources that provide generally verifiable information in support of certain points of view or political orientations.
- Reliable:    
    - Sources that circulate news and information in a manner consistent with traditional and ethical practices in journalism (Remember: even credible sources sometimes rely on clickbait-style headlines or occasionally make mistakes. No news organization is perfect, which is why a healthy news diet consists of multiple sources of information).

### Based on the label explanations above from the dataset Readme.md, the types are divided into two main groups:
- 1 = fake
- 0 = not fake

In [None]:
# Binary target for every accepted FakeNewsCorpus article type:
# '1' = fake, '0' = not fake. Types missing from this mapping are treated
# as unknown by the cleaning step.
fake_dict = dict(
    fake='1',
    satire='1',
    bias='1',
    conspiracy='1',
    rumor='1',
    state='0',
    junksci='1',
    hate='1',
    clickbait='0',
    unreliable='0',
    political='0',
    reliable='0',
)

## Load the LIAR data in the same format as the FakeNewsCorpus data
https://datasets.activeloop.ai/docs/ml/datasets/liar-dataset/

In [None]:
# Binary target for the six LIAR truthfulness ratings: 1 = fake, 0 = not fake.
# Listed from least to most truthful so the cut-off is visible at a glance.
# 'barely-true' sits below 'half-true' on that scale, so it belongs to the
# fake group and 'half-true' to the not-fake group (the two were swapped).
# NOTE(review): where to cut the scale is a labelling decision - confirm.
liar_dict = {'pants-fire'   : 1,
             'false'        : 1,
             'barely-true'  : 1,
             'half-true'    : 0,
             'mostly-true'  : 0,
             'true'         : 0}

def LiarData(csv_path):
    """Read a header-less LIAR TSV file and split it into features and labels.

    Column 1 becomes 'type', column 2 becomes 'content', and columns 0 and
    3-13 are discarded. Rows missing either kept column are removed. The
    'type' values are translated through liar_dict, and the resulting label
    counts are printed.

    Args:
        csv_path: Path of the tab-separated LIAR file.

    Returns:
        (X, y): X is a DataFrame holding only 'content'; y is a DataFrame
        holding only the translated 'type' column.
    """
    statements = (pd.read_table(csv_path, header=None)
                  .rename(columns={1: 'type', 2: 'content'})
                  .drop(columns=[0, *range(3, 14)])
                  .dropna(subset=['type', 'content'])
                  .reset_index(drop=True))

    y = pd.DataFrame(statements['type'].replace(liar_dict))
    print(y['type'].value_counts())

    X = statements.drop(columns=['type'])
    return X, y

# Load the LIAR statements and their binary labels; also prints the label counts.
X_liar, y_liar = LiarData(liar_data)

## Exclude articles without valid type

In [None]:
def RemoveUnknownTypes(input, unknown_types, output, known_types=None):
    """Split a raw article CSV into rows with a recognised type and the rest.

    Args:
        input: Path of the raw CSV; it must contain a 'type' column.
            (The name shadows the builtin but is kept for existing callers.)
        unknown_types: Path the rejected rows are written to.
        output: Path the accepted rows are written to.
        known_types: Iterable of accepted 'type' values. Defaults to the keys
            of the module-level fake_dict.

    Both files are written without the index column. Rows whose type is
    missing (NaN) count as unknown.
    """
    if known_types is None:
        known_types = fake_dict

    df = pd.read_csv(input)

    # One vectorized membership test instead of a Python loop over every row.
    is_known = df['type'].isin(list(known_types))

    # Boolean selection keeps the column header even when no row is rejected.
    df[~is_known].to_csv(unknown_types, index=False)
    df[is_known].to_csv(output, index=False)
    
# Writes rejected rows to 'UnknownTypes.csv' and the remaining rows to 'Data-UnknownTypes.csv'.
RemoveUnknownTypes(fake_news_data, 'UnknownTypes.csv',  'Data-UnknownTypes.csv')

## Divide the data into X and y data

In [None]:
def XyData(csv_path, X_path, y_path):
    """Write the feature and label columns of a cleaned CSV to separate files.

    Only 'type' and 'content' are read. Rows without content are dropped,
    'type' is translated through fake_dict, and the results are saved
    without an index column.

    Args:
        csv_path: Path of the cleaned article CSV.
        X_path: Destination for the 'content' column.
        y_path: Destination for the translated 'type' column.
    """
    wanted_columns = ['type', 'content']

    articles = (pd.read_csv(csv_path, usecols=wanted_columns)
                .dropna(subset=['content'])   # an article needs text
                .reset_index(drop=True))

    labels = pd.DataFrame(articles['type'].replace(fake_dict))
    features = articles.drop(columns=['type'])

    features.to_csv(X_path, index=False)
    labels.to_csv(y_path, index=False)

# Reads the cleaned file and writes the features to 'X.csv' and the labels to 'y.csv'.
XyData('Data-UnknownTypes.csv', 'X.csv', 'y.csv')

## Check label distribution

In [None]:
def CountAndPrintTypes(csv_path):
    """Print how often each value occurs in the 'type' column of a CSV file.

    Args:
        csv_path: Path of a CSV file that has a 'type' column.
    """
    type_counts = pd.read_csv(csv_path)['type'].value_counts()
    print(type_counts)
    
# Show how many articles fall into each label group in 'y.csv'.
CountAndPrintTypes('y.csv')

## Split data into train, test and validation sets

In [None]:
def Split(X_path, y_path):
    """Load the feature and label files and cut them into three partitions.

    80 % of the rows go to training; the remaining 20 % is halved into a test
    and a validation part. Both cuts shuffle with random_state=1.

    Args:
        X_path: Path of the feature CSV.
        y_path: Path of the label CSV.

    Returns:
        X_train, X_test, X_val, y_train, y_test, y_val
    """
    features = pd.read_csv(X_path)
    labels = pd.read_csv(y_path)

    # First cut: training rows versus a hold-out pool.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, labels, train_size=0.8, random_state=1, shuffle=True)

    # Second cut: the hold-out pool is divided evenly into test and validation.
    X_test, X_val, y_test, y_val = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=1, shuffle=True)

    return X_train, X_test, X_val, y_train, y_test, y_val

# Load 'X.csv' / 'y.csv' and keep the six resulting partitions for the cells below.
X_train, X_test, X_val, y_train, y_test, y_val = Split('X.csv', 'y.csv')

## Vectorize the chosen prediction data 

In [None]:
def Vectorizer(train, test, val, liar):
    """Turn the 'content' column of each frame into TF-IDF features.

    The vocabulary is fitted on `train` only; the other three frames are
    transformed with that fitted vocabulary. Every column other than
    'content' is dropped.

    Args:
        train, test, val, liar: DataFrames that each have a 'content' column.

    Returns:
        The vectorized train, test, val and liar data, in that order.
    """
    tfidf = TfidfVectorizer(stop_words='english', max_df=0.75)
    content_transformer = make_column_transformer((tfidf, 'content'),
                                                  remainder='drop')

    train_vec = content_transformer.fit_transform(train)
    test_vec, val_vec, liar_vec = (content_transformer.transform(frame)
                                   for frame in (test, val, liar))

    return train_vec, test_vec, val_vec, liar_vec

# Fit the TF-IDF vocabulary on the training split and vectorize all four data sets with it.
X_train_vec, X_test_vec, X_val_vec, X_liar_vec = Vectorizer(X_train, X_test, X_val, X_liar)

## Train model

In [None]:
def TrainModel(model, X, y):
    """Fit `model` on X using the 'type' column of y as the target.

    Args:
        model: Estimator exposing fit(X, y).
        X: Feature matrix.
        y: Object whose `type` attribute holds the labels (a DataFrame with a
            'type' column in this notebook).

    Returns:
        Whatever model.fit returns.
    """
    targets = y.type
    fitted = model.fit(X, targets)
    return fitted

# Fit the selected model on the vectorized training split.
trained_model = TrainModel(model, X_train_vec, y_train)

## Predict and test score

In [None]:
def Predict(model, X_data):
    """Return the predictions `model` makes for X_data.

    Args:
        model: Fitted estimator exposing predict(X).
        X_data: Feature matrix to classify.
    """
    return model.predict(X_data)

def ConfMatrix(y_data, y_pred):
    """Compute, print and return the binary confusion-matrix counts.

    The matrix is built with labels=[1, 0], so row/column 0 is the positive
    (fake) class and row/column 1 the negative class.

    Args:
        y_data: True labels.
        y_pred: Predicted labels.

    Returns:
        TP, TN, FP, FN
    """
    matrix = confusion_matrix(y_data, y_pred, labels=[1, 0])

    # Rows are true labels, columns are predictions.
    (TP, FN), (FP, TN) = matrix

    print(f'Confusion matrix:\n {matrix}')
    print('\n')
    for name, count in (('TP', TP), ('TN', TN), ('FP', FP), ('FN', FN)):
        print(f'{name}: {count}')
    print('\n')

    return TP, TN, FP, FN

def Score(TP, TN, FP, FN):
    """Print accuracy, precision, recall and F1 for the given counts.

    A metric whose denominator is zero (for example precision when nothing
    was predicted positive) is reported as 0.0 instead of raising
    ZeroDivisionError or producing nan.

    Args:
        TP: Number of true positives.
        TN: Number of true negatives.
        FP: Number of false positives.
        FN: Number of false negatives.
    """
    def _ratio(numerator, denominator):
        # Guarded division: an undefined ratio counts as 0.0.
        return numerator / denominator if denominator else 0.0

    accuracy = _ratio(TP + TN, TP + TN + FP + FN)
    precision = _ratio(TP, TP + FP)
    recall = _ratio(TP, TP + FN)
    f1 = 2 * _ratio(precision * recall, precision + recall)

    print(f'Accuracy: {accuracy}')
    print(f'precision: {precision}')
    print(f'recall: {recall}')
    print(f'F1: {f1}')
    
def test(model,X_data ,y_data):
    """Predict on X_data, then print the confusion matrix and the scores.

    Args:
        model: Fitted estimator.
        X_data: Feature matrix to evaluate on.
        y_data: True labels belonging to X_data.
    """
    counts = ConfMatrix(y_data, Predict(model, X_data))
    Score(*counts)
    
# Evaluate on the FakeNewsCorpus validation split.
# The commented-out line runs the same evaluation on the LIAR data instead.
test(trained_model, X_val_vec, y_val)
#test(trained_model, X_liar_vec, y_liar)

## Cross-validate

In [None]:
def CrossValidate(model, X_path, y_path):
    """Run 5-fold cross-validation of TF-IDF + `model` and print the scores.

    The vectorizer lives inside the pipeline, so each fold fits its
    vocabulary on that fold's training part only.

    Args:
        model: Unfitted estimator used as the final pipeline step.
        X_path: Path of the feature CSV (needs a 'content' column).
        y_path: Path of the label CSV (needs a 'type' column).
    """
    df_X = pd.read_csv(X_path)
    df_y = pd.read_csv(y_path)

    vectorizer = TfidfVectorizer(stop_words='english', max_df=0.75)
    col_vectorizer = make_column_transformer((vectorizer, 'content'),
                                             remainder='drop')

    # make_pipeline takes its steps as separate positional arguments;
    # passing one list made the list itself the only "step" and failed.
    pipeline = make_pipeline(col_vectorizer, model)

    # Seeded so the fold assignment is the same on every run.
    cv = KFold(n_splits=5, shuffle=True, random_state=1)
    score = cross_val_score(pipeline, df_X, df_y.type, cv=cv)

    # Scores are fractions in [0, 1], so no percent sign.
    print(f'K Fold Accuracy: {score}')
    print(f'Mean accuracy: {score.mean()}')

#CrossValidate(model, 'X.csv', 'y.csv')