In [198]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.compose import make_column_transformer

## Label types:
https://github.com/several27/FakeNewsCorpus/blob/v1.0/README.md \
https://www.rand.org/research/projects/truth-decay/fighting-disinformation/search/items/opensources.html
- Fake:                    
    - Sources that entirely fabricate information, disseminate deceptive content, or grossly distort actual news reports
- Satire:   
    - Sources that use humor, irony, exaggeration, ridicule, and false information to comment on current events.
- Bias:       
    - Sources that come from a particular point of view and may rely on propaganda, decontextualized information, and opinions distorted as facts.
- Conspiracy:  
    - Sources that are well-known promoters of kooky conspiracy theories.
- Rumor:
    - Sources that traffic in rumors, gossip, innuendo, and unverified claims.
- State News:
    - Sources in repressive states operating under government sanction.
- Junksci:	    
    - Sources that promote pseudoscience, metaphysics, naturalistic fallacies, and other scientifically dubious claims.
- Hate:        
    - Sources that actively promote racism, misogyny, homophobia, and other forms of discrimination.
- Clickbait:   
    - Sources that provide generally credible content, but use exaggerated, misleading, or questionable headlines, social media descriptions, and/or images.
- Unreliable:  
    - Sources that may be reliable but whose contents require further verification.
- Political:   
    - Sources that provide generally verifiable information in support of certain points of view or political orientations.
- Reliable:    
    - Sources that circulate news and information in a manner consistent with traditional and ethical practices in journalism (Remember: even credible sources sometimes rely on clickbait-style headlines or occasionally make mistakes. No news organization is perfect, which is why a healthy news diet consists of multiple sources of information).

In [199]:
# Binary target derived from the corpus 'type' column:
#   '1' -> the type belongs to the "fake" group
#   '0' -> the type belongs to the "not fake" group
# The keys double as the list of recognised types.
type_dict = {
    "fake": "1",
    "satire": "1",
    "bias": "1",
    "conspiracy": "1",
    "rumor": "1",
    "state": "0",
    "junksci": "1",
    "hate": "1",
    "clickbait": "0",
    "unreliable": "0",
    "political": "0",
    "reliable": "0",
}

In [200]:
def CountAndPrintTypes(csv_path='..\\data\\995,000_rows.csv'):
    """Print and return how many rows each article type has.

    Parameters
    ----------
    csv_path : str, optional
        CSV file with a 'type' column. Defaults to the raw corpus file
        used throughout this notebook.

    Returns
    -------
    pandas.Series
        Row count per type, most frequent first. Rows with a missing
        type are not counted (``value_counts`` skips NaN by default).
    """
    # Only the 'type' column is needed, so skip parsing the article text.
    counts = pd.read_csv(csv_path, usecols=['type'])['type'].value_counts()
    # Inside a function a bare expression is discarded, so print explicitly.
    print(counts)
    return counts

# CountAndPrintTypes()

In [201]:
def RemoveUnknownTypes(input, trash_out, output):
    """Split a corpus CSV into rows with a recognised type and the rest.

    A row is kept when its 'type' value is one of the keys of ``type_dict``;
    every other row (including rows whose type is missing) is rejected.

    Parameters
    ----------
    input : str
        Path of the CSV file to read. It must contain a 'type' column.
    trash_out : str
        Path the rejected rows are written to.
    output : str
        Path the kept rows are written to.

    Both output files are written without the index column and always carry
    the header row, even when they contain no data rows.
    """
    df = pd.read_csv(input)

    # One vectorised membership test instead of a Python loop over every row.
    is_known = df['type'].isin(list(type_dict))

    df[~is_known].to_csv(trash_out, index=False)
    df[is_known].to_csv(output, index=False)
    
# Filter the raw corpus: rejected rows go to trash.csv, the rest to the "-trash" file.
RemoveUnknownTypes(
    input='..\\data\\995,000_rows.csv',
    trash_out='..\\data\\trash.csv',
    output='..\\data\\995,000_rows-trash.csv',
)

  df = pd.read_csv(input)


In [206]:
def XyData(csv_path, X_path, y_path):
    """Build the feature and label CSV files from a filtered corpus file.

    Parameters
    ----------
    csv_path : str
        CSV file to read; only the 'type', 'url' and 'content' columns are loaded.
    X_path : str
        Destination of the feature table (every loaded column except 'type').
    y_path : str
        Destination of the label table: a single 'type' column in which each
        value found in ``type_dict`` is replaced by its mapped label.

    Rows without 'content' are dropped before anything is written. Missing
    'url' values are replaced by the empty string.

    Note: ``to_csv`` writes NaN and '' alike as an empty field, and
    ``read_csv`` parses empty fields as NaN by default, so a reader that
    needs empty strings must pass ``keep_default_na=False`` or fill again.
    """
    usecols = ['type', 'url', 'content']
    df = pd.read_csv(csv_path, usecols=usecols)

    # Articles without body text carry nothing to learn from.
    df = df.dropna(subset=['content']).reset_index(drop=True)

    y = pd.DataFrame(df['type'].replace(type_dict))

    X = df.drop(columns=['type'])
    # fillna returns a new Series; the result has to be assigned back to take effect.
    X['url'] = X['url'].fillna('')

    X.to_csv(X_path, index=False)
    y.to_csv(y_path, index=False)

# Derive the feature (X) and label (y) files from the filtered corpus.
XyData(
    csv_path='..\\data\\995,000_rows-trash.csv',
    X_path='..\\data\\X_995k.csv',
    y_path='..\\data\\y_995k.csv',
)

In [207]:
def Split(X_path, y_path):
    """Load features and labels from CSV and cut them into three parts.

    The data is first divided 80 % / 20 %; the 20 % part is then halved, so
    the result is 80 % train, 10 % test and 10 % validation. Both cuts
    shuffle with a fixed ``random_state`` (33, then 9).

    Parameters
    ----------
    X_path : str
        CSV file holding the feature table.
    y_path : str
        CSV file holding the label table, row-aligned with ``X_path``.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_val, y_train, y_test, y_val)``.
    """
    features = pd.read_csv(X_path)
    labels = pd.read_csv(y_path)

    # First cut: training part versus everything held out.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, labels, train_size=0.8, random_state=33, shuffle=True
    )

    # Second cut: the held-out rows are shared equally by test and validation.
    X_test, X_val, y_test, y_val = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=9, shuffle=True
    )

    return X_train, X_test, X_val, y_train, y_test, y_val

# Load the prepared feature/label files and cut them into train, test and validation parts.
X_train, X_test, X_val, y_train, y_test, y_val = Split(
    X_path='..\\data\\X_995k.csv',
    y_path='..\\data\\y_995k.csv',
)

In [208]:
def vevectorizer(train, test, val):
    """Turn the 'content' and 'url' text columns into TF-IDF features.

    Each column gets its own ``TfidfVectorizer`` (English stop words removed,
    ``max_df=0.75``); the two are combined with ``make_column_transformer``.
    The transformer is fitted on ``train`` only and then applied unchanged to
    ``test`` and ``val``, so the held-out parts do not influence the fit.

    Parameters
    ----------
    train, test, val
        Tables with 'content' and 'url' columns.

    Returns
    -------
    tuple
        ``(X_train_vec, X_test_vec, X_val_vec)``, the transformed inputs in
        the order they were passed.
    """
    tfidf_options = {'stop_words': 'english', 'max_df': 0.75}

    # A separate vectorizer instance per column keeps the two fits explicitly independent.
    col_vectorizer = make_column_transformer(
        (TfidfVectorizer(**tfidf_options), 'content'),
        (TfidfVectorizer(**tfidf_options), 'url'),
    )

    X_train_vec = col_vectorizer.fit_transform(train)
    X_test_vec = col_vectorizer.transform(test)
    X_val_vec = col_vectorizer.transform(val)

    return X_train_vec, X_test_vec, X_val_vec

# Vectorise all three splits; the transformer is fitted on the training split only.
X_train_vec, X_test_vec, X_val_vec = vevectorizer(
    train=X_train,
    test=X_test,
    val=X_val,
)

In [209]:
# Fit a linear SVM on the TF-IDF features. Targets are passed as the 1-D
# 'type' column both when fitting and when scoring, so the two agree in shape.
model = LinearSVC(max_iter=1000, dual='auto')
model.fit(X_train_vec, y_train['type'])

# NOTE(review): the recorded accuracy is ~0.9999. 'url' is one of the features
# and may encode the source the label was assigned from. TODO confirm this is
# not label leakage before trusting the number.
y_pred = model.predict(X_test_vec)
score = accuracy_score(y_test['type'], y_pred)
print(f'Accuracy: {score}')

Accuracy: 0.999933604815864


In [210]:
# Score the fitted model on the validation split, which was not used for fitting.
model.score(X_val_vec, y_val)

0.999900407223796

In [211]:
# confusion_matrix(y_test,y_pred, labels=[1,0])

In [212]:
#=======================================================
#K-fold
#=======================================================

# df_X = pd.read_csv('..\\data\\X_995k.csv')
# df_y = pd.read_csv('..\\data\\y_995k.csv')

# X = vectorizer.transform(df_X.content)
# y = df_y.type.values

# scores = cross_val_score(model, X, y, cv=5)
# print(f'K Fold Accuracy: {scores.mean():.2%}')

In [213]:
# import pickle
# filename = 'finalized_model.sav'
# pickle.dump(model, open(filename, 'wb'))
# # some time later...
# loaded_model = pickle.load(open(filename, 'rb'))
# loaded_model.score(X_test_vec, y_test)


# import joblib
# filename = 'finalized_model.sav'
# joblib.dump(model, filename)
# # some time later...
# loaded_model = joblib.load(filename)
# loaded_model.score(X_test_vec, y_test)

In [214]:
# model = PassiveAggressiveClassifier(max_iter=100)
# model.fit(X_train_vec, y_train)

# #model.score(X_test_vec, y_test)

# y_pred = model.predict(X_test_vec)
# score = accuracy_score(y_test,y_pred)
# print(f'Accuracy: {score}')