In [4]:
# autoreload imports
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Misinformation detection

In [5]:
# Make the project root importable so the `src.*` modules used below can be
# imported from this notebook.
import os
import sys

# The root is taken to be the parent of the kernel's working directory,
# i.e. the notebook is expected to run from a direct child of the project root.
root = os.path.dirname(os.getcwd())
# Only add the root once so re-running this cell does not stack duplicates.
if root not in sys.path:
    sys.path.append(root)

In [6]:
# built-in
import json
import logging
import warnings
from datetime import datetime

# installed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# custom
from src.utils.jupyter_setup import setup_jupyter
from src.utils.rand_utils import RandUtils
from src.utils.config_loader import ConfigLoader
from src.utils.file_utils import FileUtils
from src.data.path_manager import PathManager
from src.data.data_loader import DataLoader
from src.data.dataset_wrapper import DatasetWrapper
from src.data.data_splitter import DataSplitter
from src.evaluation.metrics import Metrics
from src.features.bow_vectorizer import BowVectorizer

# models
from src.models.mnb import MNB
from src.models.bnb import BNB
from src.models.svm import SVM
from src.models.lr import LR
from src.models.gb import GB
from src.models.cnn import CNN
from src.models.rnn import RNN
from src.models.lstm import LSTM

# trainers
from src.models.lr_torch import LRTorch
from src.trainers.lr_trainer import LRTrainer
from src.trainers.cnn_trainer import CNNTrainer
from src.trainers.rnn_trainer import RNNTrainer
from src.trainers.lstm_trainer import LSTMTrainer

In [7]:
# Bootstrap the notebook through the project helper and keep what it returns
# as the project config; WARNING is passed as the requested logging level.
log_level = logging.WARNING
cfg = setup_jupyter(root, logging_level=log_level)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/PaxtonEdgar/Documents/InfEco/COVIDmisinfoBursts/Original_misinfo/config.json'

## Load data

In [None]:
# Resolve the location of the cleaned fact-check CSV through the path manager,
# read it into a DataFrame and report how many rows were loaded.
path_mgr = PathManager(cfg, root)
fc_cleaned_path = path_mgr.processed_file_path('fc_cleaned.csv')
data_df = pd.read_csv(fc_cleaned_path)
print('Documents retrieved:', len(data_df))

Set random seed

In [None]:
# Fix the seed before any sampling happens further down.
# NOTE(review): which generators RandUtils seeds (numpy, torch, ...) is not
# visible from this notebook -- confirm in src/utils/rand_utils.py.
seed = 0
RandUtils.set_random_seed(seed)

Prepare data

In [None]:
# Build the text (X) and label (y) arrays from the rows whose `fact_new`
# value is one of the two supported classes. A label is the position of the
# class name in `valid_labels`: 0 -- 'true', 1 -- 'false/misleading'.
valid_labels = ['true', 'false/misleading']
label_filter = data_df['fact_new'].isin(valid_labels)
X_all = data_df.loc[label_filter, 'subject'].to_numpy()
y_all = np.array([valid_labels.index(label) for label in data_df.loc[label_filter, 'fact_new']])

# Balance the classes by randomly downsampling whichever class is larger.
# Sampling from the larger class (rather than unconditionally from class 1)
# keeps np.random.choice(..., replace=False) from raising ValueError when
# class 1 happens to be the minority.
true_idx = np.flatnonzero(y_all == 0)
misinfo_idx = np.flatnonzero(y_all == 1)
if len(misinfo_idx) >= len(true_idx):
    # usual case: keep every 'true' document, sample as many misinformation ones
    picked = np.random.choice(len(misinfo_idx), len(true_idx), replace=False)
    misinfo_idx = misinfo_idx[picked]
else:
    # reverse case: keep every misinformation document, sample the 'true' ones
    picked = np.random.choice(len(true_idx), len(misinfo_idx), replace=False)
    # sorted so the retained 'true' documents stay in their original order
    true_idx = true_idx[np.sort(picked)]

# class 0 documents first, followed by the class 1 documents
X = np.concatenate((X_all[true_idx], X_all[misinfo_idx]))
y = np.concatenate((y_all[true_idx], y_all[misinfo_idx]))
assert len(X) == len(y), 'texts and labels must stay aligned'
print(X.shape, y.shape)

Generate splits

In [None]:
# Ask the project splitter (5 splits, fixed random_state) for split ids and
# key each returned split by its position in the sequence.
ds = DataSplitter(n_splits=5, random_state=0)
splits = dict(enumerate(ds.get_split_ids(X, y)))

## Baseline models

In [None]:
# Baseline model configuration files, grouped by model family.
# `models` lists them in the order GB, LR, NB, SVM.
gb_configs = [
    'models/config/gb/gb_def_count_2grams.json',
]
lr_configs = [
    'models/config/lr/lr_l1_tfidf_2grams.json',
    'models/config/lr/lr_l2_tfidf_2grams.json',
]
nb_configs = [
    'models/config/nb/nb_def_binary_2grams.json',
]
svm_configs = [
    'models/config/svm/svm_def_binary_2grams.json',
    'models/config/svm/svm_def_count_2grams.json',
    'models/config/svm/svm_def_tfidf_2grams.json',
]
models = gb_configs + lr_configs + nb_configs + svm_configs

Test baseline models

In [None]:
# Averaged test metrics per model name; the deep-learning section further
# down adds its own entries to this same dict.
all_results = {}

with warnings.catch_warnings():
    # all warnings are suppressed while the baselines run
    warnings.simplefilter("ignore")

    for cfg_file in models:
        model_cfg = ConfigLoader.load_config(cfg_file)
        print('Processing', model_cfg['name'])

        # NOTE(review): class names are read from the JSON config and resolved
        # with eval(), which executes whatever string the file contains --
        # only run this with trusted config files.
        vectorizer_factory = eval(model_cfg['vectorizer_class'])
        vectorizer = vectorizer_factory(
            tokenizer_cfg=model_cfg['tokenizer'],
            vectorizer_cfg=model_cfg['vectorizer'],
            tfidf=model_cfg['tfidf'],
        )

        results = {}
        for split_key in sorted(splits):
            split = splits[split_key]
            # baselines train on train + dev and are scored on test
            train_ids = [*split['train'], *split['dev']]
            test_ids = list(split['test'])
            raw_train, raw_test = X[train_ids], X[test_ids]
            y_train = np.array(y[train_ids])
            y_test = np.array(y[test_ids])
            # turn the raw texts into feature matrices
            X_train, X_test = vectorizer.vectorize(raw_train, raw_test)
            # a new model instance is built for every split
            model_factory = eval(model_cfg['model_class'])
            model = model_factory(model_cfg['model'])
            results[split_key] = model.train_test(X_train, X_test, y_train, y_test)

        # keep only the test_* entries of the across-split average
        split_avg = Metrics.average_results(results)['split_avg']
        all_results[model_cfg['name']] = {
            metric: value
            for metric, value in split_avg.items()
            if metric.startswith('test_')
        }

## Deep learning models

In [None]:
# Deep-learning model configuration files, grouped by architecture.
# `models` lists them in the order CNN, RNN, LSTM.
cnn_configs = [
    'models/config/cnn/cnn_100d_234x100.json',
]
rnn_configs = [
    'models/config/rnn/rnn_100d_1x32x1.json',
    'models/config/rnn/rnn_100d_1x32x2.json',
]
lstm_configs = [
    'models/config/lstm/lstm_100d_1x32x1.json',
    'models/config/lstm/lstm_100d_1x32x2.json',
]
models = cnn_configs + rnn_configs + lstm_configs

In [None]:
def correct_paths(model_cfg, root):
    """Prefix the path entries of a model config with ``root``.

    The config dict is modified in place and also returned.

    Always rewritten:
        ``model_cfg['save_directory']``

    Rewritten only when ``model_cfg['network']['pretrained_embeddings']`` is
    not ``None``:
        ``model_cfg['network']['pretrained_embeddings']``
        ``model_cfg['dataloader_params']['vector_cache']``
        ``model_cfg['dataloader_params']['embeddings_path']`` (set to the
        joined embeddings path)

    Paths are combined with ``os.path.join``.
    """
    embeddings = model_cfg['network']['pretrained_embeddings']
    if embeddings is not None:
        loader_params = model_cfg['dataloader_params']
        # resolve both paths first, then write them back
        joined_cache = os.path.join(root, loader_params['vector_cache'])
        joined_embeddings = os.path.join(root, embeddings)
        model_cfg['network']['pretrained_embeddings'] = joined_embeddings
        loader_params['embeddings_path'] = joined_embeddings
        loader_params['vector_cache'] = joined_cache

    model_cfg['save_directory'] = os.path.join(root, model_cfg['save_directory'])
    return model_cfg

In [None]:
# Train and evaluate every deep-learning config on every split.
# Outputs of this cell:
#   best_models[model name][split key] -> value of trainer.best_model_path()
#   all_results[model name]            -> across-split average, test_* entries only
# `all_results` is not created here; it must already exist (the baseline
# cell creates it).
best_models = {}

with warnings.catch_warnings():
    # all warnings are suppressed while the models train
    warnings.simplefilter("ignore")

    for cfg_file in models:

        model_cfg = ConfigLoader.load_config(cfg_file)
        print('Processing', model_cfg['name'])
        
        if model_cfg['name'] not in best_models:
            best_models[model_cfg['name']] = {}
        
        # correct paths
        model_cfg = correct_paths(model_cfg, root)
        
        # create dated directory for saving model
        save_directory = FileUtils.mkdir_timed(model_cfg['save_directory'], datetime.now(), model_cfg['name'])

        results = {}
        for split_key, split in sorted(splits.items()):
            # when not resuming, point the config at a per-split subdirectory
            # of the dated directory; when resuming, save_directory is left
            # as it is in the config
            if model_cfg['resume_from'] is None:
                curr_dir = os.path.join(save_directory, f'{split_key}')
                model_cfg['save_directory'] = curr_dir
            # split data
            train_loader, dev_loader, test_loader, vocab = DatasetWrapper.iters(
                X, y, split, **model_cfg['dataloader_params']
            )
            # train & test model
            # NOTE(review): class names come from the JSON config and are
            # resolved with eval() -- only use trusted config files.
            model_cls = eval(model_cfg['model_class'])
            trainer_cls = eval(model_cfg['trainer_class'])
            model = model_cls(model_cfg, vocab)
            model.summary()
            trainer = trainer_cls.initialize(
                model, model_cfg, model_cfg['resume_from']
            )
            trainer.train(train_loader, dev_loader)

            # evaluate
            best_model_path = trainer.best_model_path()
            best_models[model_cfg['name']][split_key] = best_model_path
            # a second model instance is created and handed to initialize()
            # together with the best-model path -- presumably this restores
            # the best checkpoint; confirm in the trainer implementation
            model = model_cls(model_cfg, vocab)
            evaluator = trainer_cls.initialize(model, model_cfg, best_model_path)
            # element [0] of evaluate()'s return value is a dict of metrics;
            # its keys are prefixed with the loader they were computed on
            split_results = {'train_{}'.format(k): v for k, v in evaluator.evaluate(train_loader)[0].items()}
            split_results.update({'val_{}'.format(k): v for k, v in evaluator.evaluate(dev_loader)[0].items()})
            split_results.update({'test_{}'.format(k): v for k, v in evaluator.evaluate(test_loader)[0].items()})
            split_results['epoch'] = evaluator.checkpoint_epoch
            results[split_key] = split_results

        # keep only the test_* entries of the across-split average
        all_results[model_cfg['name']] = {
            k: v for k, v in Metrics.average_results(results)['split_avg'].items()
            if k.startswith('test_')
        }

In [None]:
# One row per model name; the columns are the keys of the per-model dicts.
results_table = pd.DataFrame.from_dict(all_results, orient='index')
results_table

In [None]:
# uncomment for report
# Prints one LaTeX table row per model: precision & recall & f-score & accuracy.
# NOTE(review): test_accuracy is divided by 100 while the other metrics are
# not -- this presumes the averaged accuracy is a percentage; confirm against
# Metrics.average_results before using the numbers.
# for m_name, m_res in all_results.items():
#     print(f'{m_name} & {m_res["test_precision"]:.4f} & {m_res["test_recall"]:.4f} & {m_res["test_f_score"]:.4f} & {m_res["test_accuracy"]/100:.4f} \\\\')

## Test best model

In [None]:
# Select the checkpoint recorded in `best_models` for one split of the CNN
# model, then load that model's configuration file.
split_id = 3
model_name = 'cnn_100d_234x100'
model_cfg_pth = 'models/config/cnn/cnn_100d_234x100.json'

model_best = best_models[model_name][split_id]
print(f'Best model path:\n{model_best}')

model_cfg = ConfigLoader.load_config(model_cfg_pth)

In [None]:
# Hand-written examples for a quick check of the selected model,
# given as (label, text) pairs.
# 0 -- true, 1 -- false/misleading
test_data = [
    (1, 'Flu vaccine are the main reason for COVID-19.'), 
    (1, 'West loses race to develop COVID-19 vaccine. The Russian vaccine against COVID-19 is ready.'),
    (0, 'CDC says to wear two masks to help flatten the case curve.')
]

# unzip the pairs into parallel label / text arrays
labels, texts = zip(*test_data)
y_test = np.array(labels)
X_test = np.array(texts)

# every example must contribute exactly one label and one text
assert len(X_test) == len(y_test), 'labels and texts are misaligned'

print(X_test.shape, y_test.shape)

In [18]:
# Run the selected checkpoint on the hand-written examples and print the
# metrics followed by one (prediction, label, text) entry per example.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    print('Processing', model_cfg['name'])

    # turn off saving
    model_cfg['save_best'] = False

    # correct paths
    model_cfg = correct_paths(model_cfg, root)

    # create dated directory for saving model
    # NOTE(review): the returned path is not used below even though saving is
    # switched off; the call is kept because it is not visible from here
    # whether the trainer relies on the directory existing.
    save_directory = FileUtils.mkdir_timed(model_cfg['save_directory'], datetime.now(), model_cfg['name'])
    
    # use original vocab
    _, _, _, vocab = DatasetWrapper.iters(X, y, splits[split_id], **model_cfg['dataloader_params'])

    # split data: the three examples are used for every partition, only the
    # test loader is consumed below
    split = {'train': [0, 1, 2], 'dev': [0, 1, 2], 'test': [0, 1, 2]}
    train_loader, dev_loader, test_loader, _ = DatasetWrapper.iters(
        X_test, y_test, split, **model_cfg['dataloader_params']
    )

    # load model
    # NOTE(review): class names come from the JSON config and are resolved
    # with eval() -- only use trusted config files.
    model_cls = eval(model_cfg['model_class'])
    trainer_cls = eval(model_cfg['trainer_class']) 
    print(f'Using {model_cls.__name__}, {trainer_cls.__name__}\n')
    model = model_cls(model_cfg, vocab)
    evaluator = trainer_cls.initialize(model, model_cfg, model_best)
    
    # pass data through model and print results
    print('Results:')
    print('=' * len('results:'))
    results = evaluator.evaluate(test_loader)
    print(json.dumps(results[0], indent=4))
    print('Predictions:')
    print('=' * len('predictions:'))
    # The predictions were produced from test_loader, so the texts and labels
    # must be read from test_loader as well; pairing them with train_loader
    # batches can attach a prediction to the wrong text whenever the two
    # loaders order their examples differently.
    # NOTE(review): assumes results[1] follows test_loader's iteration order
    # and that test_loader yields the same order on every pass -- confirm.
    # A single iterator keeps its position across batches; it is passed last
    # to zip() so no prediction is consumed once a batch runs out of texts.
    remaining_predictions = iter(results[1])
    for batch in test_loader:
        for text, true, predicted in zip(batch.raw, batch.label, remaining_predictions):
            print(f'Predicted label: {predicted}')
            print(f'True label: {true}')
            print(f'Text: {text}\n')

Processing cnn_100d_234x100
Using CNN, CNNTrainer

Results:
{
    "n_samples": 3,
    "pos_samples": 2,
    "neg_samples": 1,
    "correct": 3,
    "accuracy": 1.0,
    "precision": 1.0,
    "recall": 1.0,
    "f_score": 1.0,
    "tn": 1,
    "tp": 2,
    "fn": 0,
    "fp": 0,
    "loss": 0.40968477725982666
}
Predictions:
Predicted label: 1
True label: 1
Text: West loses race to develop COVID-19 vaccine. The Russian vaccine against COVID-19 is ready.

Predicted label: 0
True label: 0
Text: CDC says to wear two masks to help flatten the case curve.

Predicted label: 1
True label: 1
Text: Flu vaccine are the main reason for COVID-19.

