In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from datetime import datetime
%matplotlib inline
import matplotlib
from datetime import datetime
import os
from scipy import stats

from definitions import HUMAN_DATA_DIR, ROOT_DIR
from data.load_from_csv import get_content_datasets

In [2]:
def ClairvoyantCF(test_dataset, train_dataset, answers_dict):
    """Score the clairvoyant baseline that predicts straight from the answers.

    Every test rating is compared against a prediction of 1.0 when the
    item's entry in ``answers_dict`` is truthy and 0.0 otherwise; the squared
    errors are then averaged and the result is printed.

    Args:
        test_dataset: object exposing ``ratings`` (a sequence whose elements
            hold the rating value at index 2) and ``item_ids`` (the item id
            of each rating, aligned by position with ``ratings``).
        train_dataset: not used by this baseline.
        answers_dict: mapping ``{item_id: True/False}``.

    Returns:
        The mean squared error over the ratings that could be scored, or
        ``nan`` when no rating could be scored (empty test set, or no item
        has a known answer).
    """
    squared_error_sum = 0
    scored = 0
    for i, rating in enumerate(test_dataset.ratings):
        item_id = test_dataset.item_ids[i]
        try:
            answer = answers_dict[item_id]
        except KeyError:
            # Best effort: report the rating whose item has no known answer
            # and carry on with the rest.
            print(i, item_id)
            continue
        prediction = 1.0 if answer else 0.0
        squared_error_sum += (rating[2] - prediction) ** 2
        scored += 1

    # Average over the ratings actually scored, so that skipped ratings do
    # not count as zero-error predictions and deflate the mean.
    mean_mse = squared_error_sum / scored if scored else float("nan")
    print("Using Clairvoyant CF, got total val score {:.3f}".format(mean_mse))
    return mean_mse

def ClairvoyantAdjustedCF(test_dataset, train_dataset, answers_dict):
    """Score the clairvoyant baseline calibrated on the training ratings.

    The non-NaN training ratings are averaged separately for items whose
    entry in ``answers_dict`` is truthy and for items whose entry is falsy.
    Every test rating is then predicted as the average of its item's class,
    and the squared errors are averaged and printed.

    Ratings whose item id is absent from ``answers_dict`` are reported with
    ``print(index, item_id)`` and skipped, mirroring ``ClairvoyantCF``.

    Args:
        test_dataset: object exposing ``ratings`` (a sequence whose elements
            hold the rating value at index 2) and ``item_ids`` (the item id
            of each rating, aligned by position with ``ratings``).
        train_dataset: object with the same two attributes; its NaN ratings
            are ignored.
        answers_dict: mapping ``{item_id: True/False}``.

    Returns:
        The mean squared error over the test ratings that could be scored,
        or ``nan`` when none could be scored.

    Raises:
        ValueError: if a test item belongs to an answer class for which the
            training data holds no observed rating, so no class average
            exists to predict with.
    """
    rating_sum = {True: 0, False: 0}
    rating_count = {True: 0, False: 0}
    for i, rating in enumerate(train_dataset.ratings):
        if np.isnan(rating[2]):
            continue
        item_id = train_dataset.item_ids[i]
        if item_id not in answers_dict:
            print(i, item_id)
            continue
        label = bool(answers_dict[item_id])
        rating_sum[label] += rating[2]
        rating_count[label] += 1

    # Only classes that were actually observed get an average; a class that
    # is absent from training is an error only if the test set needs it.
    class_mean = {
        label: rating_sum[label] / rating_count[label]
        for label in (True, False)
        if rating_count[label]
    }

    squared_error_sum = 0
    scored = 0
    for i, rating in enumerate(test_dataset.ratings):
        item_id = test_dataset.item_ids[i]
        if item_id not in answers_dict:
            print(i, item_id)
            continue
        label = bool(answers_dict[item_id])
        if label not in class_mean:
            raise ValueError(
                "no observed training ratings for items whose answer is "
                "{}; cannot predict test item {!r}".format(label, item_id)
            )
        squared_error_sum += (rating[2] - class_mean[label]) ** 2
        scored += 1

    mean_mse = squared_error_sum / scored if scored else float("nan")
    print("Using Clairvoyant Adjusted CF, got total val score {:.3f}".format(mean_mse))
    return mean_mse

In [3]:
def _load_answers(task):
    """Read ``<HUMAN_DATA_DIR>/<task>/answers.csv`` into an ``{item_id: answer}`` dict.

    The ``'Unnamed: 0'`` column is dropped (presumably a saved pandas index;
    verify against the CSV), the frame is indexed by ``item_id``, and the
    ``'answer'`` values are returned keyed by item id.
    """
    answers_path = os.path.join(HUMAN_DATA_DIR, task, 'answers.csv')
    answers_frame = pd.read_csv(answers_path).drop('Unnamed: 0', axis=1).set_index('item_id')
    # Transposing puts each CSV column on its own row, so the 'answer' entry
    # of the row-oriented dict is the item_id -> answer mapping.
    by_column = answers_frame.T.to_dict('index')
    return by_column['answer']


fermi_answers = _load_answers('fermi')
politifact_answers = _load_answers('politifact')

In [4]:
## Fermi: run both clairvoyant baselines for each training sparsity level.
# Each load re-binds ``unmasked_val_fermi`` to the validation split returned
# for that sparsity; the third returned value is discarded.
print('Fermi\nUnmasked:')
unmasked_fermi, unmasked_val_fermi, _ = get_content_datasets(
    task='fermi', sparsity='unmasked'
)
ClairvoyantCF(test_dataset=unmasked_val_fermi, train_dataset=unmasked_fermi,
              answers_dict=fermi_answers)
ClairvoyantAdjustedCF(test_dataset=unmasked_val_fermi, train_dataset=unmasked_fermi,
                      answers_dict=fermi_answers)

print('\nLight Masking:')
light_fermi, unmasked_val_fermi, _ = get_content_datasets(
    task='fermi', sparsity='light'
)
ClairvoyantCF(test_dataset=unmasked_val_fermi, train_dataset=light_fermi,
              answers_dict=fermi_answers)
ClairvoyantAdjustedCF(test_dataset=unmasked_val_fermi, train_dataset=light_fermi,
                      answers_dict=fermi_answers)

print('\nHeavy Masking:')
heavy_fermi, unmasked_val_fermi, _ = get_content_datasets(
    task='fermi', sparsity='heavy'
)
ClairvoyantCF(test_dataset=unmasked_val_fermi, train_dataset=heavy_fermi,
              answers_dict=fermi_answers)
ClairvoyantAdjustedCF(test_dataset=unmasked_val_fermi, train_dataset=heavy_fermi,
                      answers_dict=fermi_answers)

Fermi
Unmasked:
Using Clairvoyant CF, got total val score 0.216
Using Clairvoyant Adjusted CF, got total val score 0.111

Light Masking:
Using Clairvoyant CF, got total val score 0.216
Using Clairvoyant Adjusted CF, got total val score 0.111

Heavy Masking:
Using Clairvoyant CF, got total val score 0.216
Using Clairvoyant Adjusted CF, got total val score 0.111


In [5]:
## Politifact: run both clairvoyant baselines for each training sparsity level.
# Each load re-binds ``unmasked_val_politifact`` to the validation split
# returned for that sparsity; the third returned value is discarded.
print('Politifact\nUnmasked:')
unmasked_politifact, unmasked_val_politifact, _ = get_content_datasets(
    task='politifact', sparsity='unmasked'
)
ClairvoyantCF(test_dataset=unmasked_val_politifact, train_dataset=unmasked_politifact,
              answers_dict=politifact_answers)
ClairvoyantAdjustedCF(test_dataset=unmasked_val_politifact, train_dataset=unmasked_politifact,
                      answers_dict=politifact_answers)

# The section headers name the sparsity level (as in the Fermi cell) so the
# light and heavy results can be told apart in the output.
print('\nLight Masking:')
light_politifact, unmasked_val_politifact, _ = get_content_datasets(
    task='politifact', sparsity='light'
)
ClairvoyantCF(test_dataset=unmasked_val_politifact, train_dataset=light_politifact,
              answers_dict=politifact_answers)
ClairvoyantAdjustedCF(test_dataset=unmasked_val_politifact, train_dataset=light_politifact,
                      answers_dict=politifact_answers)

print('\nHeavy Masking:')
heavy_politifact, unmasked_val_politifact, _ = get_content_datasets(
    task='politifact', sparsity='heavy'
)
ClairvoyantCF(test_dataset=unmasked_val_politifact, train_dataset=heavy_politifact,
              answers_dict=politifact_answers)
ClairvoyantAdjustedCF(test_dataset=unmasked_val_politifact, train_dataset=heavy_politifact,
                      answers_dict=politifact_answers)

Politifact
Unmasked:
Loading w2v dict
Loaded Word2Vec dict: 27.13s
Number of words in corpus: 400001
Using Clairvoyant CF, got total val score 0.242
Using Clairvoyant Adjusted CF, got total val score 0.112

Politifact Masking:
Loading w2v dict
Loaded Word2Vec dict: 23.40s
Number of words in corpus: 400001
Using Clairvoyant CF, got total val score 0.242
Using Clairvoyant Adjusted CF, got total val score 0.112

Politifact Masking:
Loading w2v dict
Loaded Word2Vec dict: 18.82s
Number of words in corpus: 400001
Using Clairvoyant CF, got total val score 0.242
Using Clairvoyant Adjusted CF, got total val score 0.112
