In [1]:
%load_ext autoreload
%autoreload 2

In [40]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [41]:
import os
import tempfile
import numpy as np
import pandas as pd
import sherlockml.filesystem as sfs

from bias_classifier import BiasClassifier

In [57]:
file_paths = sfs.ls('/input/bias-data/')[1:]
TMP_CSV = '/tmp/tmp.csv'
print(file_paths)

['/input/bias-data/2018-08-19.csv', '/input/bias-data/2018-08-20.csv', '/input/bias-data/2018-08-21.csv']


In [58]:
if os.path.exists(TMP_CSV):
    os.remove(TMP_CSV)
df_list = []
for path in file_paths:
    sfs.get(path, TMP_CSV)
    df_list.append(pd.read_csv(TMP_CSV, sep='\t', encoding='utf-8'))
    if os.path.exists(TMP_CSV):
        os.remove(TMP_CSV)
df = pd.concat(df_list) \
    .drop_duplicates(subset='article_url') \
    .reset_index(drop=True)

df.bias = df.bias.fillna(0)
bias = { 0: 0, 'left': 1, 'right': 2 }
df.bias = [bias[item] for item in df.bias]
df.shape

(1877, 7)

In [59]:
msk = np.random.rand(len(df)) < 0.8
train = df[msk]
test = df[~msk]
print(len(train))
print(len(test))
train.to_csv('/tmp/train.csv', sep='\t', encoding='utf-8', index=False)
test.to_csv('/tmp/test.csv', sep='\t', encoding='utf-8', index=False)

1523
354


In [60]:
clf = BiasClassifier(
    train_data='/tmp/train.csv',
    dump=True,
    debug=True
)

In [61]:
to_classify = pd.read_csv('/tmp/test.csv', sep='\t', encoding='utf-8')
classified_data = clf.classify(to_classify)
output = pd.DataFrame(classified_data)

In [62]:
output

Unnamed: 0,article_content,article_title,article_url,bias,bias_level,source_label,source_name,bias_prediction
0,Vivid Impressions In Chatroulette\n\nAugust 18...,Vivid Impressions In Chatroulette,https://hangthebankers.com/vivid-impressions-i...,2,4,fake,Hang The Bankers,0.672920
1,"What Is the Price of ProbioSlim?\n\nAugust 10,...",What Is the Price of ProbioSlim?,https://hangthebankers.com/what-is-the-price-o...,2,4,fake,Hang The Bankers,-0.651062
2,"The Very Best High-Protein Foods\n\nJuly 14, 2...",The Very Best High-Protein Foods,https://hangthebankers.com/the-very-best-high-...,2,4,fake,Hang The Bankers,0.583983
3,Did You Know that Most Trading Strategies Were...,Did You Know that Most Trading Strategies Were...,https://hangthebankers.com/did-you-know-that-m...,2,4,fake,Hang The Bankers,0.529620
4,Tweet\n \n\n\n\n Reddit\n\n\n\n\n\n\n\nBy Ashl...,Twitter Speeds Up Push Against ‘Dehumanizing S...,https://www.newsbusters.org/blogs/culture/ashl...,2,4,fake,News Busters,0.789152
5,Tweet\n \n\n\n\n Reddit\n\n\n\n\n\n\n\nBy Ashl...,Google Is Reportedly Planning a Censored Searc...,https://www.newsbusters.org/blogs/culture/ashl...,2,4,fake,News Busters,0.746337
6,Tweet\n \n\n\n\n Reddit\n\n\n\n\n\n\n\nBy Ashl...,Facebook Removes 32 Pages For ‘Coordinated Ina...,https://www.newsbusters.org/blogs/culture/ashl...,2,4,fake,News Busters,0.851903
7,Tweet\n \n\n\n\n Reddit\n\n\n\n\n\n\n\nBy NB S...,Brent Bozell Slams Twitter for ‘Shadow Banning...,https://www.newsbusters.org/blogs/nb/nb-staff/...,2,4,fake,News Busters,0.843972
8,Tweet\n \n\n\n\n Reddit\n\n\n\n\n\n\n\nBy Ashl...,Iranian Teenager Arrested for Dancing in Insta...,https://www.newsbusters.org/blogs/culture/ashl...,2,4,fake,News Busters,0.875300
9,Tweet\n \n\n\n\n Reddit\n\n\n\n\n\n\n\nBy Ashl...,Seth Rogen Claims Twitter Verifies White Supre...,https://www.newsbusters.org/blogs/culture/ashl...,2,4,fake,News Busters,0.867477


In [72]:
correct_left = 0
incorrect_left = 0
count_left = 0
correct_right = 0
incorrect_right = 0
count_right = 0

for index, row in output[output.bias == 1].iterrows():
    count_left += 1
    if row.bias_prediction >= 0:
        incorrect_left += 1
    elif row.bias_prediction <= 0:
        correct_left += 1

for index, row in output[output.bias == 2].iterrows():
    count_right += 1
    if row.bias_prediction >= 0:
        incorrect_right += 1
    elif row.bias_prediction <= 0:
        correct_right += 1
        
# for index, row in output[output.bias == 0].iterrows():
#     print(row.bias_prediction)
            
precision_left = correct_left / (correct_left + incorrect_right)
precision_right = correct_right / (correct_right + incorrect_left)

recall_left = correct_left / (correct_left + incorrect_left)
recall_right = correct_right / (correct_right + incorrect_right)

print('Left')
print('Precision: ' + str(precision_left))
print('Recall: ' + str(recall_left))
print('Right')
print('Precision: ' + str(precision_right))
print('Recall: ' + str(recall_right))

Left
Precision: 0.40594059405940597
Recall: 0.7256637168141593
Right
Precision: 0.11428571428571428
Recall: 0.03225806451612903
