In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to project .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
# Resolve the parent of the current working directory and put it on the import
# path (once), so modules that live one level above the notebook can be imported.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [None]:
import os
import tempfile
import numpy as np
import pandas as pd
from faculty import datasets

from bias_classifier import BiasClassifier

In [None]:
# Local scratch file that each remote dataset file is downloaded to before parsing.
TMP_CSV = '/tmp/tmp.csv'

# List the remote dataset directory and drop the first entry of the listing
# (presumably the directory itself rather than a data file — TODO confirm).
all_entries = datasets.ls('/input/bias-data/')
file_paths = all_entries[1:]
print(file_paths)

In [None]:
def _discard_tmp_csv():
    """Delete the scratch CSV at TMP_CSV if it is present on disk."""
    if os.path.exists(TMP_CSV):
        os.remove(TMP_CSV)


# Start clean in case an earlier run left the scratch file behind.
_discard_tmp_csv()

# Download each remote file to the scratch path, parse it as tab-separated
# UTF-8 text, then delete the scratch copy before fetching the next file.
df_list = []
for path in file_paths:
    datasets.get(path, TMP_CSV)
    frame = pd.read_csv(TMP_CSV, sep='\t', encoding='utf-8')
    df_list.append(frame)
    _discard_tmp_csv()

# Stack every frame, keep one row per article_url, and renumber the index.
df = (
    pd.concat(df_list)
    .drop_duplicates(subset='article_url')
    .reset_index(drop=True)
)

# Integer codes for the bias label: missing labels are filled with 0 and map to 0,
# 'left' maps to 1 and 'right' maps to 2. Any other label raises KeyError.
bias = dict([(0, 0), ('left', 1), ('right', 2)])
labels_filled = df['bias'].fillna(0)
df['bias'] = [bias[label] for label in labels_filled]
df.shape

In [None]:
# Fixed seed so the train/test partition (and every metric computed from it)
# is identical after a kernel restart; change SPLIT_SEED to draw another split.
SPLIT_SEED = 42
# Probability with which each row is assigned to the training set.
TRAIN_FRACTION = 0.8

# Each row is assigned independently, so the realised split is only
# approximately TRAIN_FRACTION / (1 - TRAIN_FRACTION). A local RandomState is
# used so the global NumPy random state is left untouched.
split_rng = np.random.RandomState(SPLIT_SEED)
msk = split_rng.rand(len(df)) < TRAIN_FRACTION
train = df[msk]
test = df[~msk]
print(len(train))
print(len(test))

# Persist both partitions as tab-separated UTF-8 files without the index column;
# later cells read them back from these paths.
train.to_csv('/tmp/train.csv', sep='\t', encoding='utf-8', index=False)
test.to_csv('/tmp/test.csv', sep='\t', encoding='utf-8', index=False)

In [None]:
# Build the classifier, pointing it at the training split written to disk.
# NOTE(review): the exact effect of `dump` and `debug` is defined by
# BiasClassifier, which is not visible here — confirm before changing them.
classifier_options = dict(
    train_data='/tmp/train.csv',
    dump=True,
    debug=True,
)
clf = BiasClassifier(**classifier_options)

In [None]:
# Read the held-out split back from disk (tab-separated, UTF-8) and pass it
# through the classifier.
to_classify = pd.read_csv(
    '/tmp/test.csv',
    sep='\t',
    encoding='utf-8',
)
classified_data = clf.classify(to_classify)

# Wrap whatever the classifier returned in a DataFrame for inspection and scoring.
output = pd.DataFrame(data=classified_data)

In [None]:
# Display the classifier output for a visual sanity check before scoring it.
output

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or NaN when the denominator is 0.

    A class can be absent from the test split (or never predicted), in which
    case the metric is undefined rather than an error.
    """
    if not denominator:
        return float('nan')
    return float(numerator) / denominator


# Only rows whose true label is left (1) or right (2) are scored; rows with
# label 0 are ignored.
left_scores = output[output.bias == 1]['bias_prediction']
right_scores = output[output.bias == 2]['bias_prediction']

count_left = len(left_scores)
count_right = len(right_scores)

# NOTE(review): assumes a negative bias_prediction means "left" and a positive
# one means "right" — confirm against BiasClassifier. A score of exactly 0 is
# counted as a miss for whichever class the row belongs to.
# The right-hand test mirrors the left-hand one: previously both classes used
# the left-hand sign test, so right articles scored as left counted as correct.
correct_left = int((left_scores < 0).sum())
incorrect_left = int((left_scores >= 0).sum())
correct_right = int((right_scores > 0).sum())
incorrect_right = int((right_scores <= 0).sum())

# With only two scored classes, a missed row of one class is treated as a false
# positive for the other class.
precision_left = _safe_ratio(correct_left, correct_left + incorrect_right)
precision_right = _safe_ratio(correct_right, correct_right + incorrect_left)

recall_left = _safe_ratio(correct_left, correct_left + incorrect_left)
recall_right = _safe_ratio(correct_right, correct_right + incorrect_right)

print('Left')
print('Precision: ' + str(precision_left))
print('Recall: ' + str(recall_left))
print('Right')
print('Precision: ' + str(precision_right))
print('Recall: ' + str(recall_right))