# Facts: collecting datasets


In [None]:
#@title

!pip -q install shap

import os
from functools import reduce

import numpy as np
import pandas as pd
import tensorflow as tf
import shap
from sklearn.preprocessing import LabelEncoder, StandardScaler

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from google.colab import drive

# Plot styling: an 8-colour palette taken from the HLS colour space.
hls_palette = sns.color_palette("hls", 8)
sns.set(palette=hls_palette)

# No cap on the number of columns shown when a DataFrame is rendered.
pd.options.display.max_columns = None
# pd.options.display.max_colwidth = None

from IPython.display import display_html
def display_side_by_side(*args):
    """Render several DataFrames next to each other in the notebook output.

    Each argument is converted with ``to_html()`` and every opening
    ``<table`` tag receives an inline display style, so the tables flow
    horizontally instead of stacking vertically.

    Args:
        *args: objects exposing ``to_html()`` (DataFrames in this notebook).
    """
    html_str = ''.join(df.to_html() for df in args)
    # Patch only the opening tags: replacing the bare word "table" would also
    # rewrite closing tags and any cell text that happens to contain "table".
    inline_html = html_str.replace('<table', '<table style="display:inline"')
    display_html(inline_html, raw=True)

# Mount Google Drive at /content/drive; the dataset paths used in this
# notebook are located under that mount point.
drive.mount('/content/drive')

In [None]:
# Seed for the shared random state created below.
SEED = 180120342
# Destination of the merged dataset written at the end of the notebook.
OUTPUT_DATASET = '/content/drive/My Drive/datasets/facts.csv'

# Random state passed to the 20 newsgroups loader.
rs = np.random.RandomState(SEED)

#### 20 Newsgroups

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Load both splits; the shared random state `rs` is handed to the loader.
x = fetch_20newsgroups(subset='train', random_state=rs)
t = fetch_20newsgroups(subset='test', random_state=rs)
# Category names, indexable by the integer targets of either split.
labels = np.asarray(x.target_names)

x = pd.DataFrame({'text': x.data, 'category': labels[x.target], 'stage': 'train'})
xt = pd.DataFrame({'text': t.data, 'category': labels[t.target], 'stage': 'test'})
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# ignore_index gives the stacked frame unique row labels.
# Every row in this corpus is labelled 'true'.
d20newsgroups = (pd.concat([x, xt], ignore_index=True)
                 .assign(target='true', source='20newsgroups'))

d20newsgroups.head(1)

In [None]:
# Corpus size and the number of rows in each stage.
stage_names, stage_counts = np.unique(d20newsgroups.stage, return_counts=True)
print('samples:', len(d20newsgroups))
print(dict(zip(stage_names, stage_counts)))
d20newsgroups.describe()

#### Fake and real news dataset

[fake-and-real-news-dataset](https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset)

In [None]:
# Zip archive on the mounted Drive; presumably the Kaggle bundle linked
# above (TODO confirm the file matches that dataset).
DATASET = ('/content/drive/My Drive/Colab Notebooks/ml-notes/'
           'datasets/572515_1037534_bundle_archive.zip')

In [None]:
import zipfile
from sklearn.model_selection import train_test_split

# Unpack the archive locally, then read True.csv and Fake.csv from the
# extraction directory.
with zipfile.ZipFile(DATASET) as archive:
    archive.extractall('./ds/')

t = pd.read_csv('./ds/True.csv')
f = pd.read_csv('./ds/Fake.csv')

def preprocess(t, f):
    """Merge the true and fake article frames and parse their dates.

    The inputs are left untouched: the ``target`` column is added to copies,
    not to the caller's frames.

    Args:
        t: DataFrame of genuine articles; must contain ``date`` and
            ``subject`` columns.
        f: DataFrame of fake articles with the same columns.

    Returns:
        A DataFrame with ``target`` ('true' / 'fake') and a parsed
        ``created_at`` column, ``subject`` renamed to ``category`` and the
        raw ``date`` column removed. Rows containing any missing value
        (including dates that match none of the formats) are dropped.
    """
    # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
    # ignore_index avoids duplicated row labels in the stacked frame.
    combined = pd.concat(
        [t.assign(target='true'), f.assign(target='fake')],
        ignore_index=True,
    )

    raw_dates = combined['date'].str.strip()

    # Try each date format in turn; a later format only fills the rows that
    # every earlier format failed to parse.
    created_at = pd.to_datetime(raw_dates, format='%B %d, %Y', errors='coerce')
    for fallback_format in ('%b %d, %Y', '%d-%b-%y'):
        parsed = pd.to_datetime(raw_dates, format=fallback_format, errors='coerce')
        created_at = created_at.fillna(parsed)
    combined['created_at'] = created_at

    return (combined
            .dropna()
            .rename(columns={'subject': 'category'})
            .drop(columns=['date']))

# Tag the source and order the rows chronologically. sort_values returns a
# new frame, so its result has to be kept for the positional lookup below
# to really pick the median date.
fake_and_real_news = (preprocess(t, f)
                      .assign(source='fake_and_real_news')
                      .sort_values('created_at'))

# Temporal split: articles dated before the median one are 'train',
# everything from the median date onwards is 'test'.
middle = fake_and_real_news.iloc[len(fake_and_real_news) // 2].created_at
fake_and_real_news['stage'] = 'train'
fake_and_real_news.loc[fake_and_real_news.created_at >= middle, 'stage'] = 'test'

# The date was only needed to derive the split.
fake_and_real_news = fake_and_real_news.drop(columns=['created_at'])

fake_and_real_news.head(1)

In [None]:
# Dataset size and the number of rows in each stage.
stage_names, stage_counts = np.unique(fake_and_real_news.stage, return_counts=True)
print('samples:', len(fake_and_real_news))
print(dict(zip(stage_names, stage_counts)))
fake_and_real_news.describe()

#### Fake News

https://www.kaggle.com/c/fake-news/data?select=train.csv

In [None]:
# Drive folder from which train.csv.zip, test.csv.zip and submit.csv are read.
DATASET_DIR = '/content/drive/My Drive/datasets/fake-news'

In [None]:
# Unpack both splits into the same local working directory.
for archive_name in ('train.csv.zip', 'test.csv.zip'):
    with zipfile.ZipFile(DATASET_DIR + '/' + archive_name) as z:
        z.extractall('./dsfn/')

x, t = (pd.read_csv('./dsfn/train.csv').assign(stage='train'),
        pd.read_csv('./dsfn/test.csv').assign(stage='test'))

# Labels for the test rows are taken from submit.csv, matched on id
# (presumably test.csv itself has no label column; verify against the data).
s = pd.read_csv(DATASET_DIR + '/submit.csv')

t = t.merge(s, how='left', on='id')

# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
fake_news_2 = pd.concat([x, t], ignore_index=True)

# Replace the whole column with the string labels instead of writing strings
# into a numeric column through .loc (a silent upcast that pandas has
# deprecated). Values other than 0/1, including missing ones, pass through
# unchanged.
label_names = {1: 'unreliable', 0: 'reliable'}
fake_news_2['label'] = fake_news_2['label'].map(
    lambda value: label_names.get(value, value))

fake_news_2 = (fake_news_2
               .rename(columns={'label': 'target'})
               .drop(columns=['id'])
               .assign(source='kg_fake_news_2'))

In [None]:
# Shapes of the two splits and the number of rows per stage after merging.
stage_names, stage_counts = np.unique(fake_news_2.stage, return_counts=True)
x.shape, t.shape, dict(zip(stage_names, stage_counts))

In [None]:
# Preview the first row of the prepared frame.
fake_news_2.head(1)

#### Saving

In [None]:
# Every prepared dataset, in the order in which they are combined.
all_sets = [d20newsgroups, fake_and_real_news, fake_news_2]

In [None]:
# Union of the column names used by any dataset. set().union also copes with
# an empty all_sets, where reduce() without an initial value would raise.
all_columns = set().union(*(frame.columns for frame in all_sets))

# Give every frame the full column set so the combined schema is uniform.
# Missing columns are added in sorted order: iterating the raw set difference
# would make the resulting column order depend on string hash randomisation.
for frame in all_sets:
    for column in sorted(all_columns - set(frame.columns)):
        frame[column] = None

In [None]:
# Stack all datasets row-wise into a single frame and preview the first row.
x = pd.concat(all_sets)
x.head(1)

In [None]:
# Report the row count and, per column, the share of missing values.
null_share = x.isnull().mean().round(2)
print('samples:', len(x))
print('Null values:')
print(null_share)

x.describe()

In [None]:
# Persist the merged dataset; the row index is not written to the file.
x.to_csv(OUTPUT_DATASET, index=False)