In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

from jjuoda_dl4.utils import make_train_dataframes, BASE_DATA_DIR

# Notebook-wide configuration.
# RANDOM_SEED is the default seed of make_equal_split, so samples are repeatable.
RANDOM_SEED = 42
# Show full cell contents (e.g. whole article titles) instead of truncating them.
pd.set_option("display.max_colwidth", None)

In [3]:
# Load the interim NELA-GT-2018 tables; the first CSV column becomes the index.
articles_csv_path = BASE_DATA_DIR / "interim/nela-gt-2018-articles.csv"
scores_csv_path = BASE_DATA_DIR / "interim/nela-gt-2018-scores.csv"

nela_gt_2018_articles_df = pd.read_csv(articles_csv_path, index_col=0)
nela_gt_2018_scores_df = pd.read_csv(scores_csv_path, index_col=0)

In [4]:
# Label an article as fake when its source has a negative score.
nela_gt_2018_articles_df["is_fake"] = nela_gt_2018_articles_df["source_score"].lt(0)

### Differentiating real news from fake news

In [5]:
def make_equal_split(
    articles_df, scores, n_articles=10, seed=RANDOM_SEED, shuffle_seed=RANDOM_SEED
):
    """Draw the same number of articles for each requested source score.

    Parameters
    ----------
    articles_df : pandas.DataFrame
        Article table; must contain a ``source_score`` column.
    scores : iterable
        Source-score values to sample from. Rows are selected with
        ``source_score == score``.
    n_articles : int, default 10
        Number of rows drawn for each score.
    seed : int, default RANDOM_SEED
        ``random_state`` used when drawing the per-score samples.
    shuffle_seed : int, default RANDOM_SEED
        ``random_state`` used for the final shuffle. The shuffle has always
        used ``RANDOM_SEED`` regardless of ``seed``; the default keeps that
        behaviour so existing row orders are reproduced exactly. Pass
        ``shuffle_seed=seed`` to make the whole result depend on one seed.

    Returns
    -------
    pandas.DataFrame
        The concatenated per-score samples in shuffled row order, keeping
        the original index labels.

    Raises
    ------
    ValueError
        Propagated from pandas when a score has fewer than ``n_articles``
        rows, or when ``scores`` is empty (nothing to concatenate).
    """
    per_score_samples = [
        articles_df[articles_df.source_score == score].sample(
            n_articles, random_state=seed
        )
        for score in scores
    ]
    # Shuffle so that rows are not grouped by score.
    # NOTE: hand-entered guess arrays in this notebook are attached to the
    # result positionally, so the default shuffle order must stay stable.
    return pd.concat(per_score_samples).sample(frac=1, random_state=shuffle_seed)

In [6]:
# Ten articles (the default n_articles) for each of the source scores -2 and 2.
clear_cut_scores = [-2, 2]
should_be_clear = make_equal_split(nela_gt_2018_articles_df, clear_cut_scores)
# Show only lower-cased titles for guessing
# (presumably so capitalisation does not hint at the source).
should_be_clear["title"].str.lower()

108150                              whats in your water 6 reasons why you should never drink from the tap
494567                                     obama endorses long list of candidates ahead of 2018 elections
548544    flight mh370 new documentary promises to aposroll back the wavesapos to solve mystery of lost a
182303                                 president orders pentagon to create space force branch of military
16943                                   trump defends decision to congratulate putin attacks crazed media
238642                                          lava from hawaii volcano enters ocean creates toxic cloud
448130                                  don snow trump channels game of thrones with sanctions are coming
238627                                                                           robots grow human organs
433987                                                       why you shouldnt obsess about overpopulation
620496                                       p

In [7]:
# Manual guesses for the sample above, in displayed row order (1 = guessed fake).
julius_guess = np.array(
    [1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0]
).astype(bool)
# Share of articles guessed to be fake.
julius_guess.mean()

0.4

Hm, so I'm biased towards believing that news is true, even though I know the split is 50/50 :D

In [8]:
# Percentage of guesses that match the is_fake label (compared positionally).
julius_accuracy_pct = np.mean(julius_guess == should_be_clear["is_fake"]) * 100
print("Julius's accuracy: {:.2f}%".format(julius_accuracy_pct))

Julius's accuracy: 40.00%


In [9]:
# Attach the guesses (positionally) and compare them with the labels and sources.
should_be_clear["julius_guess"] = julius_guess
review_columns = ["title", "julius_guess", "is_fake", "source"]
should_be_clear.loc[:, review_columns]

Unnamed: 0,title,julius_guess,is_fake,source
108150,Whats in your water 6 Reasons why you should never drink from the tap,True,True,Natural News
494567,Obama Endorses Long List Of Candidates Ahead Of 2018 Elections,False,False,Talking Points Memo
548544,Flight MH370 New documentary promises to aposroll back the wavesapos to solve mystery of lost a,True,False,The Independent
182303,President orders Pentagon to create space force branch of military,False,True,Drudge Report
16943,Trump defends decision to congratulate Putin attacks crazed media,True,True,Daily Mail
238642,Lava from Hawaii volcano enters ocean creates toxic cloud,False,True,Drudge Report
448130,Don Snow Trump Channels Game Of Thrones With Sanctions Are Coming,True,False,Talking Points Memo
238627,ROBOTS GROW HUMAN ORGANS,False,True,Drudge Report
433987,Why you shouldnt obsess about overpopulation,False,False,Vox
620496,President Trump Is Helping Make Chinese Research Great Again,True,False,Fortune


Ok, my accuracy is slightly lower than expected from random guessing. Title might not be enough. Let's try again with another seed.

In [10]:
# Second round: same scores (-2 and 2), per-score samples drawn with seed 2.
should_be_clear_2 = make_equal_split(
    articles_df=nela_gt_2018_articles_df, scores=[-2, 2], seed=2
)
# Lower-cased titles only, for guessing.
should_be_clear_2["title"].str.lower()

88615                                        president trump confirms 2020 re-election bid on drudge report
141284              parkland survivors tell off florida politicians weve had enough of thoughts and prayers
572577                                    democrats activists rally against trumps family separation policy
229944                        obama prepares for the campaign trail but some democrats want him to back off
368360                     trump will sanction more airstrikes against syria if assad uses chemical weapons
567855                                                          erie pa ties snow season record more on way
628972          mumsnet and facebook putting women off natural birth with apostsunami of horror storiesapos
380575                                  trumps lawyer to reveal how president conspired to corrupt election
477195                               meghan markles pregnancy may be further along than previously reported
278354                      

In [11]:
# Julius's guesses for the second sample, in displayed row order (1 = guessed fake).
julius_guess_2 = np.array(
    [1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1]
).astype(bool)
# Share of articles guessed to be fake.
np.mean(julius_guess_2)

0.65

In [12]:
# Julius's accuracy on the second sample (fraction of guesses matching is_fake).
np.mean(julius_guess_2 == should_be_clear_2["is_fake"])

0.65

In [13]:
# Ugnius's guesses for the second sample, in displayed row order (1 = guessed fake).
ugnius_guess_2 = np.array(
    [1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1]
).astype(bool)
# Ugnius's accuracy (fraction of guesses matching is_fake).
np.mean(ugnius_guess_2 == should_be_clear_2["is_fake"])

0.55

In [16]:
# Attach both sets of guesses (positionally) and compare with labels and sources.
should_be_clear_2["julius_guess"] = julius_guess_2
should_be_clear_2["ugnius_guess"] = ugnius_guess_2
review_columns_2 = ["title", "julius_guess", "ugnius_guess", "is_fake", "source"]
should_be_clear_2.loc[:, review_columns_2]

Unnamed: 0,title,julius_guess,ugnius_guess,is_fake,source
88615,President Trump Confirms 2020 Re-Election Bid on Drudge Report,True,True,True,The Political Insider
141284,Parkland Survivors Tell Off Florida Politicians Weve Had Enough of Thoughts and Prayers,True,False,False,Daily Beast
572577,Democrats activists rally against Trumps family separation policy,False,False,False,CNBC
229944,Obama Prepares For The Campaign Trail But Some Democrats Want Him To Back Off,True,False,True,The Political Insider
368360,Trump will sanction MORE airstrikes against Syria if Assad uses chemical weapons,False,False,True,Daily Mail
567855,Erie PA Ties Snow Season Record More On Way,False,True,True,Drudge Report
628972,Mumsnet and Facebook putting women off natural birth with apostsunami of horror storiesapos,True,True,False,The Independent
380575,Trumps lawyer to reveal how President conspired to corrupt election,True,False,True,Daily Mail
477195,Meghan Markles pregnancy may be further along than previously reported,True,True,False,Mercury News
278354,Brexit deal will be agreed within three weeks Dominic Raab tells MPs,False,True,False,The Independent


It's hilarious that my accuracy equals the proportion of articles I guessed to be fake. Still, it shows that distinguishing reliable sources from unreliable ones is hard when all you have is the title.