# NLP Tutorial: Fake News Detection - Bag Of Words (BOW)

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the labelled news corpus from a CSV in the working directory and display it.
df = pd.read_csv('fake_and_real_news.csv')
df

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real
...,...,...
9895,Wikileaks Admits To Screwing Up IMMENSELY Wit...,Fake
9896,Trump consults Republican senators on Fed chie...,Real
9897,Trump lawyers say judge lacks jurisdiction for...,Real
9898,WATCH: Right-Wing Pastor Falsely Credits Trum...,Fake


In [5]:
# Class balance: how many articles carry each label.
df["label"].value_counts()

label
Fake    5000
Real    4900
Name: count, dtype: int64

In [6]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(9900, 2)

## Train Test Split

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the articles for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df["Text"],
    df["label"],
    test_size=0.2,
    random_state=1,
)

In [9]:
# Number of training texts produced by the split.
X_train.shape

(7920,)

In [10]:
# Number of held-out test texts produced by the split.
X_test.shape

(1980,)

In [11]:
# Preview the training texts (a Series that keeps the original row labels of df).
X_train

5680    U.S. Senate votes near unanimously for Russia,...
9144     Donald Trump Nominates The Only Black Guy He ...
837      ‘Charlottesville 3.0’: Nazis Descend Upon Col...
5140    Republican Senator Alexander to consult on bip...
9568     Federal Judge Rules That Trump DID Incite Vio...
                              ...                        
2895     Trump Demands Apology From The Media And Gets...
7813     REPORT: Trump Will Likely Force US Taxpayers ...
905     Commerce Secretary says Trump-Xi talks will ad...
5192    Trump's legal team spokesman resigns amid repo...
235      Trump Just Tweeted The FBI To Show Them A Fox...
Name: Text, Length: 7920, dtype: object

In [12]:
# Preview the held-out test texts.
X_test

4322     WATCH: Fox Legal Analyst DESTROYS Trump For W...
8862    Social media executives to testify Nov. 1 abou...
8024     Canadian Hilariously HUMILIATES Three Trump S...
260     Factbox: Trump plan to dismantle 'Dreamer' pro...
6998    Trump to host Italian prime minister on April ...
                              ...                        
8887    Trump travel ban on more solid ground as top c...
2618    Mexican president to meet with Trump at G20 ME...
608      GOP Rep Blames Media For Trump’s Attacks On S...
3086    No. 2 Senate Republican: health bill to be dis...
2359     Pro-Trump Group Is Now Using Pictures Of An O...
Name: Text, Length: 1980, dtype: object

In [13]:
# Check what .values returns for the Series; the vectorizer below is fitted on this object.
type(X_train.values)

numpy.ndarray

In [14]:
# Show the raw training texts as an array of strings.
# NOTE(review): each element is a full article, so this output is very long.
X_train.values

array(['U.S. Senate votes near unanimously for Russia, Iran sanctions WASHINGTON (Reuters) - The U.S. Senate voted nearly unanimously on Thursday for legislation to impose new sanctions on Russia and force President Donald Trump to get Congress’ approval before easing any existing sanctions on Russia. In a move that could complicate U.S. President Donald Trump’s desire for warmer relations with Moscow, the Senate backed the measure by 98-2. Republican Senator Rand Paul and Bernie Sanders, an independent who caucuses with the Democrats, were the only two “no” votes. The measure is intended to punish Russia for meddling in the 2016 U.S. election, its annexation of Ukraine’s Crimea region and support for Syria’s government in the six-year-long civil war. If passed in the House of Representatives and signed into law by Trump, it would put into law sanctions previously established via former President Barack Obama’s executive orders, including some on Russian energy projects. The legislatio

## Create a bag-of-words representation using CountVectorizer

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
# Bag-of-words vectorizer with default settings.
v = CountVectorizer()

In [18]:
# Learn the vocabulary from the training texts only and encode them as a
# document-term count matrix; the bare name below displays its summary.
X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<7920x52212 sparse matrix of type '<class 'numpy.int64'>'
	with 1740470 stored elements in Compressed Sparse Row format>

In [19]:
# Dense view of the training counts.
# NOTE(review): this materialises the whole matrix just to show a truncated preview;
# at 7920 x 52212 int64 entries that is roughly 3.3 GB of memory.
X_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [20]:
# (number of training documents, vocabulary size) of the count matrix.
X_train_cv.shape

(7920, 52212)

In [21]:
# Dense count vector of the first training document only.
X_train_cv[0].toarray()

array([[0, 0, 0, ..., 0, 0, 0]])

In [22]:
# Vocabulary terms learned by the vectorizer; used below as column labels of the count matrix.
v.get_feature_names_out()

array(['00', '000', '0000', ..., 'zztaine', 'zzzzaaaacccchhh', 'émigré'],
      dtype=object)

In [23]:
# Label the document-term counts with their vocabulary terms for inspection.
# The table is built directly from the sparse matrix: densifying it first with
# .toarray() would allocate 7920 x 52212 int64 values (about 3.3 GB) only to
# render a truncated preview.
pd.DataFrame.sparse.from_spmatrix(X_train_cv, columns=v.get_feature_names_out())

Unnamed: 0,00,000,0000,000063,00007,00042,0009,000s,000th,001,...,zxf0xxckuo,zxsffo7sow,zyf8nsgkf0,zyklon,zypries,zzbluecomet,zzsg90pbf6,zztaine,zzzzaaaacccchhh,émigré
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7915,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7916,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7917,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7918,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Mapping from each vocabulary term to its integer column index.
v.vocabulary_

{'senate': 41929,
 'votes': 50034,
 'near': 32281,
 'unanimously': 48357,
 'for': 19394,
 'russia': 40755,
 'iran': 25053,
 'sanctions': 41074,
 'washington': 50348,
 'reuters': 39795,
 'the': 46463,
 'voted': 50029,
 'nearly': 32288,
 'on': 33724,
 'thursday': 46799,
 'legislation': 27972,
 'to': 47032,
 'impose': 23972,
 'new': 32491,
 'and': 5031,
 'force': 19405,
 'president': 36774,
 'donald': 15619,
 'trump': 47741,
 'get': 20571,
 'congress': 11934,
 'approval': 5456,
 'before': 7057,
 'easing': 16359,
 'any': 5284,
 'existing': 17930,
 'in': 24038,
 'move': 31612,
 'that': 46451,
 'could': 12533,
 'complicate': 11658,
 'desire': 14472,
 'warmer': 50299,
 'relations': 39108,
 'with': 51095,
 'moscow': 31513,
 'backed': 6382,
 'measure': 30208,
 'by': 9249,
 '98': 3313,
 'republican': 39440,
 'senator': 41932,
 'rand': 38171,
 'paul': 34933,
 'bernie': 7301,
 'sanders': 41085,
 'an': 4979,
 'independent': 24225,
 'who': 50820,
 'caucuses': 9898,
 'democrats': 14160,
 'were': 5062

In [25]:
# Dense NumPy copy of the training counts; the np.where and indexing cells below read it.
# NOTE(review): 7920 x 52212 int64 is roughly 3.3 GB; slice the sparse matrix first
# if memory is tight.
X_train_np = X_train_cv.toarray()
X_train_np

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [26]:
# Column indices of the vocabulary terms that occur in the first training document.
np.nonzero(X_train_np[0])

(array([  250,   636,   833,  2072,  3313,  3562,  3588,  3819,  3832,
         3891,  3956,  3976,  4206,  4217,  4276,  4329,  4642,  4673,
         4674,  4730,  4795,  4874,  4979,  5031,  5157,  5284,  5456,
         5538,  5541,  5706,  5926,  5996,  6114,  6372,  6382,  6386,
         6545,  6651,  6940,  6999,  7018,  7048,  7057,  7067,  7301,
         7521,  7862,  8013,  9177,  9249,  9461,  9898, 10168, 10189,
        10216, 10808, 11062, 11435, 11467, 11478, 11527, 11532, 11578,
        11658, 11832, 11934, 12133, 12182, 12396, 12533, 12590, 12854,
        13247, 13717, 13718, 13935, 14160, 14472, 14909, 15076, 15177,
        15619, 15940, 16182, 16351, 16359, 16563, 16685, 17047, 17098,
        17128, 17204, 17529, 17557, 17640, 17641, 17707, 17761, 17877,
        17930, 17964, 18048, 18055, 18133, 18658, 18879, 18914, 18985,
        19162, 19394, 19405, 19433, 19503, 19619, 19840, 20049, 20541,
        20571, 21089, 21582, 21759, 22123, 22187, 22663, 22782, 22902,
      

In [27]:
# Count of vocabulary column 1881 in the first training document.
X_train_np[0, 1881]

0

## Train the Naive Bayes model

In [29]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default hyper-parameters, fitted directly on the
# sparse training counts and their labels.
model = MultinomialNB()
model.fit(X_train_cv, y_train)

## Evaluate Performance

In [31]:
# Encode the held-out texts with the vocabulary learned from the training split
# (transform only, so nothing is re-fitted on test data).
X_test_cv = v.transform(X_test)
# Read the shape from the sparse matrix itself; calling .toarray() first would
# allocate a dense 1980 x 52212 int64 array (about 0.8 GB) for the same answer.
X_test_cv.shape

(1980, 52212)

In [32]:
# Accuracy of the fitted model on the held-out test set.
model.score(X_test_cv, y_test)

0.9762626262626263

In [33]:
from sklearn.metrics import classification_report

# Predicted labels for the held-out texts.
y_pred = model.predict(X_test_cv)

# Per-class precision, recall and F1 for the test predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        Fake       0.98      0.98      0.98      1020
        Real       0.97      0.98      0.98       960

    accuracy                           0.98      1980
   macro avg       0.98      0.98      0.98      1980
weighted avg       0.98      0.98      0.98      1980



## Testing the news

In [35]:
# Two hand-written example headlines to run through the trained classifier.
Text = [
    "Breaking: Massive earthquake shakes the west coast, causing widespread damage.",
    "Trump announces new economic policy, sparks debate among experts.",
]

In [36]:
# Encode the example headlines with the already-fitted vocabulary.
Text_count = v.transform(Text)
# The sparse matrix exposes .shape directly; no dense copy is needed to read it.
Text_count.shape

(2, 52212)

In [37]:
# Predicted label for each example headline.
model.predict(Text_count)

array(['Fake', 'Real'], dtype='<U4')

In [38]:
# Two more hand-written headlines to classify.
# NOTE(review): "nwes" is a typo for "news"; the next cell uses the same spelling,
# so rename both together.
current_nwes = [
    "World leaders gather for climate summit to address global warming crisis.",
    "Tech giant launches revolutionary AI tool, transforming industries overnight."
]

In [39]:
# Encode the headlines with the already-fitted vocabulary.
current_nwes_count = v.transform(current_nwes)
# The sparse matrix exposes .shape directly; no dense copy is needed to read it.
current_nwes_count.shape

(2, 52212)

In [40]:
# Predicted label for each of the two headlines.
model.predict(current_nwes_count)

array(['Real', 'Real'], dtype='<U4')

In [41]:
# Two headlines, presumably taken from BBC News going by the variable name (TODO confirm source).
# NOTE(review): "nwes" is a typo for "news"; the next cell reads this name as spelled here.
bbc_nwes = [
    "Russian criminals helped UK drug gangs launder lockdown cash",
    "Moment of big opportunity and high risk for Marine Le Pen"
]

In [42]:
# Encode the headlines with the already-fitted vocabulary.
bbc_news_count = v.transform(bbc_nwes)
# The sparse matrix exposes .shape directly; no dense copy is needed to read it.
bbc_news_count.shape

(2, 52212)

In [43]:
# Predicted label for each of the two headlines.
model.predict(bbc_news_count)

array(['Real', 'Real'], dtype='<U4')

## Testing more news samples

In [82]:
# Five longer, multi-sentence news-style passages to classify.
# NOTE(review): despite the name, these are news snippets rather than emails; the
# following cells read this name, so rename them together if it is changed.
email_dataset = [
    "Residents of New York City reported seeing strange lights in the sky last night, with some speculating that they were extraterrestrial beings. Scientists have yet to confirm or deny the sightings, but social media is abuzz with excitement.",
    "A groundbreaking study from a little-known research institute claims that chocolate, specifically dark chocolate, can cure cancer. Experts have raised concerns about the lack of peer-reviewed research to support these findings.",
    "NASA scientists have confirmed the discovery of a new exoplanet located in the habitable zone of its star. The planet has water, raising hopes of finding extraterrestrial life in the near future.",
    "Elon Musk has unveiled a detailed roadmap for SpaceX’s mission to colonize Mars by 2030. The ambitious project will involve sending multiple spacecraft to the red planet and establishing a permanent human presence.",
    "Experts predict that a new ice age will begin in 2025 due to shifts in the Earth's magnetic poles and climate patterns. Authorities are urging people to prepare for drastic weather changes."
]

In [84]:
# Encode the passages with the already-fitted vocabulary.
email_dataset_count = v.transform(email_dataset)
# The sparse matrix exposes .shape directly; no dense copy is needed to read it.
email_dataset_count.shape

(5, 52212)

In [86]:
# Predicted label for each of the five passages.
model.predict(email_dataset_count)

array(['Fake', 'Fake', 'Real', 'Real', 'Real'], dtype='<U4')