In [1]:
#let's import required libraries

In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
df = pd.read_csv('./data/fake_news_data.csv')

In [3]:
df

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
id        20800 non-null int64
title     20242 non-null object
author    18843 non-null object
text      20761 non-null object
label     20800 non-null int64
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [5]:
df.dtypes

id         int64
title     object
author    object
text      object
label      int64
dtype: object

In [6]:
df.shape

(20800, 5)

In [7]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [8]:
#let's check for null values in data set
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [9]:
#let's remove empty data
df = df.dropna()

In [10]:
df.shape

(18285, 5)

In [11]:
labels=df.label
labels.head()

0    1
1    0
2    1
3    1
4    1
Name: label, dtype: int64

In [12]:
x_train,x_test,y_train,y_test=train_test_split(df['text'], labels, test_size=0.3, random_state=7)

In [13]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(12799,) (5486,) (12799,) (5486,)


In [15]:
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

In [21]:
# get the first vector out (for the first document)
first_vector_tfidfvectorizer=tfidf_train[0]
 
# place tf-idf values in a data frame
df_tfidf = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])
df_tfidf.sort_values(by=["tfidf"],ascending=False)
#We can below tfidf scores of works

Unnamed: 0,tfidf
kahn,0.262575
strauss,0.260699
oligarchy,0.251948
hillary,0.244037
presstitutes,0.237572
...,...
gem,0.000000
gelöst,0.000000
gelés,0.000000
gelände,0.000000


In [22]:
#Let's apply PassiveAggressiveClassifier and check model accuracy
pac_model = PassiveAggressiveClassifier(max_iter=50)
pac_model.fit(tfidf_train,y_train)
y_pred = pac_model.predict(tfidf_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 96.21%


In [46]:
y_test

7961     1
4341     0
10712    0
20378    1
15780    0
        ..
18240    0
9532     1
3167     1
18673    0
19371    0
Name: label, Length: 5486, dtype: int64

In [16]:
confusion_matrix(y_test,y_pred, labels=[0,1])


array([[3020,   94],
       [ 114, 2258]])

In [17]:
#So with this model, we have 3020 true positives, 94 true negatives, 114 false positives, and 2258 false negatives.

In [34]:
#Lets check with Random Forest
from sklearn.ensemble import RandomForestClassifier
rfc_model = RandomForestClassifier()
rfc_model.fit(tfidf_train,y_train)
rfc_pred = rfc_model.predict(tfidf_test)
score = accuracy_score(y_test,rfc_pred)
print(f'Accuracy: {round(score*100,2)}%')



Accuracy: 86.0%


In [35]:
confusion_matrix(y_test,rfc_pred, labels=[0,1])

array([[2950,  164],
       [ 604, 1768]])

In [36]:
#So with this model, we have 2950 true positives, 164 true negatives, 604 false positives, and 1768 false negatives.

In [39]:
#Lets check with Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
dtc_model = DecisionTreeClassifier()
dtc_model.fit(tfidf_train,y_train)
dtc_pred = dtc_model.predict(tfidf_test)
score = accuracy_score(y_test,dtc_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 88.19%


In [40]:
confusion_matrix(y_test,dtc_pred, labels=[0,1])

array([[2754,  360],
       [ 288, 2084]])

In [25]:
test_df = pd.read_csv('./data/test.csv')

In [26]:
test_df

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
...,...,...,...,...
5195,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...
5196,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...
5197,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...
5198,25998,300 US Marines To Be Deployed To Russian Borde...,,« Previous - Next » 300 US Marines To Be Deplo...


In [27]:
test_df.isna().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [28]:
test_df.dtypes


id         int64
title     object
author    object
text      object
dtype: object

In [29]:
# Handling missing value
for i in test_df.columns:
    test_df[i] = test_df[i].fillna(test_df[i].mode()[0])

In [30]:
test_df.shape
test_df.reset_index(inplace =True)

In [31]:
test_df

Unnamed: 0,index,id,title,author,text
0,0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,1,20801,Russian warships ready to strike terrorists ne...,Pam Key,Russian warships ready to strike terrorists ne...
2,2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
...,...,...,...,...,...
5195,5195,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...
5196,5196,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...
5197,5197,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...
5198,5198,25998,300 US Marines To Be Deployed To Russian Borde...,Pam Key,« Previous - Next » 300 US Marines To Be Deplo...


In [32]:
tfidf_test_df=tfidf_vectorizer.transform(test_df['text'])

In [33]:
# get the first vector out (for the first document)
first_vector_tfidfvectorizer_test = tfidf_test_df[0]
 
# place tf-idf values in a data frame
df_tfidf_test = pd.DataFrame(first_vector_tfidfvectorizer_test.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])
df_tfidf_test.sort_values(by=["tfidf"],ascending=False)
#We can below tfidf scores of works

Unnamed: 0,tfidf
tech,0.288783
silicon,0.250816
mr,0.228603
hoffman,0.225522
venture,0.224439
...,...
gemi,0.000000
gemerkt,0.000000
gemeliers,0.000000
gemeldet,0.000000


In [35]:
tfidf_test_df.shape

(5200, 134602)

In [36]:
y_pred_test_df = pac.predict(tfidf_test_df)

In [37]:
y_pred_test_df

array([0, 1, 1, ..., 0, 1, 0])

In [44]:
len(y_pred_test_df)

5200