In [1]:
import os
os.chdir("D:/Fake News Detection")

In [2]:
import pandas as pd
# Importing fake news dataset
fake=pd.read_csv('politifact_fake.csv')
fake["Remarks"]="fake"

# Importing real news dataset
real=pd.read_csv('politifact_real.csv')
real["Remarks"]="real"

#joining the 2 dataframes
merged=pd.concat([fake,real],join="inner")
dataframe =merged.copy()
dataframe.head()

Unnamed: 0,id,news_url,title,tweet_ids,Remarks
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,fake
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,fake
2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...,fake
3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...,fake
4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...,fake


In [3]:
x = dataframe['title']
y = dataframe['Remarks']

In [4]:
x

0      BREAKING: First NFL Team Declares Bankruptcy O...
1      Court Orders Obama To Pay $400 Million In Rest...
2      UPDATE: Second Roy Moore Accuser Works For Mic...
3             Oscar Pistorius Attempts To Commit Suicide
4            Trump Votes For Death Penalty For Being Gay
                             ...                        
619    Flake: “Religious tests should have no place i...
620                             Change We Can Believe In
621    deputy director of national health statistics ...
622    Romneys ProLife Conversion Myth or Reality Jun...
623                               Interest Group Ratings
Name: title, Length: 1097, dtype: object

In [5]:
y

0      fake
1      fake
2      fake
3      fake
4      fake
       ... 
619    real
620    real
621    real
622    real
623    real
Name: Remarks, Length: 1097, dtype: object

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [7]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)
y_train

384    fake
351    fake
471    fake
170    real
188    fake
       ... 
560    real
290    real
362    real
86     real
211    real
Name: Remarks, Length: 877, dtype: object

In [8]:
y_train

384    fake
351    fake
471    fake
170    real
188    fake
       ... 
560    real
290    real
362    real
86     real
211    real
Name: Remarks, Length: 877, dtype: object

In [9]:
import numpy as np
tfvect = TfidfVectorizer(stop_words='english',max_df=0.2)
tfid_x_train = tfvect.fit_transform(x_train.apply(lambda x: np.str_(x)))
tfid_x_test = tfvect.transform(x_test.apply(lambda x: np.str_(x)))

* max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
* max_df = 25 means "ignore terms that appear in more than 25 documents".

In [10]:
classifier = PassiveAggressiveClassifier(max_iter=50)
classifier.fit(tfid_x_train,y_train)

PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=50, n_iter_no_change=5,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False)

In [11]:
y_pred = classifier.predict(tfid_x_test)
score = accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 85.45%


In [12]:
cf = confusion_matrix(y_test,y_pred, labels=['fake','real'])
print(cf)

[[ 64  11]
 [ 21 124]]


In [14]:
def fake_news_det(news):
    input_data = [news]
    vectorized_input_data = tfvect.transform(input_data)
    prediction = classifier.predict(vectorized_input_data)
    print(prediction)

In [15]:
fake_news_det('Court Orders Obama To Pay $400 Million In Restitution')

['fake']


In [16]:
fake_news_det("Target to Discontinue Sale of Holy Bible")

['fake']


In [17]:
import pickle
pickle.dump(classifier,open('model.pkl', 'wb'))

In [18]:
# load the model from disk
loaded_model = pickle.load(open('model.pkl', 'rb'))

In [20]:
def fake_news_det1(news):
    input_data = [news]
    vectorized_input_data = tfvect.transform(input_data)
    prediction = loaded_model.predict(vectorized_input_data)
    print(prediction)

In [21]:
fake_news_det1("'This Week' Transcript: Holder and Giuliani")

['real']


In [22]:
fake_news_det1("Who Benefits from President Trumpâ€™s Child Care Proposals")

['real']


In [23]:
fake_news_det('"New Energy" Ad')

['real']
