In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
import pickle

In [2]:
# Load the news dataset from a CSV file given by a relative path.
# NOTE(review): the resulting frame carries an "Unnamed: 0" column, which looks
# like a previously saved index — consider read_csv(..., index_col=0); confirm
# against the CSV before changing.
df = pd.read_csv("news_dataset.csv")

In [3]:
# Size of the loaded frame as (rows, columns).
df.shape

(28711, 5)

In [4]:
# Preview the first five rows to inspect the columns and label values.
df.head()

Unnamed: 0.1,Unnamed: 0,title,content,publication,label
0,0,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,100percentfedup,fake
1,1,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,100percentfedup,fake
2,2,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,100percentfedup,fake
3,3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,100percentfedup,fake
4,4,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,100percentfedup,fake


In [5]:
# Discard every row that has a missing value in any column. Rebinding the name
# (rather than mutating in place) yields the same cleaned frame.
df = df.dropna()

In [6]:
# Sanity check: one boolean per column, True if that column still has a null.
df.isnull().any()

Unnamed: 0     False
title          False
content        False
publication    False
label          False
dtype: bool

In [7]:
# Target series: the `label` column of the cleaned frame.
labels = df["label"]
labels.head()

0    fake
1    fake
2    fake
3    fake
4    fake
Name: label, dtype: object

In [9]:
# Hold out 20% of the articles for evaluation; the fixed seed keeps the split
# identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    df["content"],
    labels,
    test_size=0.2,
    random_state=7,
)

In [10]:
# Coerce every document to text before vectorizing. The vectorized astype(str)
# replaces a per-element lambda wrapped around np.str_.
x_train = x_train.astype(str)
x_test = x_test.astype(str)

In [11]:
# TF-IDF features: English stop words are removed, and max_df=0.7 ignores terms
# whose document frequency exceeds 70% of the documents.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

In [12]:
# Learn the vocabulary and IDF weights from the training texts only, and
# return their TF-IDF matrix.
tfidf_train = tfidf_vectorizer.fit_transform(x_train)

In [13]:
# Encode the held-out texts with the already-fitted vectorizer (no refit).
tfidf_test  = tfidf_vectorizer.transform(x_test)

In [14]:
# Linear Passive-Aggressive classifier capped at 50 passes over the training
# data. The estimator shuffles samples between epochs by default, so
# random_state is pinned to make the fitted model and the reported metrics
# repeatable across runs (same seed as the train/test split).
pac = PassiveAggressiveClassifier(max_iter=50, random_state=7)

In [15]:
# Train the classifier on the training TF-IDF matrix and its labels.
pac.fit(tfidf_train,y_train)

PassiveAggressiveClassifier(max_iter=50)

In [16]:
# Predict labels for the held-out TF-IDF matrix.
y_pred=pac.predict(tfidf_test)

In [17]:
# Fraction of held-out samples whose predicted label matches the true label.
score=accuracy_score(y_test,y_pred)

In [18]:
# Report the held-out accuracy as a percentage rounded to two decimals.
accuracy_pct = round(score * 100, 2)
print(f'Accuracy: {accuracy_pct}%')

Accuracy: 93.82%


In [19]:
# Rows are true labels and columns are predicted labels, both ordered
# 'fake' then 'real'.
confusion_matrix(y_test,y_pred, labels=['fake','real'])

array([[2228,  198],
       [ 148, 3023]])

In [20]:
# Second model: TF-IDF features (English stop words removed) feeding a
# multinomial Naive Bayes classifier, bundled so it accepts raw text directly.
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('nbmodel', MultinomialNB()),
    ]
)

In [21]:
# Fit both pipeline steps (vectorizer, then classifier) on the training texts.
pipeline.fit(x_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
                ('nbmodel', MultinomialNB())])

In [22]:
# Mean accuracy of the pipeline on the held-out texts.
pipeline.score(x_test,y_test)

0.7025192067178846

In [23]:
# Held-out predictions from the Naive Bayes pipeline.
pred = pipeline.predict(x_test)

In [24]:
# Per-class precision, recall, F1 and support for the pipeline's predictions.
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

        fake       0.99      0.32      0.48      2426
        real       0.66      1.00      0.79      3171

    accuracy                           0.70      5597
   macro avg       0.82      0.66      0.64      5597
weighted avg       0.80      0.70      0.66      5597



In [25]:
# Confusion matrix for the pipeline (rows = true labels, columns = predicted).
# The label order is passed explicitly, as in the Passive-Aggressive
# confusion-matrix cell, so the row/column meaning does not rely on the
# default sorted-label order.
print(confusion_matrix(y_test, pred, labels=['fake', 'real']))

[[ 766 1660]
 [   5 3166]]


In [26]:
# Persist the fitted TF-IDF + Naive Bayes pipeline to disk.
# NOTE(review): the '.pk1' suffix (digit one) may be a typo for '.pkl'; kept
# as-is because other code may open the file by this exact name — confirm.
# NOTE(review): this saves the Naive Bayes pipeline even though the
# Passive-Aggressive model scored higher on the held-out set — confirm intent.
with open('model.pk1', mode='wb') as model_file:
    pickle.dump(pipeline, model_file, protocol=pickle.HIGHEST_PROTOCOL)