In [2]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

In [16]:
#Read the data
df=pd.read_csv('C:\\Users\\mayan\\OneDrive\\Desktop\\Mayank\\Summer Training 2021\\news.csv')

#Get shape and head
print(df.shape)
df.head()

(6335, 4)


Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [3]:
#Get the labels
labels=df.label
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [4]:
#Split the dataset
x_train,x_test,y_train,y_test=train_test_split(df['text'], labels, test_size=0.2, random_state=42)

In [5]:
#Initialize a TfidfVectorizer
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

#Fit and transform train set, transform test set
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

In [6]:
#Initialize a CountVectorizer
count_vectorizer=CountVectorizer(stop_words='english', max_df=0.7)

#Fit and transform train set, transform test set
count_train=count_vectorizer.fit_transform(x_train) 
count_test=count_vectorizer.transform(x_test)

In [7]:
#Initialize a PassiveAggressiveClassifier with TfIdf
model_pat=PassiveAggressiveClassifier(max_iter=50)
model_pat.fit(tfidf_train,y_train)

#Predict on the test set and calculate accuracy
y_pred=model_pat.predict(tfidf_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

#Build confusion matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

Accuracy: 94.0%


array([[592,  36],
       [ 40, 599]], dtype=int64)

In [8]:
#Initialize a PassiveAggressiveClassifier with Count
model_pac=PassiveAggressiveClassifier(max_iter=50)
model_pac.fit(count_train,y_train)

#Predict on the test set and calculate accuracy
y_pred=model_pac.predict(count_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

#Build confusion matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

Accuracy: 90.53%


array([[561,  67],
       [ 53, 586]], dtype=int64)

In [9]:
#Initialize a MultinomialNB with TfIdf
model_mnt=MultinomialNB()
model_mnt.fit(tfidf_train,y_train)

#Predict on the test set and calculate accuracy
y_pred=model_mnt.predict(tfidf_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

#Build confusion matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

Accuracy: 84.53%


array([[443, 185],
       [ 11, 628]], dtype=int64)

In [10]:
#Initialize a MultinomailNB with Count
model_mnc=MultinomialNB()
model_mnc.fit(count_train,y_train)

#Predict on the test set and calculate accuracy
y_pred=model_mnc.predict(count_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

#Build confusion matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

Accuracy: 89.9%


array([[544,  84],
       [ 44, 595]], dtype=int64)

In [11]:
#Initialize a LogisticRegressionCV with TfIdf
model_lrt=LogisticRegressionCV(cv=5, random_state=42, max_iter=300)
model_lrt.fit(tfidf_train,y_train)

#Predict on the test set and calculate accuracy
y_pred=model_lrt.predict(tfidf_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

#Build confusion matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

Accuracy: 93.61%


array([[592,  36],
       [ 45, 594]], dtype=int64)

In [12]:
#Initialize a LogisticRegression with count
model_lrt=LogisticRegressionCV(cv=5,random_state=42, max_iter=300)
model_lrt.fit(count_train,y_train)

#Predict on the test set and calculate accuracy
y_pred=model_lrt.predict(count_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

#Build confusion matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

Accuracy: 92.19%


array([[585,  43],
       [ 56, 583]], dtype=int64)

# We Get Best Result From Passive Aggresive Classifier

In [13]:
joblib.dump(model_pat, 'save_model.pkl')
joblib.dump(tfidf_vectorizer, 'save_vectorizer.pkl')

['save_vectorizer.pkl']

In [14]:
# load the model from disk
loaded_model = joblib.load('save_model.pkl')
loaded_vectorizer = joblib.load('save_vectorizer.pkl')

In [1]:
#testing the model on real world news examples 
#first test case
text=['Joe Biden Calls Trump Supporters "Dregs of Society".']
prepared_test=loaded_vectorizer.transform(text)
result = loaded_model.predict(prepared_test)
print(result)

#second test case
text=['CJI NV Ramana said the Supreme Court has been portrayed as the villain after it questioned the Delhi government on Thursday regarding schools functioning amid pollution. A day after the Delhi government suspended physical classes in the schools of the capital owing to the increasing levels of pollution, the Supreme Court on Friday clarified that it did not ask the government to shut schools. It only asked the reasons behind the change in the stance of the government, the Chief Justice of India-led bench said as it continued hearing the case. The Apex Court on Thursday gave the Delhi government a 24-hour deadline to place some concrete steps against pollution. As the government listed the decisions it took, including that of the suspension of physical classes, the Supreme Court gave go-ahead to the Delhi government for the construction works of hospitals.']
prepared_test=loaded_vectorizer.transform(text)
result = loaded_model.predict(prepared_test)
print(result)

NameError: name 'loaded_vectorizer' is not defined