## Model created with TF_IDF Vector and Passive Aggressive Classifier Algorithm - 92 percent Accuracy

In [14]:
import pandas as pd
import numpy as np
import itertools
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score 
from sklearn import metrics
import matplotlib.pyplot as plt

In [15]:
df=pd.read_csv('news.csv', index_col=None)

In [16]:
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [17]:
# Use axis = 1 to delete columns and axis = 0 to delete rows
dataset=df.drop("Unnamed: 0",axis=1)

In [18]:
dataset

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...
6330,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [19]:
#output assigned to y
y=dataset["label"]

In [20]:
#using the train_test_split function from the sklearn.model_selection module in scikit-learn
X_train, X_test, y_train, y_test = train_test_split(dataset['text'], y, test_size=0.33, random_state=53)

In [21]:
# Initialize the Hashing Vectorizer;The larger the number, the more granularity you get, but it also increases the size of the vector and the computational cost.
hashing_vectorizer = HashingVectorizer(stop_words='english', n_features=56922)  # You can adjust n_features as needed

# Fit and transform the training data;Unlike fit_transform, transform is used here for both training and test data, as HashingVectorizer doesn’t require fitting.
hashing_train = hashing_vectorizer.transform(X_train)

# Transform the test data
hashing_test = hashing_vectorizer.transform(X_test)


In [12]:
print(hashing_train)
# To print in dense format
print(hashing_train.toarray())

#The values you see from HashingVectorizer are the result of applying a hash function and can include negative values, which can arise due to the way the hash function is designed and the normalization process used. The vector size is fixed and specified by the n_features parameter.

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1093531 stored elements and shape (4244, 10000)>
  Coords	Values
  (1, 1208)	-0.1336306209562122
  (1, 1562)	-0.1336306209562122
  (1, 1838)	0.1336306209562122
  (1, 1856)	-0.1336306209562122
  (1, 1932)	0.2672612419124244
  (1, 2953)	0.1336306209562122
  (1, 2976)	0.2672612419124244
  (1, 3488)	-0.1336306209562122
  (1, 3857)	0.1336306209562122
  (1, 4045)	-0.1336306209562122
  (1, 5048)	-0.2672612419124244
  (1, 5292)	-0.1336306209562122
  (1, 5335)	-0.2672612419124244
  (1, 5667)	-0.1336306209562122
  (1, 6070)	0.1336306209562122
  (1, 6410)	-0.1336306209562122
  (1, 6579)	-0.2672612419124244
  (1, 7603)	0.1336306209562122
  (1, 7789)	-0.1336306209562122
  (1, 8002)	-0.2672612419124244
  (1, 9197)	-0.5345224838248488
  (1, 9340)	-0.1336306209562122
  (1, 9559)	-0.1336306209562122
  (2, 197)	-0.3333333333333333
  (2, 1353)	-0.3333333333333333
  :	:
  (4243, 8320)	0.028261670947211076
  (4243, 8415)	0.028261670947211076
  (

In [10]:
#number of unique values via get_feature_names_out is not available for hashing - only for counter and TF_IDF - you get it via features n representation
# len(hashing_vectorizer.get_feature_names_out())

In [22]:
#converting the sparse matrix hashing_train to a dense NumPy array and then printing it
print(hashing_train.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [23]:
# Get the shape of the matrix
print(hashing_train.shape)

(4244, 56922)


In [24]:
# MultinomialNB can't be used here as it cannot handle negative values so we use passive aggresive classifier

# Initialize the PassiveAggressiveClassifier
clf = PassiveAggressiveClassifier()

# Fit the model on the training data
clf.fit(hashing_train, y_train)

# Predict on the test data
pred = clf.predict(hashing_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, pred)
print(f"Accuracy: {accuracy:.4f}")

cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])

Accuracy: 0.9230


In [25]:
from sklearn.metrics import classification_report

report=classification_report(y_test, pred)

In [26]:
print(report)

              precision    recall  f1-score   support

        FAKE       0.91      0.93      0.92      1008
        REAL       0.93      0.92      0.93      1083

    accuracy                           0.92      2091
   macro avg       0.92      0.92      0.92      2091
weighted avg       0.92      0.92      0.92      2091



## Let's test the data 

In [27]:
#taking first row from our dataset
dataset["text"][0]



In [29]:
#we trained model using counter vector modified number - so we cannot provide direct text, we need to provide the input data in counter vector format prior to predication
hashing_train[[0]]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 0 stored elements and shape (1, 56922)>

In [30]:
# Initialize the TF-IDF Vectorizer with stop words removed
hashing_vectorizer = HashingVectorizer(stop_words='english', n_features=56922)

# Fit and transform the training data;Unlike fit_transform, transform is used here for both training and test data, as HashingVectorizer doesn’t require fitting.
hashing_train = hashing_vectorizer.transform(X_train)


In [31]:
X_train[[0]]

0    Daniel Greenfield, a Shillman Journalism Fello...
Name: text, dtype: object

In [33]:
clf.predict(hashing_train[[0]])

array(['FAKE'], dtype='<U4')