In [6]:
# Fake News - Make the required imports  
import numpy as np  
import pandas as pd  
import itertools  
from sklearn.model_selection import train_test_split  
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.linear_model import PassiveAggressiveClassifier  
from sklearn.metrics import accuracy_score, confusion_matrix

In [11]:
# Now we read the data into a DataFrame (df) and get the shape of the data and the first 5 records  

# Read the data  
df = pd.read_csv('news.csv')  

# Get the shape and head of the DataFrame  
# 'shape' returns the number of rows and columns of the df  
# 'head'  returns the header row and the first 5 rows or the specified number of rows
df.shape  
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [16]:
# Fake News - Get the labels of the DataFrame  
# unlike 'shape' or 'head', 'label' is already a part of the df
labels = df.label
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [28]:
# Fake News - Split the dataset into training sets and testing sets  

# train_test_split() is used to split the data into training and testing sets  
# df['test"] is extracting the 'text' column from our DataFrame which has the text data we want to use for training and testing  
# test_size parameter specifies the amount of your data you are allocating to your training set. Here we use 20%. The remaining 80% will be used for training  

x_train, x_test, y_train, y_test = train_test_split(df['text'], labels, test_size = 0.2, random_state = 7)

In [21]:
# Initialize a TfidfVectorizer with stop words from the English language and a maximum document frequency of 0.7  
# (Terms with a higher document frequency will be discarded. The reverse will be true for an inverse document frequency)  
# **Stop words are the most common words in a language that need to be filtered out before processing the natural language data  

# I'll need comments on the fit_transform and transform for better understanding

# Fake News - Intializing a TfidfVectorizer  
tfidf_vectorizer = TfidfVectorizer (stop_words = 'english', max_df = 0.7)  

# Fake News - Fit and transform vectorizer on train set, transform vectorizer on test set  
tfidf_train = tfidf_vectorizer.fit_transform(x_train)  
tfidf_test = tfidf_vectorizer.transform(x_test)  


In [26]:
# Next, intialize a PassiveAggressiveClassifier and fit it on tfidf_train and y_train  

# Fake News - Initializing a PassiveAggressiveClassifier  
pac = PassiveAggressiveClassifier(max_iter = 50)  
pac.fit(tfidf_train, y_train)  

# Fake News - Predict on the test set and calculate accuracy  
y_pred = pac.predict(tfidf_test)  
# print(y_pred)  
score = accuracy_score(y_test, y_pred)  
# print(score)  
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 92.82%


In [27]:
# With an accuracy of 92.82% with this model, 
# let's print out a confusion matrix to gain insight into the number of false and true negatives and positives  

# Fake News - Build confusion matrix***  
confusion_matrix(y_test, y_pred, labels = ['FAKE', 'REAL'])

array([[590,  48],
       [ 43, 586]], dtype=int64)

With this model, we have 590 true positives, 586 false positives, 48 false negatives and 43 false positives