# 0.4.0 Building a "fake news" classifier

In [1]:
%load_ext autoreload
%autoreload 2

In [21]:
import pandas as pd
from matplotlib import pyplot as plt
import re

from nltk import pos_tag, ne_chunk_sents
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
from gensim.corpora.dictionary import Dictionary

from gensim.models.tfidfmodel import TfidfModel

import spacy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# from polyglot.text import Text

import itertools
from collections import defaultdict

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import warnings
warnings.filterwarnings("ignore")

In [3]:
import sys
sys.path.append("../") 

from utils.info import article_m3, article_f
import utils.paths as path
from utils.paths2 import direcciones, direcciones_cursos

## 0.4.2 Which possible features?

Which of the following are possible features for a text classification problem?

R:/ All of the above.

## 0.4.3 Training and testing

What datasets are needed for supervised learning?

R:/ Both training and testing data.

## 0.4.5 CountVectorizer for text classification

In [4]:
df = pd.read_csv('../data/raw/fake_or_real_news.csv', sep = ',', decimal = '.', header = 0, encoding = 'utf-8')
df = df.loc[:,['title','text','label']]
df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
# Create a series to store the labels: y
y = df.label
y

0       FAKE
1       FAKE
2       REAL
3       FAKE
4       REAL
5       FAKE
6       FAKE
7       REAL
8       REAL
9       REAL
10      REAL
11      REAL
12      FAKE
13      FAKE
14      REAL
15      REAL
16      FAKE
17      FAKE
18      REAL
19      REAL
20      REAL
21      FAKE
22      REAL
23      REAL
24      FAKE
25      REAL
26      REAL
27      REAL
28      REAL
29      FAKE
30      REAL
31      FAKE
32      FAKE
33      FAKE
34      FAKE
35      FAKE
36      FAKE
37      FAKE
38      FAKE
39      FAKE
40      REAL
41      REAL
42      FAKE
43      FAKE
44      REAL
45      FAKE
46      REAL
47      REAL
48      FAKE
49      FAKE
50      REAL
51      REAL
52      FAKE
53      FAKE
54      FAKE
55      REAL
56      FAKE
57      REAL
58      FAKE
59      REAL
60      FAKE
61      REAL
62      REAL
63      FAKE
64      REAL
65      REAL
66      REAL
67      REAL
68      REAL
69      FAKE
70      REAL
71      FAKE
72      REAL
73      REAL
74      FAKE
75      REAL
76      REAL

In [35]:
# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['text'],y,test_size=0.33,random_state=53)

In [36]:
# Initialize a CountVectorizer object: count_vectorizer
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer

In [37]:
# Transform the training data using only the 'text' column values: count_train 
count_train = count_vectorizer.fit_transform(X_train)
count_train

<4244x56922 sparse matrix of type '<class 'numpy.int64'>'
	with 1119820 stored elements in Compressed Sparse Row format>

In [38]:
# Transform the test data using only the 'text' column values: count_test 
count_test = count_vectorizer.transform(X_test)
count_test

<2091x56922 sparse matrix of type '<class 'numpy.int64'>'
	with 533697 stored elements in Compressed Sparse Row format>

In [10]:
# Print the first 10 features of the count_vectorizer
print(count_vectorizer.get_feature_names_out()[:10])

['00' '000' '0000' '00000031' '000035' '00006' '0001' '0001pt' '000ft'
 '000km']


## 0.4.6 TfidfVectorizer for text classification

In [11]:
# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['text'],y,test_size=0.30,random_state=53)

In [12]:
# Initialize a TfidfVectorizer object: tfidf_vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_vectorizer

In [46]:
# Transform the training data: tfidf_train 
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_train

<4244x56922 sparse matrix of type '<class 'numpy.float64'>'
	with 1119820 stored elements in Compressed Sparse Row format>

In [47]:
# Transform the test data: tfidf_test 
tfidf_test = tfidf_vectorizer.transform(X_test)
tfidf_test

<2091x56922 sparse matrix of type '<class 'numpy.float64'>'
	with 533697 stored elements in Compressed Sparse Row format>

In [15]:
# Print the first 10 features
print(tfidf_vectorizer.get_feature_names_out()[:10])

['00' '000' '0000' '00000031' '000035' '00006' '0001' '0001pt'
 '000billion' '000ft']


In [16]:
# Print the first 5 vectors of the tfidf training data
print(tfidf_train.A[:5])

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.04169599 0.         ... 0.         0.         0.        ]
 [0.         0.03144782 0.         ... 0.         0.         0.        ]
 [0.         0.01437699 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


## 0.4.7 Inspecting the vectors

In [17]:
# Create the CountVectorizer DataFrame: count_df
count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names_out())
# count_df.head()

In [18]:
# Create the TfidfVectorizer DataFrame: tfidf_df
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names_out())
# tfidf_df.head()

In [19]:
# Calculate the difference in columns: difference
difference = set(count_df.columns) - set(tfidf_df.columns)
print(difference)

set()


In [20]:
# Check whether the DataFrames are equal
print(count_df.equals(tfidf_df))

False


## 0.4.9 Text classification models

Which of the below is the most reasonable model to use when training a new supervised model using text vector data?

R:/ Naive Bayes

## 0.4.10 Training and testing the "fake news" model with CountVectorizer

In [39]:
# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

In [40]:
# Fit the classifier to the training data
nb_classifier.fit(count_train, y_train)

In [41]:
# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)

In [42]:
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test,pred)
print(score)

0.893352462936394


In [43]:
# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test,pred,labels=['FAKE', 'REAL'])
print(cm)

[[ 865  143]
 [  80 1003]]


## 0.4.11 Training and testing the "fake news" model with TfidfVectorizer

In [44]:
# Create a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

In [48]:
# Fit the classifier to the training data
nb_classifier.fit(tfidf_train, y_train)

In [49]:
# Create the predicted tags: pred
pred = nb_classifier.predict(tfidf_test)

In [50]:
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test,pred)
print(score)

0.8565279770444764


In [51]:
# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test,pred,labels=['FAKE', 'REAL'])
print(cm)

[[ 739  269]
 [  31 1052]]
