# Loading the Dataset

In [2]:
# Loading the dataset

import pandas as pd
import json

path = 'datasets/kaggle/fake_or_real_news.csv'

def load_dataset(dataset_path):
    return pd.read_csv(dataset_path, on_bad_lines='skip')


dataset_df= load_dataset(path)

dataset_df.head(10)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
5,6903,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",FAKE
6,7341,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",FAKE
7,95,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,REAL
8,4869,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,REAL
9,2909,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,REAL


# Exploring the dataset

In [3]:
## TODO: How to print this information in a better way? Maybe using markdown

# Typecasting df to list 
dataset_df_list = list(dataset_df) 
  

print("There are " + str(len(dataset_df_list)) + " rows in the dataset being them: ") 
dataset_df.columns

There are 4 rows in the dataset being them: 


Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [4]:
print("There are " + str(len(dataset_df.index)) + " number of rows in the dataset")

There are 6335 number of rows in the dataset


#### By observing the above I can see that there are 4 columns in the dataset, being them: 'Unnamed: 0', 'title', 'text' and 'label'. 
#### I do not know what 'Unnamed: 0' is, but it looks like some kind of ID.
#### This dataset contains 6335 rows.
#### This dataset contains a column called 'label' which classifies the text as either being fake news or real news.

In [5]:
# I can see below that the amount of unique values in 'Unnamed: 0' is the same number of rows in the dataset, with this information
# I can assume it is indeed an unique id.
dataset_df['Unnamed: 0'].nunique()

6335

In [6]:
# so I will rename the column to ID so we can give a better name to it.Renaming 'unnamed: 0' to ID so we can give a name to it.
# dataset_df = dataset_df.rename(columns={"Unnamed: 0": "ID"}).sort_values(by="ID", axis=0)
dataset_df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [7]:
## TODO: FIX

# Creating a word cloud for the news articles
# !pip install wordcloud

from wordcloud import WordCloud
# Start with one review:
text = dataset_df.text[0]

# Create and generate a word cloud image:
# wordcloud = WordCloud().generate(text)

# # texts = " ".join(text for text in dataset_df["text"])

# wc = WordCloud(collocations=False, background_color='white', width=2048, height=1080).generate(texts)
# plt.imshow(wc, interpolation='bilinear')
# plt.axis("off")
# plt.show()

#### By alanysing the sorted data I can see that we have also sorted the real news from the fake news.

# Classification Model

In [8]:
# Setting the features we are going to use to train our model: the text, title and label columns
texts = dataset_df['text'].values
titles = dataset_df['title'].values
labels = dataset_df['label'].values

In [9]:
# Training the model on the text articles in the dataset
from sklearn.feature_extraction.text import CountVectorizer

# Converting to vectors
def vectorize_data(data):
    vectorizer = CountVectorizer()
    vectors = vectorizer.fit_transform(data)
    return vectors;

vectorized_texts = vectorize_data(texts)

In [10]:
# Split the data into a training set and a test set using the vectors, using 50% of the data for the test size
from sklearn.model_selection import train_test_split

X_train_texts, X_test_texts, y_train_texts, y_test_texts = train_test_split(vectorized_texts, labels, test_size=0.50, random_state=42)

In [29]:
# Fake news classifier using sklearn and Multinomial Naives Bayern

# Train the model
from sklearn.naive_bayes import MultinomialNB

def train(X_train, y_train):
    model = MultinomialNB()
    model.fit(X_train, y_train)

    return model;


model  = train(X_train_texts, y_train_texts);


# Testing the model using the test data
y_pred = model.predict(X_test_texts)

results_texts['prediction'] = y_pred ## Adding the prediction in a new column so we can compare

results_texts.head(100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_texts['prediction'] = y_pred ## Adding the prediction in a new column so we can compare


Unnamed: 0.1,Unnamed: 0,title,text,label,prediction
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,FAKE
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,REAL
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,FAKE
...,...,...,...,...,...
95,8290,The Mandela Effect was made by one overlooked ...,"link There is simply no more denying, for mill...",FAKE,REAL
96,6227,CNN: One voter can make a difference by voting...,Channel list \nFollowing hurricane Matthew's f...,FAKE,REAL
97,3255,Give Social Security recipients a CEO-style raise,(CNN) On Veterans Day we recognize and honor t...,REAL,REAL
98,3177,"Fireworks erupt between Trump and Bush, Rubio ...",Sparks flew at the toughest and liveliest GOP ...,REAL,FAKE


In [30]:
## Now training in the model using the title column
vectorized_titles = vectorize_data(titles)

# Splitting the data
X_train_titles, X_test_titles, y_train_titles, y_test_titles = train_test_split(vectorized_titles, labels, test_size=0.33, random_state=42)

# Training the model
model = train(X_train_titles, y_train_titles)

# Testing the trained model against the test data
y_pred_titles =  model.predict(X_test_titles)

## Adding the prediction in a new column so we can compare
results_titles['prediction'] = y_pred_titles
results_titles.head(100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_titles['prediction'] = y_pred_titles ## Adding the prediction in a new column so we can compare


Unnamed: 0.1,Unnamed: 0,title,text,label,prediction
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,FAKE
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,FAKE
...,...,...,...,...,...
95,8290,The Mandela Effect was made by one overlooked ...,"link There is simply no more denying, for mill...",FAKE,FAKE
96,6227,CNN: One voter can make a difference by voting...,Channel list \nFollowing hurricane Matthew's f...,FAKE,REAL
97,3255,Give Social Security recipients a CEO-style raise,(CNN) On Veterans Day we recognize and honor t...,REAL,REAL
98,3177,"Fireworks erupt between Trump and Bush, Rubio ...",Sparks flew at the toughest and liveliest GOP ...,REAL,FAKE


# Model accuracy evaluation

In [37]:
## TODO: FIX
# Evaluating the accuracy of the model comparing the test and prediction data
import numpy as np

def calculate_accuracy(X, y):
    # accuracy = model.score(y_pred, y_test)
    model.score(X, y)
    return accuracy
    
def calculate_accuracy_mean(X, y):
    accuracy = np.mean(X == y)
    return accuracy
    
print("Text fake news classification using Multinomial Naives Bayes:" + str(calculate_accuracy_mean(y_test_texts, y_pred_texts)))
print("Titles fake news classification using Multinomial Naives Bayes:" + str(calculate_accuracy_mean(y_test_titles, y_pred_titles)))

  accuracy = np.mean(X == y)


ValueError: Unable to coerce to Series, length must be 4: given 3168

### By analysing the above we can see that the model trained on top of a dataset with more words in it (the text column) has more precision.