In [16]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import spacy


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
# Load the fake_news data
fake_news = pd.read_csv('../input/fake-and-real-news-dataset/Fake.csv')
fake_news.head()

In [5]:
# Load the real_news data
real_news = pd.read_csv('../input/fake-and-real-news-dataset/True.csv')
real_news.head()

In [6]:
# subjects of fake news
fake_news.subject.unique()

In [7]:
# subjects of real news
real_news.subject.unique()

In [8]:
fake_news.shape

In [9]:
real_news.shape

In [11]:
# Add a categorical target to data to label fake news 1 and real news 0
fake_news['is_fake']=np.ones(fake_news.shape[0])
real_news['is_fake']=np.zeros(real_news.shape[0])

In [12]:
# concatenate fake news and real news along the index axis to generate news dataframe
news = pd.concat([fake_news, real_news], axis=0)

In [13]:
news.shape

In [14]:
# Shuffle news dataframe
news = news.sample(frac=1)
news.head()

In [28]:
news.isnull().sum()

# EDA

In [20]:
# Distribution of classes in target
g = sns.countplot(news.is_fake)
g.set_xticklabels(['Real','Fake'])
g.set_xlabel('News')

In [21]:
(news["is_fake"].value_counts()).plot(kind = "pie", autopct='%1.1f')

52.3% of the news are fake and 47.7% are real. So, the classes are almost balanced. 

In [27]:
# Distribution of subject of news
news['subject'].value_counts().plot(kind = "bar")

# Text Pre-processing

In [29]:
# Preprocess the text with function processtext()

import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
punctuations = list(string.punctuation)
lemma = WordNetLemmatizer() # for Lemmatisation


def processtext(text):
    text=re.sub("[^a-zA-Z]"," ",text) # Filter to allow only alphabets in text
    text=text.lower() # Convert the text to lowercase to maintain consistency
    tokens=word_tokenize(text) # Tokenize the text
    # Remove stopwords and punctuations
    tokens=[token for token in tokens if token not in stop_words and token not in punctuations] 
    tokens=[lemma.lemmatize(token) for token in tokens] # Lemmatisation of tokens
    text=" ".join(tokens)
    return text


In [30]:
# Apply function processtext() to each text and save the new processed text in a new column "processed_text" 
news["processed_text"] = news.text.apply(lambda x: processtext(x))

In [31]:
news["processed_text"].head()

# Word Embeddings using SpaCy

I need to represent the processed text numerically. Instead of using the common approach of a bag of words representations for texts, I will first use word embeddings (or word vectors). SpaCy provides embeddings accessible by a large language model (en_core_web_lg).

In [32]:
# Load the large model to get the vectors
nlp = spacy.load('en_core_web_lg')

In [33]:
# Get the vectors for processed text, each processed text will be represented by a 300-dimensional vector
with nlp.disable_pipes():
    vectors = np.array([nlp(text).vector for text in news["processed_text"]])
    
vectors.shape

In [34]:
# Define X and y in order to fit ML models
X = vectors
y = news['is_fake'].values

from sklearn.model_selection import train_test_split
# split the data (X, y) to train-data and test-data
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25, random_state=0)

from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix

# Machine Learning models

In [35]:
def ML_models_performance(model, X_train, y_train ,X_test ,y_test, y_pred, model_name):
 
    performance_df=pd.DataFrame({'Train_accuracy':model.score(X_train,y_train),"Test_accuracy":model.score(X_test,y_test),
                       "Precision":precision_score(y_pred,y_test),"Recall":recall_score(y_pred,y_test),
                       "F1_Score":f1_score(y_pred,y_test)}, index=[model_name])
    return performance_df

In [36]:
from sklearn.linear_model import LogisticRegression
# Create the LogisticRegression model
lr = LogisticRegression(solver='sag')
# Fit the model
lr.fit(X_train, y_train)
# Use the trained model to predict
y_pred = lr.predict(X_test)
# model accuracy
print(f'Model train accuracy: {lr.score(X_train, y_train)*100:.3f}%')
print(f'Model test accuracy: {lr.score(X_test, y_test)*100:.3f}%')
print(f'Model test precision: {precision_score(y_pred,y_test):.3f}')
print(f'Model test recall: {recall_score(y_pred,y_test):.3f}')
print(f'Model test f1_score: {f1_score(y_pred,y_test):.3f}')

In [37]:
lr_performance = ML_models_performance(lr, X_train, y_train ,X_test ,y_test, y_pred, "Logisitc Regression")
lr_performance

In [39]:
from sklearn.svm import LinearSVC

# Create the LinearSVC model with some regularization
LSVC = LinearSVC(random_state=1, dual=False)
# Fit the model
LSVC.fit(X_train, y_train)
# Use the trained model to predict
y_pred = LSVC.predict(X_test)
# model accuracy
print(f'Model train accuracy: {LSVC.score(X_train, y_train)*100:.3f}%')
print(f'Model test accuracy: {LSVC.score(X_test, y_test)*100:.3f}%')
print(f'Model test precision: {precision_score(y_pred,y_test):.3f}')
print(f'Model test recall: {recall_score(y_pred,y_test):.3f}')
print(f'Model test f1_score: {f1_score(y_pred,y_test):.3f}')

In [40]:
LSVC_performance = ML_models_performance(LSVC, X_train, y_train ,X_test ,y_test, y_pred, "LinearSVC")
LSVC_performance

In [42]:
from sklearn.ensemble import RandomForestClassifier

# Create the RandomForestClassifier model
rfc = RandomForestClassifier(random_state=1)
# Fit the model
rfc.fit(X_train, y_train)
# Use the trained model to predict
y_pred = rfc.predict(X_test)
# model accuracy
print(f'Model train accuracy: {rfc.score(X_train, y_train)*100:.3f}%')
print(f'Model test accuracy: {rfc.score(X_test, y_test)*100:.3f}%')
print(f'Model test precision: {precision_score(y_pred,y_test):.3f}')
print(f'Model test recall: {recall_score(y_pred,y_test):.3f}')
print(f'Model test f1_score: {f1_score(y_pred,y_test):.3f}')

In [43]:
rfc_performance = ML_models_performance(rfc, X_train, y_train ,X_test ,y_test, y_pred, "Random Forest")
rfc_performance

In [44]:
comparison_df = pd.concat([lr_performance, LSVC_performance, rfc_performance])
comparison_df

All 3 ML models give good performance, among them Linear SVC is the best with accuracy 0.96, Precision 0.96, Recall 0.97, and F1_score 0.96. 

# Vectorization of text using TfidfVectorizer

Now I will change the strategy for the the featute engineering: Instead of using the word embeddings (or word vectors), I will use a bag of words representations for texts. In fact, I will convert a collection of text features to a matrix of word counts using TfidfVectorizer.


In [45]:
# text features
X = news["processed_text"]

# convert a collection of text features to a matrix of word counts
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)
X = (tfidf.fit_transform(X)).toarray()

# split the data (X, y) to train-data and test-data
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25, random_state=0)

In [46]:
# Create the LogisticRegression model
lr = LogisticRegression(solver='sag')
# Fit the model
lr.fit(X_train, y_train)
# Use the trained model to predict
y_pred = lr.predict(X_test)
# model accuracy
print(f'Model train accuracy: {lr.score(X_train, y_train)*100:.3f}%')
print(f'Model test accuracy: {lr.score(X_test, y_test)*100:.3f}%')
print(f'Model test precision: {precision_score(y_pred,y_test):.3f}')
print(f'Model test recall: {recall_score(y_pred,y_test):.3f}')
print(f'Model test f1_score: {f1_score(y_pred,y_test):.3f}')

In [47]:
lr_performance = ML_models_performance(lr, X_train, y_train ,X_test ,y_test, y_pred, "Logisitc Regression")
lr_performance

In [48]:
# Create the LinearSVC model
LSVC = LinearSVC(random_state=1, dual=False)
# Fit the model
LSVC.fit(X_train, y_train)
# Use the trained model to predict
y_pred = LSVC.predict(X_test)
# model accuracy
print(f'Model train accuracy: {LSVC.score(X_train, y_train)*100:.3f}%')
print(f'Model test accuracy: {LSVC.score(X_test, y_test)*100:.3f}%')
print(f'Model test precision: {precision_score(y_pred,y_test):.3f}')
print(f'Model test recall: {recall_score(y_pred,y_test):.3f}')
print(f'Model test f1_score: {f1_score(y_pred,y_test):.3f}')

In [49]:
LSVC_performance = ML_models_performance(LSVC, X_train, y_train ,X_test ,y_test, y_pred, "LinearSVC")
LSVC_performance

In [50]:
# Create the RandomForestClassifier model
rfc = RandomForestClassifier(random_state=1)
# Fit the model
rfc.fit(X_train, y_train)
# Use the trained model to predict
y_pred = rfc.predict(X_test)
# model accuracy
print(f'Model train accuracy: {rfc.score(X_train, y_train)*100:.3f}%')
print(f'Model test accuracy: {rfc.score(X_test, y_test)*100:.3f}%')
print(f'Model test precision: {precision_score(y_pred,y_test):.3f}')
print(f'Model test recall: {recall_score(y_pred,y_test):.3f}')
print(f'Model test f1_score: {f1_score(y_pred,y_test):.3f}')

In [51]:
rfc_performance = ML_models_performance(rfc, X_train, y_train ,X_test ,y_test, y_pred, "Random Forest")
rfc_performance

In [52]:
comparison_df = pd.concat([lr_performance, LSVC_performance, rfc_performance])
comparison_df

It seems using the bag of words representation of texts plus using the Random Forest model gives a good performance with accuracy of 0.997, Precision 0.997, Recall 0.998, and and f1_score 0.997. 