Detecting Deceit - Training the model to predict when there is fake news on social media

In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import pickle


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

Reading in the raw data

In [2]:
# Load the raw news dataset from the Excel workbook into a DataFrame.
news_df = pd.read_excel('../data/raw/News_Detection_data.xlsx')

Removing any records that contain missing values from the dataset and resetting the index for further preprocessing.

In [3]:
# Drop every row that contains a missing value, then renumber the remaining
# rows from 0 so the index is contiguous again.
news_df = news_df.dropna().reset_index(drop=True)

Removing numbers, parentheses and mis-encoded characters from the titles

In [4]:
# Strip digits from the titles. The pattern is a raw string so that `\d`
# reaches the regex engine untouched instead of being parsed as an invalid
# Python string escape (which raises a warning on current Python versions).
news_df['Title'] = news_df['Title'].str.replace(r'\d+', '', regex=True)
# Remove opening and closing parentheses.
news_df['Title'] = news_df['Title'].str.replace(r'\(|\)', '', regex=True)
# Remove two character sequences that look like leftovers of mis-decoded text.
news_df['Title'] = news_df['Title'].str.replace("â€™", "", regex=True)
news_df['Title'] = news_df['Title'].str.replace("Ã¢", "", regex=True)


Function used to remove punctuation from the tweets, followed by applying the function to the text

In [5]:
import string 

def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed."""
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

# Run the punctuation filter over every title.
news_df['Title'] = news_df['Title'].apply(remove_punctuation)

Using a function to tokenize the text then applying the function to the data and creating a new series in the dataframe

In [6]:
import re

def tokenize_words(text):
    """Split ``text`` into a list of word tokens.

    The text is split on runs of non-word characters. ``re.split`` yields an
    empty string when the text starts or ends with a separator (and for empty
    input); those empty strings are dropped so only real tokens are returned.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    list of str
        The tokens in their original order; empty when ``text`` has no word
        characters.
    """
    return [word for word in re.split(r'\W+', text) if word]


# Tokenize every cleaned title and keep the token lists in a new column.
news_df['Title_Tokenized'] = news_df['Title'].apply(tokenize_words)

Lowering the case of the characters for the purposes of removing the stopwords due to case sensitivity. 

In [7]:
# Lower-case every token in every row. The column is assigned in one step
# rather than written row by row through chained indexing, which pandas warns
# about (SettingWithCopyWarning) and which does not modify the frame when
# Copy-on-Write is enabled.
news_df['Title_Tokenized'] = news_df['Title_Tokenized'].apply(
    lambda tokens: [token.lower() for token in tokens]
)

Removal of Stop Words and completing text lemmatization

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
# WordNetLemmatizer reads the WordNet corpus, so fetch it too; otherwise a
# fresh environment raises LookupError on the first lemmatize() call.
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemma_word_list = []
wordnet_lemmatizer = WordNetLemmatizer()

for tokenized_data in news_df['Title_Tokenized']:
    # Drop English stop words (compared case-insensitively).
    filtered_data = [x for x in tokenized_data if x.lower() not in stop_words]

    lemma_words = []
    for w in filtered_data:
        # Lemmatize as a noun, then a verb, then an adjective, feeding the
        # result of each pass into the next one.
        lemma = w
        for pos in ("n", "v", "a"):
            lemma = wordnet_lemmatizer.lemmatize(lemma, pos=pos)
        lemma_words.append(lemma)
    lemma_word_list.append(lemma_words)


[nltk_data] Downloading package stopwords to C:\Users\User
[nltk_data]     1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Attach the lemmatized token lists as a new column, one list per row.
news_df["Title_Text"] = lemma_word_list

Removing the columns that will not be part of the model

In [10]:
# Keep only the columns the model needs: the intermediate token column and the
# original title text are no longer required.
news_df = news_df.drop(columns=['Title_Tokenized', 'Title'])

One-hot encoding (OHE) of the label that marks whether the tweet from GossipCop is real or fake news: 1 identifies real news and 0 identifies fake news.

In [11]:
# Encode the label as a single indicator column. drop_first=True discards the
# indicator of the first label value, leaving one column that marks the other
# value (assumes exactly two distinct labels -- TODO confirm).
# dtype=int keeps the result as integer 0/1 on every pandas version; newer
# releases would otherwise produce booleans.
news_df['Label'] = pd.get_dummies(news_df['Label'], drop_first=True, dtype=int)

Creating a copy of the dataframe and renaming it for the training and testing of the model. Also, saving a copy of the dataframe that will be used. 

In [12]:
# Take an independent copy so later changes to either frame cannot leak into
# the other; plain assignment would only create a second name for the same object.
Processed_News_Data = news_df.copy()

In [13]:
# Persist the processed frame as an Excel workbook, without the index column.
Processed_News_Data.to_excel("../data/processed/Processed_News_Data.xlsx", index=False)

Splitting the data for training with 70% of the data, and testing will be completed with 30%. Saving the separated training and testing data as their own datasets. 

In [14]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split
# repeatable.
# NOTE(review): the split is not stratified on 'Label' -- consider
# stratify=Processed_News_Data['Label'] if the classes are imbalanced.
train_df, test_df = train_test_split(Processed_News_Data, test_size=0.3, random_state=101)

# Save both partitions as CSV files without the index column.
# NOTE(review): these are processed splits written under data/raw -- confirm
# that location is intended.
train_df.to_csv('../data/raw/train.csv', index=False)
test_df.to_csv('../data/raw/test.csv', index=False)

Bringing the training data into the notebook for the purposes of training the model.

In [15]:
# Reload the training partition from the CSV written by the split step.
# NOTE(review): 'Title_Text' held Python lists before saving; after the CSV
# round trip it is presumably plain text -- confirm before relying on its type.
training_data = pd.read_csv('../data/raw/train.csv')

Creating the TfidfVectorizer instance, then fitting it, which learns what is needed to convert the text to a numerical format the model can use. The transform step then creates the TF-IDF matrix based on what was learned in the fit.

In [16]:
# Vectorizer with default settings; fit_transform learns from the training
# text and returns the training feature matrix in a single call.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(training_data['Title_Text'])

In [17]:
# Serialize the fitted vectorizer so the same transformation can be reloaded
# later.
filename = "../models/tf_processor.pkl"
with open(filename, mode='wb') as pickle_out:
    pickle.dump(tfidf_vectorizer, pickle_out)

In [18]:
# Target vector for training: the encoded 'Label' column.
y_train = training_data['Label']

Parameters for SVM and Random Forest

In [19]:
# Base estimators. class_weight='balanced' reweights the classes inversely to
# their frequency. random_state is fixed (same seed as the train/test split)
# so repeated runs give the same fitted models: the random forest is
# stochastic, and so is the SVC's probability calibration.
svm_model = SVC(class_weight='balanced', probability=True, random_state=101)
# Hyper-parameter values the SVM grid search will try.
svm_param_grid = {
    "C":[5,10,20],
    "kernel":['linear']
}

rf_model = RandomForestClassifier(class_weight='balanced', random_state=101)
# Hyper-parameter values the random-forest grid search will try.
rf_param_grid = {
    "n_estimators":[10,25,50],
    "max_depth":[None],
    "min_samples_split":[2,5,10],
    "min_samples_leaf":[1,2,4]
}

In [20]:
# SUPPORT VECTOR MACHINE, RANDOM FOREST WITH GRID SEARCH
# Each search uses 7-fold cross-validation and all available cores.

svm_grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=svm_param_grid,
    cv=7,
    n_jobs=-1,
)
rf_grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_param_grid,
    cv=7,
    n_jobs=-1,
)


In [21]:
# Run both grid searches on the TF-IDF training matrix and the training labels.
svm_grid_search.fit(X_train_tfidf ,y_train)   
rf_grid_search.fit(X_train_tfidf ,y_train)   

In [22]:
# Keep the best estimator found by each grid search.
svm_best_model = svm_grid_search.best_estimator_
rf_best_model = rf_grid_search.best_estimator_

In [23]:
# Serialize both tuned models, one pickle file per model.
for filename, fitted_model in (
    ("../models/svm_best_model.pkl", svm_best_model),
    ("../models/rf_best_model.pkl", rf_best_model),
):
    with open(filename, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)
      
    

In [24]:
# Display the selected SVM and its hyper-parameters.
svm_best_model

In [25]:
# Display the selected random forest and its hyper-parameters.
rf_best_model

In [26]:
def create_ensemble(X):
    """Combine the tuned SVM and random-forest predictions for ``X``.

    Both module-level models (``svm_best_model`` and ``rf_best_model``)
    predict a label for every row of ``X``. A row is labelled 1 when the two
    predictions sum to at least 1 and 0 otherwise, so with 0/1 labels a row is
    labelled 1 as soon as either model predicts 1.

    Returns a list with one 0/1 entry per row of ``X``.
    """
    svm_votes = svm_best_model.predict(X)
    rf_votes = rf_best_model.predict(X)

    combined = []
    for svm_vote, rf_vote in zip(svm_votes, rf_votes):
        vote_total = svm_vote + rf_vote
        combined.append(1 if vote_total >= 1 else 0)
    return combined

In [27]:
# Score the ensemble on the same rows the models were fitted on, so this
# report describes training fit rather than held-out performance.
ensemble_training_pred = create_ensemble(X_train_tfidf)
print(classification_report(y_train,ensemble_training_pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      3752
           1       0.99      0.99      0.99     11737

    accuracy                           0.99     15489
   macro avg       0.99      0.98      0.98     15489
weighted avg       0.99      0.99      0.99     15489



In [30]:
# Accuracy of the ensemble predictions against the training labels.
accuracy = accuracy_score(y_train, ensemble_training_pred)

print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.99


In [28]:
# Wrap the ensemble's training predictions in a one-column DataFrame.
training_predictions = pd.DataFrame({'Predictions': ensemble_training_pred})

In [29]:
# Save the training predictions as an Excel workbook, without the index column.
training_predictions.to_excel("../results/training_prediction.xlsx", index=False)