In [59]:
# import all necessary libraries

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2, SelectKBest
import gensim
from sklearn.metrics import classification_report
from sklearn.svm import SVC
import spacy
from collections import defaultdict
import tensorflow_hub as hub



In [28]:
# load the data

df_fake = pd.read_csv('data/Fake.csv')
df_real = pd.read_csv('data/True.csv')

# label the data, 1 for fake news, 0 for real news
df_fake['label'] = 1
df_real['label'] = 0

# combine the data
df = pd.concat([df_fake, df_real], axis=0).reset_index(drop=True)

# drop useless columns (only the title and the label are used below)
df = df.drop(['subject', 'date', 'text'], axis=1)

# check the data validity
print(df.head())
print(df.isnull().sum())

# load another test data
df_fnn = pd.read_csv('data/FakeNewsNet.csv')
df_fnn = df_fnn.drop(['news_url', 'source_domain', 'tweet_num'], axis=1)

# Convert the 'real' column into a 'label' column with the SAME polarity as df.
# FakeNewsNet marks real news with real == 1, while this notebook uses
# label == 1 for fake news, so a plain rename would invert every label of the
# external test set. NOTE(review): confirm the 1 = real encoding against the
# dataset description of the CSV actually used.
df_fnn['label'] = 1 - df_fnn['real']
df_fnn = df_fnn.drop('real', axis=1)

print(df_fnn.isnull().sum())


                                               title  label
0   Donald Trump Sends Out Embarrassing New Year’...      1
1   Drunk Bragging Trump Staffer Started Russian ...      1
2   Sheriff David Clarke Becomes An Internet Joke...      1
3   Trump Is So Obsessed He Even Has Obama’s Name...      1
4   Pope Francis Just Called Out Donald Trump Dur...      1
title    0
label    0
dtype: int64
title    0
label    0
dtype: int64


In [29]:
# data preprocessing

# NLTK resources used by preprocess_text (the pipeline without NER substitution)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# customize NER
# the rule-based entity ruler is inserted in front of the pipeline's "ner" component
nlp = spacy.load("en_core_web_sm")
ruler = nlp.add_pipe("entity_ruler", before="ner")

# add patterns for donald trump, hillary clinton, barack obama, joe biden, democrat, republican
# every pattern matches on the lowercased token text ("LOWER"); note that a bare
# first name ("donald", "hillary", "barack", "joe") is tagged PERSON on its own too
patterns = [
    {"label": "PERSON", "pattern": [{"LOWER": "donald"}, {"LOWER": "trump"}], "id": "donald_trump"},
    {"label": "PERSON", "pattern": [{"LOWER": "trump"}], "id": "donald_trump"},
    {"label": "PERSON", "pattern": [{"LOWER": "donald"}], "id": "donald_trump"},
    {"label": "PERSON", "pattern": [{"LOWER": "hillary"}, {"LOWER": "clinton"}], "id": "hillary_clinton"},
    {"label": "PERSON", "pattern": [{"LOWER": "clinton"}], "id": "hillary_clinton"},
    {"label": "PERSON", "pattern": [{"LOWER": "hillary"}], "id": "hillary_clinton"},
    {"label": "PERSON", "pattern": [{"LOWER": "barack"}, {"LOWER": "obama"}], "id": "barack_obama"},
    {"label": "PERSON", "pattern": [{"LOWER": "obama"}], "id": "barack_obama"},
    {"label": "PERSON", "pattern": [{"LOWER": "obamas"}], "id": "barack_obama"},
    {"label": "PERSON", "pattern": [{"LOWER": "barack"}], "id": "barack_obama"},
    {"label": "PERSON", "pattern": [{"LOWER": "joe"}, {"LOWER": "biden"}], "id": "joe_biden"},
    {"label": "PERSON", "pattern": [{"LOWER": "biden"}], "id": "joe_biden"},
    {"label": "PERSON", "pattern": [{"LOWER": "joe"}], "id": "joe_biden"},
    {"label": "ORG", "pattern": [{"LOWER": "democrat"}], "id": "democrat"},
    {"label": "ORG", "pattern": [{"LOWER": "democrats"}], "id": "democrat"},
    {"label": "ORG", "pattern": [{"LOWER": "republican"}], "id": "republican"},
    {"label": "ORG", "pattern": [{"LOWER": "republicans"}], "id": "republican"},
    {"label": "ORG", "pattern": [{"LOWER": "gop"}], "id": "republican"},
]
ruler.add_patterns(patterns)

# global lemma -> occurrence count; preprocess_text_with_ner increments it for every
# token it keeps as a lemma (tokens replaced by an entity placeholder are not counted)
counter = defaultdict(int)

def preprocess_text_with_ner(text):
    """Clean a title and replace selected named entities with placeholder tokens.

    The text is lowercased, stripped of URLs, HTML tags, digits and punctuation,
    then run through the spaCy pipeline ``nlp``. Stop words, whitespace tokens and
    tokens of length <= 1 are dropped. Tokens whose entity type is PERSON, ORG,
    DATE, TIME or GPE are emitted as ``<TYPE>``; all other tokens are emitted as
    their lemma.

    Side effect: every lemma that is kept is counted in the module-level
    ``counter`` dictionary.

    Args:
        text: raw title string.

    Returns:
        The processed tokens joined by single spaces (may be an empty string).
    """
    # lowercase
    text = text.lower()
    # remove URLs; the dot after "www" is escaped so that it only matches a literal
    # dot (unescaped it matched any character, e.g. "www " plus the following word)
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # remove numbers
    text = re.sub(r'\d+', '', text)
    # remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # tokenize
    tokens = []
    words = nlp(text)

    # remove stopwords and lemmatize words
    # substitute named entities with their labels for person, organization, date, time, and location
    for word in words:
        if word.is_stop or len(word.text) <= 1 or word.is_space:
            continue
        if word.ent_type_ in ['PERSON', 'ORG', 'DATE', 'TIME', 'GPE']:
            tokens.append("<" + word.ent_type_ + ">")
        else:
            tokens.append(word.lemma_)
            counter[word.lemma_] += 1
    text = ' '.join(tokens)
    return text

def preprocess_text(text):
    """Clean a title for the plain (non-NER) feature pipelines.

    The text is lowercased, stripped of URLs, HTML tags, digits and punctuation,
    tokenized with NLTK, filtered against the English stop-word set
    ``stop_words`` and lemmatized with ``lemmatizer``.

    Args:
        text: raw title string.

    Returns:
        The lemmatized tokens joined by single spaces (may be an empty string).
    """
    # lowercase
    text = text.lower()
    # remove URLs; the dot after "www" is escaped so that it only matches a literal
    # dot (unescaped it matched any character, e.g. "www " plus the following word)
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # remove numbers
    text = re.sub(r'\d+', '', text)
    # remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # tokenize
    words = nltk.word_tokenize(text)
    # remove stopwords and lemmatize words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words if word and word not in stop_words]
    text = ' '.join(lemmatized_words)
    return text

# add both cleaned variants of the title (plain, then NER-substituted) to each data set
for frame in (df, df_fnn):
    frame['title_clean'] = frame['title'].apply(preprocess_text)
    frame['title_clean_ner'] = frame['title'].apply(preprocess_text_with_ner)



In [30]:
# save current df to a file for later use
for frame, csv_path in ((df, 'data/processed_data.csv'), (df_fnn, 'data/processed_data_fnn.csv')):
    frame.to_csv(csv_path, index=False)


In [36]:
# load the processed data and drop rows with empty value
# (e.g. titles whose cleaned text ended up empty are read back as missing values)
df = pd.read_csv('data/processed_data.csv').dropna()
df_fnn = pd.read_csv('data/processed_data_fnn.csv').dropna()


In [37]:
# common function to train and test models
# use method to specify the feature extraction method
def train_and_test(method, X_train, X_test, y_train, y_test, random_state=5664):
    """Fit four classifiers on the training data and print their test metrics.

    The models (logistic regression, decision tree, random forest, SVC) are
    created fresh and fitted on every call. For each one the accuracy (as a
    percentage) and the scikit-learn classification report are printed.

    Args:
        method: name of the feature extraction method; only used in the printout.
        X_train, y_train: training features and labels.
        X_test, y_test: evaluation features and labels.
        random_state: seed for the tree-based models. Because callers invoke this
            function twice with the same training data (once per test set), a
            fixed seed ensures both calls evaluate identically fitted models and
            that results are reproducible across runs.

    Returns:
        None. Results are printed.
    """
    models = [
        LogisticRegression(),
        DecisionTreeClassifier(random_state=random_state),
        RandomForestClassifier(random_state=random_state),
        SVC(),
    ]

    print(f'Feature selection method: {method}')

    for model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"{model.__class__.__name__}: {accuracy*100:.2f}")
        report = classification_report(y_test, y_pred)
        print(report)
        print("-"*30)

In [46]:
# Use Bag of Words and TF-IDF to extract features
def train_and_test_bow(title_column, k):
    """Evaluate chi2-filtered Bag-of-Words features built from ``df[title_column]``.

    An 80/20 split of ``df`` is made; the vocabulary and the top-``k`` chi2
    feature selector are fitted on the training part only. The classifiers are
    then evaluated on the held-out 20% and, in a second ``train_and_test`` call
    (which fits the models again on the same training features), on the same
    column of ``df_fnn``.

    Args:
        title_column: name of the text column present in both ``df`` and ``df_fnn``.
        k: number of features kept by the chi2 selection.
    """
    titles, labels = df[title_column], df['label']
    train_titles, holdout_titles, train_labels, holdout_labels = train_test_split(
        titles, labels, test_size=0.2, random_state=5664
    )

    # Bag of Words counts followed by chi2 selection of the top k features
    bow = CountVectorizer()
    chi2_selector = SelectKBest(chi2, k=k)
    train_features = chi2_selector.fit_transform(bow.fit_transform(train_titles), train_labels)

    def featurize(texts):
        # apply the already fitted vocabulary and feature selection
        return chi2_selector.transform(bow.transform(texts))

    holdout_features = featurize(holdout_titles)

    print(f'For {title_column} (original data):')
    train_and_test('Bag of Words', train_features, holdout_features, train_labels, holdout_labels)

    # evaluate on df_fnn with the same vectorizer and selector
    fnn_features = featurize(df_fnn[title_column])
    fnn_labels = df_fnn['label']

    print(f'For {title_column} (FakeNewsNet data):')
    train_and_test('Bag of Words', train_features, fnn_features, train_labels, fnn_labels)


In [48]:
# TF-IDF
def train_and_test_tfidf(title_column, k):
    """Evaluate chi2-filtered TF-IDF features built from ``df[title_column]``.

    The TF-IDF vectorizer and the top-``k`` chi2 selector are fitted on the
    training part of an 80/20 split of ``df``. Results are printed for the
    held-out part and then for ``df_fnn``; each ``train_and_test`` call fits the
    classifiers on the same training features.

    Args:
        title_column: name of the text column present in both ``df`` and ``df_fnn``.
        k: number of features kept by the chi2 selection.
    """
    split = train_test_split(df[title_column], df['label'], test_size=0.2, random_state=5664)
    train_texts, heldout_texts, train_y, heldout_y = split

    tfidf = TfidfVectorizer()
    train_matrix = tfidf.fit_transform(train_texts)

    # keep only the k features with the highest chi2 score on the training data
    top_k = SelectKBest(chi2, k=k)
    train_matrix = top_k.fit_transform(train_matrix, train_y)

    heldout_matrix = top_k.transform(tfidf.transform(heldout_texts))

    print(f'For {title_column} (original data):')
    train_and_test('TF-IDF', train_matrix, heldout_matrix, train_y, heldout_y)

    # second evaluation: FakeNewsNet titles through the same fitted transformers
    fnn_y = df_fnn['label']
    fnn_matrix = top_k.transform(tfidf.transform(df_fnn[title_column]))

    print(f'For {title_column} (FakeNewsNet data):')
    train_and_test('TF-IDF', train_matrix, fnn_matrix, train_y, fnn_y)



In [51]:
# test with title_clean: Bag of Words first, then TF-IDF, each with k = 100 and k = 300
for run_experiment in (train_and_test_bow, train_and_test_tfidf):
    for top_k in (100, 300):
        run_experiment('title_clean', top_k)

For title_clean (original data):
Feature selection method: Bag of Words
LogisticRegression: 84.52
              precision    recall  f1-score   support

           0       0.78      0.93      0.85      4258
           1       0.93      0.77      0.84      4720

    accuracy                           0.85      8978
   macro avg       0.85      0.85      0.84      8978
weighted avg       0.86      0.85      0.84      8978

------------------------------
DecisionTreeClassifier: 84.25
              precision    recall  f1-score   support

           0       0.78      0.93      0.85      4258
           1       0.93      0.76      0.84      4720

    accuracy                           0.84      8978
   macro avg       0.85      0.85      0.84      8978
weighted avg       0.86      0.84      0.84      8978

------------------------------
RandomForestClassifier: 84.61
              precision    recall  f1-score   support

           0       0.78      0.93      0.85      4258
           1     

In [52]:
# test with title_clean_ner: Bag of Words first, then TF-IDF, each with k = 100 and k = 300
for run_experiment in (train_and_test_bow, train_and_test_tfidf):
    for top_k in (100, 300):
        run_experiment('title_clean_ner', top_k)

For title_clean_ner (original data):
Feature selection method: Bag of Words
LogisticRegression: 84.36
              precision    recall  f1-score   support

           0       0.79      0.91      0.85      4258
           1       0.90      0.79      0.84      4720

    accuracy                           0.84      8978
   macro avg       0.85      0.85      0.84      8978
weighted avg       0.85      0.84      0.84      8978

------------------------------
DecisionTreeClassifier: 83.45
              precision    recall  f1-score   support

           0       0.78      0.90      0.84      4258
           1       0.90      0.78      0.83      4720

    accuracy                           0.83      8978
   macro avg       0.84      0.84      0.83      8978
weighted avg       0.84      0.83      0.83      8978

------------------------------
RandomForestClassifier: 84.16
              precision    recall  f1-score   support

           0       0.79      0.90      0.84      4258
           1 

In [53]:
# use word embeddings to extract features
# NOTE(review): KeyedVectors.load reads files written by gensim's own save();
# presumably this is the 300-d Google News word2vec model re-saved locally — confirm
word2vec_model = gensim.models.KeyedVectors.load('word2vec-google-news-300.bin')

def get_word_vector(word):
    """Return the embedding of ``word`` from ``word2vec_model``.

    Words missing from the model's vocabulary map to an all-zero vector of the
    model's dimensionality.
    """
    try:
        return word2vec_model[word]
    except KeyError:
        # only the out-of-vocabulary case is handled; any other error
        # (e.g. the model not being loaded) should surface instead of being hidden
        return np.zeros(word2vec_model.vector_size)

def get_sentence_vector(sentence):
    """Return the mean of the word vectors of a whitespace-tokenized sentence.

    Args:
        sentence: string of space-separated tokens.

    Returns:
        A 1-D array with one entry per embedding dimension. A sentence without
        any tokens yields a zero vector (averaging an empty list would produce
        NaN, which the classifiers cannot handle).
    """
    words = sentence.split()
    if not words:
        return np.zeros(word2vec_model.vector_size)
    vectors = [get_word_vector(word) for word in words]
    return np.mean(vectors, axis=0)

def train_and_test_word_embedding(title_column):
    """Evaluate averaged word2vec sentence vectors built from ``df[title_column]``.

    The classifiers are fitted on 80% of ``df`` and scored on the remaining
    20%, then fitted again on the same training vectors and scored on the same
    column of ``df_fnn``.

    Args:
        title_column: name of the text column present in both ``df`` and ``df_fnn``.
    """
    def embed(sentences):
        # one mean word vector per sentence, stacked into a 2-D array
        return np.array([get_sentence_vector(sentence) for sentence in sentences])

    train_titles, holdout_titles, train_labels, holdout_labels = train_test_split(
        df[title_column], df['label'], test_size=0.2, random_state=5664
    )

    train_vectors = embed(train_titles)
    holdout_vectors = embed(holdout_titles)

    print(f'For {title_column} (original data):')
    train_and_test('Word Embedding', train_vectors, holdout_vectors, train_labels, holdout_labels)

    # evaluate on df_fnn with the same training vectors
    fnn_vectors = embed(df_fnn[title_column])
    fnn_labels = df_fnn['label']

    print(f'For {title_column} (FakeNewsNet data):')
    train_and_test('Word Embedding', train_vectors, fnn_vectors, train_labels, fnn_labels)
    

In [54]:
# test with title_clean: averaged word2vec vectors of the titles cleaned without NER substitution
train_and_test_word_embedding('title_clean')

For title_clean (original data):
Feature selection method: Word Embedding
LogisticRegression: 90.59
              precision    recall  f1-score   support

           0       0.90      0.91      0.90      4258
           1       0.91      0.91      0.91      4720

    accuracy                           0.91      8978
   macro avg       0.91      0.91      0.91      8978
weighted avg       0.91      0.91      0.91      8978

------------------------------
DecisionTreeClassifier: 80.74
              precision    recall  f1-score   support

           0       0.81      0.77      0.79      4258
           1       0.80      0.84      0.82      4720

    accuracy                           0.81      8978
   macro avg       0.81      0.81      0.81      8978
weighted avg       0.81      0.81      0.81      8978

------------------------------
RandomForestClassifier: 91.18
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      4258
           1   

In [55]:
# test with title_clean_ner: averaged word2vec vectors of the titles with entity placeholders
train_and_test_word_embedding('title_clean_ner')

For title_clean_ner (original data):
Feature selection method: Word Embedding
LogisticRegression: 87.47
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      4258
           1       0.89      0.87      0.88      4720

    accuracy                           0.87      8978
   macro avg       0.87      0.87      0.87      8978
weighted avg       0.87      0.87      0.87      8978

------------------------------
DecisionTreeClassifier: 79.87
              precision    recall  f1-score   support

           0       0.80      0.76      0.78      4258
           1       0.79      0.83      0.81      4720

    accuracy                           0.80      8978
   macro avg       0.80      0.80      0.80      8978
weighted avg       0.80      0.80      0.80      8978

------------------------------
RandomForestClassifier: 89.53
              precision    recall  f1-score   support

           0       0.87      0.92      0.89      4258
           

In [56]:
# use glove word embeddings to extract features
# NOTE(review): KeyedVectors.load reads files written by gensim's own save();
# presumably this is the 200-d GloVe Twitter model re-saved locally — confirm
glove_model = gensim.models.KeyedVectors.load('glove-twitter-200.bin')

def get_glove_vector(word):
    """Return the embedding of ``word`` from ``glove_model``.

    Words missing from the model's vocabulary map to an all-zero vector of the
    model's dimensionality.
    """
    try:
        return glove_model[word]
    except KeyError:
        # only the out-of-vocabulary case is handled; any other error
        # (e.g. the model not being loaded) should surface instead of being hidden
        return np.zeros(glove_model.vector_size)
    
def get_glove_sentence_vector(sentence):
    """Return the mean of the GloVe vectors of a whitespace-tokenized sentence.

    Args:
        sentence: string of space-separated tokens.

    Returns:
        A 1-D array with one entry per embedding dimension. A sentence without
        any tokens yields a zero vector (averaging an empty list would produce
        NaN, which the classifiers cannot handle).
    """
    words = sentence.split()
    if not words:
        return np.zeros(glove_model.vector_size)
    vectors = [get_glove_vector(word) for word in words]
    return np.mean(vectors, axis=0)

def train_and_test_glove_embedding(title_column):
    """Evaluate averaged GloVe sentence vectors built from ``df[title_column]``.

    Uses an 80/20 split of ``df`` for the first evaluation and the same column
    of ``df_fnn`` for the second; each ``train_and_test`` call fits the
    classifiers on the same training vectors.

    Args:
        title_column: name of the text column present in both ``df`` and ``df_fnn``.
    """
    features, target = df[title_column], df['label']
    split = train_test_split(features, target, test_size=0.2, random_state=5664)
    train_sentences, heldout_sentences, train_y, heldout_y = split

    # one averaged GloVe vector per sentence
    train_emb = np.array(list(map(get_glove_sentence_vector, train_sentences)))
    heldout_emb = np.array(list(map(get_glove_sentence_vector, heldout_sentences)))

    print(f'For {title_column} (original data):')
    train_and_test('Glove Embedding', train_emb, heldout_emb, train_y, heldout_y)

    # second evaluation on the FakeNewsNet titles
    fnn_y = df_fnn['label']
    fnn_emb = np.array(list(map(get_glove_sentence_vector, df_fnn[title_column])))

    print(f'For {title_column} (FakeNewsNet data):')
    train_and_test('Glove Embedding', train_emb, fnn_emb, train_y, fnn_y)


In [57]:
# test with title_clean: averaged GloVe vectors of the titles cleaned without NER substitution
train_and_test_glove_embedding('title_clean')

For title_clean (original data):
Feature selection method: Glove Embedding
LogisticRegression: 90.05
              precision    recall  f1-score   support

           0       0.89      0.90      0.90      4258
           1       0.91      0.90      0.91      4720

    accuracy                           0.90      8978
   macro avg       0.90      0.90      0.90      8978
weighted avg       0.90      0.90      0.90      8978

------------------------------
DecisionTreeClassifier: 82.35
              precision    recall  f1-score   support

           0       0.83      0.79      0.81      4258
           1       0.82      0.85      0.84      4720

    accuracy                           0.82      8978
   macro avg       0.82      0.82      0.82      8978
weighted avg       0.82      0.82      0.82      8978

------------------------------
RandomForestClassifier: 90.90
              precision    recall  f1-score   support

           0       0.89      0.92      0.91      4258
           1  

In [58]:
# test with title_clean_ner: averaged GloVe vectors of the titles with entity placeholders
train_and_test_glove_embedding('title_clean_ner')

For title_clean_ner (original data):
Feature selection method: Glove Embedding
LogisticRegression: 87.45
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      4258
           1       0.88      0.88      0.88      4720

    accuracy                           0.87      8978
   macro avg       0.87      0.87      0.87      8978
weighted avg       0.87      0.87      0.87      8978

------------------------------
DecisionTreeClassifier: 79.36
              precision    recall  f1-score   support

           0       0.80      0.75      0.78      4258
           1       0.79      0.83      0.81      4720

    accuracy                           0.79      8978
   macro avg       0.79      0.79      0.79      8978
weighted avg       0.79      0.79      0.79      8978

------------------------------
RandomForestClassifier: 89.15
              precision    recall  f1-score   support

           0       0.87      0.90      0.89      4258
          

In [61]:
# use Sentence Vectors embeddings to extract features
# NOTE(review): the handle looks like a local directory holding the Universal
# Sentence Encoder module — confirm it exists relative to the notebook
use_model = hub.load('universal-sentence-encoder')

def get_use_vector(sentence):
    """Embed a single sentence with ``use_model`` and return it as a NumPy array."""
    # the model is called on a one-element batch; its first (only) row is the result
    batch_embeddings = use_model([sentence])
    sentence_embedding = batch_embeddings[0]
    return sentence_embedding.numpy()

def train_and_test_use_embedding(title_column):
    """Evaluate sentence-encoder embeddings built from ``df[title_column]``.

    Sentences are embedded one at a time via ``get_use_vector``. The classifiers
    are scored on the held-out 20% of ``df`` and then, after being fitted again
    on the same training embeddings, on the same column of ``df_fnn``.

    Args:
        title_column: name of the text column present in both ``df`` and ``df_fnn``.
    """
    def embed_all(sentences):
        # stack the per-sentence embeddings into a 2-D array
        return np.array([get_use_vector(text) for text in sentences])

    train_titles, holdout_titles, train_labels, holdout_labels = train_test_split(
        df[title_column], df['label'], test_size=0.2, random_state=5664
    )

    train_embeddings = embed_all(train_titles)
    holdout_embeddings = embed_all(holdout_titles)

    print(f'For {title_column} (original data):')
    train_and_test('USE Embedding', train_embeddings, holdout_embeddings, train_labels, holdout_labels)

    # evaluate on df_fnn with the same training embeddings
    fnn_embeddings = embed_all(df_fnn[title_column])
    fnn_labels = df_fnn['label']

    print(f'For {title_column} (FakeNewsNet data):')
    train_and_test('USE Embedding', train_embeddings, fnn_embeddings, train_labels, fnn_labels)
    

In [62]:
# test with title_clean: sentence-encoder embeddings of the titles cleaned without NER substitution
train_and_test_use_embedding('title_clean')

For title_clean (original data):
Feature selection method: USE Embedding
LogisticRegression: 91.15
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      4258
           1       0.92      0.91      0.92      4720

    accuracy                           0.91      8978
   macro avg       0.91      0.91      0.91      8978
weighted avg       0.91      0.91      0.91      8978

------------------------------
DecisionTreeClassifier: 84.65
              precision    recall  f1-score   support

           0       0.86      0.81      0.83      4258
           1       0.84      0.88      0.86      4720

    accuracy                           0.85      8978
   macro avg       0.85      0.84      0.85      8978
weighted avg       0.85      0.85      0.85      8978

------------------------------
RandomForestClassifier: 92.31
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      4258
           1    

In [63]:
# test with title_clean_ner: sentence-encoder embeddings of the titles with entity placeholders
train_and_test_use_embedding('title_clean_ner')

For title_clean_ner (original data):
Feature selection method: USE Embedding
LogisticRegression: 89.27
              precision    recall  f1-score   support

           0       0.88      0.90      0.89      4258
           1       0.91      0.89      0.90      4720

    accuracy                           0.89      8978
   macro avg       0.89      0.89      0.89      8978
weighted avg       0.89      0.89      0.89      8978

------------------------------
DecisionTreeClassifier: 82.90
              precision    recall  f1-score   support

           0       0.83      0.80      0.82      4258
           1       0.83      0.85      0.84      4720

    accuracy                           0.83      8978
   macro avg       0.83      0.83      0.83      8978
weighted avg       0.83      0.83      0.83      8978

------------------------------
RandomForestClassifier: 90.54
              precision    recall  f1-score   support

           0       0.88      0.92      0.90      4258
           1

In [70]:
# use VAD score to extract features

# load NRC VAD lexicon (tab-separated; columns renamed to word / valence / arousal / dominance)
# NOTE(review): read_csv treats the first line as a header by default; if the
# lexicon file has no header row its first entry is lost here — confirm
nrc_vad = pd.read_csv('NRC-VAD-Lexicon.txt', sep='\t')
nrc_vad.columns = ['word', 'valence', 'arousal', 'dominance']

# create dictionary for VAD scores: word -> (valence, arousal, dominance)
# built in a single pass over the columns instead of one .iloc row lookup per
# field, which constructs a new Series on every access
vad_dict = {
    word: (valence, arousal, dominance)
    for word, valence, arousal, dominance in zip(
        nrc_vad['word'], nrc_vad['valence'], nrc_vad['arousal'], nrc_vad['dominance']
    )
}


In [71]:
def _mean_vad_scores(titles, lexicon):
    """Return one (valence, arousal, dominance) mean per title.

    Each title is split on whitespace; only tokens present in ``lexicon``
    (word -> (valence, arousal, dominance)) contribute to the mean. A title
    with no token in the lexicon gets ``(0, 0, 0)``.
    """
    scores = []
    for title in titles:
        hits = [lexicon[token] for token in title.split() if token in lexicon]
        if not hits:
            scores.append((0, 0, 0))
            continue
        hit_count = len(hits)
        # zip(*hits) regroups the triples into three per-dimension sequences
        scores.append(tuple(sum(dimension) / hit_count for dimension in zip(*hits)))
    return scores


# VAD features of the plain cleaned titles
vad_scores_title_original = _mean_vad_scores(df['title_clean'], vad_dict)

df['title_clean_valence'] = [score[0] for score in vad_scores_title_original]
df['title_clean_arousal'] = [score[1] for score in vad_scores_title_original]
df['title_clean_dominance'] = [score[2] for score in vad_scores_title_original]


# VAD features of the titles with entity placeholders
vad_scores_title_ner_original = _mean_vad_scores(df['title_clean_ner'], vad_dict)

df['vad_title_ner_valence'] = [score[0] for score in vad_scores_title_ner_original]
df['vad_title_ner_arousal'] = [score[1] for score in vad_scores_title_ner_original]
df['vad_title_ner_dominance'] = [score[2] for score in vad_scores_title_ner_original]

In [78]:
# apply VAD score to df_fnn

def _mean_vad_scores(titles, lexicon):
    """Return one (valence, arousal, dominance) mean per title.

    Each title is split on whitespace; only tokens present in ``lexicon``
    (word -> (valence, arousal, dominance)) contribute to the mean. A title
    with no token in the lexicon gets ``(0, 0, 0)``.
    """
    scores = []
    for title in titles:
        hits = [lexicon[token] for token in title.split() if token in lexicon]
        if not hits:
            scores.append((0, 0, 0))
            continue
        hit_count = len(hits)
        # zip(*hits) regroups the triples into three per-dimension sequences
        scores.append(tuple(sum(dimension) / hit_count for dimension in zip(*hits)))
    return scores


# VAD features of the plain cleaned titles
vad_scores_title_fnn = _mean_vad_scores(df_fnn['title_clean'], vad_dict)

df_fnn['title_clean_valence'] = [score[0] for score in vad_scores_title_fnn]
df_fnn['title_clean_arousal'] = [score[1] for score in vad_scores_title_fnn]
df_fnn['title_clean_dominance'] = [score[2] for score in vad_scores_title_fnn]


# VAD features of the titles with entity placeholders
vad_scores_title_fnn_ner = _mean_vad_scores(df_fnn['title_clean_ner'], vad_dict)

df_fnn['vad_title_ner_valence'] = [score[0] for score in vad_scores_title_fnn_ner]
df_fnn['vad_title_ner_arousal'] = [score[1] for score in vad_scores_title_fnn_ner]
df_fnn['vad_title_ner_dominance'] = [score[2] for score in vad_scores_title_fnn_ner]


In [74]:
# train and test with VAD score for original data
vad_feature_columns = ['title_clean_valence', 'title_clean_arousal', 'title_clean_dominance']
X, y = df[vad_feature_columns], df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5664)

train_and_test('VAD Score', X_train, X_test, y_train, y_test)

# use fnn data as test data (the training split stays the same)
X_test, y_test = df_fnn[vad_feature_columns], df_fnn['label']

print('For title_clean (FakeNewsNet data):')

train_and_test('VAD Score', X_train, X_test, y_train, y_test)


Feature selection method: VAD Score
LogisticRegression: 64.35
              precision    recall  f1-score   support

           0       0.63      0.59      0.61      4258
           1       0.65      0.69      0.67      4720

    accuracy                           0.64      8978
   macro avg       0.64      0.64      0.64      8978
weighted avg       0.64      0.64      0.64      8978

------------------------------
DecisionTreeClassifier: 65.73
              precision    recall  f1-score   support

           0       0.65      0.60      0.62      4258
           1       0.66      0.71      0.68      4720

    accuracy                           0.66      8978
   macro avg       0.66      0.65      0.65      8978
weighted avg       0.66      0.66      0.66      8978

------------------------------
RandomForestClassifier: 70.35
              precision    recall  f1-score   support

           0       0.70      0.65      0.68      4258
           1       0.71      0.75      0.73      4720

In [79]:
# train and test with VAD score for original data with ner
vad_ner_feature_columns = ['vad_title_ner_valence', 'vad_title_ner_arousal', 'vad_title_ner_dominance']
X, y = df[vad_ner_feature_columns], df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5664)

train_and_test('VAD Score with NER', X_train, X_test, y_train, y_test)

# use fnn data as test data (the training split stays the same)
X_test, y_test = df_fnn[vad_ner_feature_columns], df_fnn['label']

print('For title_clean_ner (FakeNewsNet data):')

train_and_test('VAD Score with NER', X_train, X_test, y_train, y_test)


Feature selection method: VAD Score with NER
LogisticRegression: 63.71
              precision    recall  f1-score   support

           0       0.62      0.59      0.61      4258
           1       0.65      0.68      0.66      4720

    accuracy                           0.64      8978
   macro avg       0.64      0.63      0.63      8978
weighted avg       0.64      0.64      0.64      8978

------------------------------
DecisionTreeClassifier: 65.48
              precision    recall  f1-score   support

           0       0.65      0.60      0.62      4258
           1       0.66      0.70      0.68      4720

    accuracy                           0.65      8978
   macro avg       0.65      0.65      0.65      8978
weighted avg       0.65      0.65      0.65      8978

------------------------------
RandomForestClassifier: 69.74
              precision    recall  f1-score   support

           0       0.69      0.65      0.67      4258
           1       0.70      0.74      0.72 