In [1]:
import json
import ast
import csv
import random
import pandas as pd
import sklearn
import nltk
import torch
import torchvision
import tensorflow
import re
import string
import numpy as np
import matplotlib.pyplot as plt
import time
import requests
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from gensim import corpora, models
from sklearn.preprocessing import LabelEncoder
from keybert import KeyBERT

## Data

In [2]:
#liar plus data
liar_plus_train = pd.read_csv('Data/Liar_plus/train.tsv', delimiter='\t', header = None, quoting=csv.QUOTE_NONE)
liar_plus_train.columns = ["ID", "Json_File_ID", "Truth_Label", "Statement", "Subject", "Speaker", "Speakers_Job", "State", "Party",
                      "Barely_True_Counts", "False_Counts", "Half_True_Counts", "Mostly_True_Counts", "Pants_On_Fire_Counts", "Context_Venue_Location", "Justification"]
liar_plus_train = liar_plus_train.drop(columns='ID')
liar_plus_train = liar_plus_train.dropna(subset=['Statement'])

In [3]:
liar_plus_train.head()

Unnamed: 0,Json_File_ID,Truth_Label,Statement,Subject,Speaker,Speakers_Job,State,Party,Barely_True_Counts,False_Counts,Half_True_Counts,Mostly_True_Counts,Pants_On_Fire_Counts,Context_Venue_Location,Justification
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,That's a premise that he fails to back up. Ann...
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,"""Surovell said the decline of coal """"started w..."
2,324.json,mostly-true,"""Hillary Clinton agrees with John McCain """"by ...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,"""Obama said he would have voted against the am..."
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,"""The release may have a point that Mikulskis c..."
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,"""Crist said that the economic """"turnaround sta..."


In [4]:
liar_plus_train.dtypes

Json_File_ID               object
Truth_Label                object
Statement                  object
Subject                    object
Speaker                    object
Speakers_Job               object
State                      object
Party                      object
Barely_True_Counts        float64
False_Counts              float64
Half_True_Counts          float64
Mostly_True_Counts        float64
Pants_On_Fire_Counts      float64
Context_Venue_Location     object
Justification              object
dtype: object

In [5]:
#liar plus with additional ratings column
with_ratings = pd.read_csv('Data/politifact_data_combined_prev_ratings.csv')
with_ratings.head()

Unnamed: 0,media,when/where,content,label,speaker,documented_time,percentages,check_nums,summaries,article
0,Instagram posts,"stated on October 28, 2023 in a screenshot sha...",“Haaretz investigation reveals discrepancies i...,FALSE,Madison Czopek,"October 31, 2023",['0%' '0%' '2%' '7%' '67%' '21%'],[ 5 3 16 54 473 152],"['Haaretz, an Israeli newspaper, said on X tha...",A viral Oct. 28 social media post claimed that...
1,Scott Walker,"stated on May 30, 2023 in Interview:",“Wisconsin has historically … and I think larg...,barely-true,Laura Schulte,"October 31, 2023",['12%' '21%' '18%' '19%' '21%' '5%'],[26 45 39 41 44 11],['Although Wisconsin has voted for more Democr...,"In 2016, Wisconsin helped to swing the preside..."
2,Instagram posts,"stated on October 27, 2023 in a post:","“The airport in Salzburg, Austria, has a count...",FALSE,Ciara O'Rourke,"October 30, 2023",['0%' '0%' '2%' '7%' '67%' '21%'],[ 5 3 16 54 473 152],[],A social media post poised to encourage people...
3,Viral image,"stated on October 27, 2023 in an Instagram post:",Video shows Palestinians pretending to be corp...,FALSE,Ciara O'Rourke,"October 30, 2023",['0%' '1%' '2%' '4%' '62%' '28%'],[ 4 13 35 53 745 336],['This video is 10 years old and shows student...,The Gaza Health Ministry has said the Palestin...
4,Facebook posts,"stated on September 25, 2023 in a Facebook post:",The life span of a wind tower generator lasts ...,FALSE,Loreben Tuquero,"October 30, 2023",['0%' '1%' '4%' '9%' '59%' '23%'],[ 24 50 108 247 1519 594],['A study by energy industry experts showed th...,Let’s clear the air. Do wind turbine component...


### n-gram and LDA

In [6]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    return text

In [7]:
def get_ngram_embeddings(text, n, model):
    words = text.split()
    ngrams = [words[i:i + n] for i in range(len(words) - n + 1)]  
    embeddings = [model.wv[gram] for gram in ngrams if all(word in model.wv for word in gram)]
    return embeddings

In [8]:
def get_word_embeddings(tokens, model):
    embeddings = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(embeddings, axis=0) if embeddings else np.zeros(model.vector_size)

In [9]:
#creating ngram model for article headline
liar_plus_train['Statement'] = liar_plus_train['Statement'].apply(preprocess_text)
headline_split = [text.split() for text in liar_plus_train['Statement']] 
model_headline = Word2Vec(headline_split, vector_size=100, window=6, min_count=2)
model_headline.save("word2vec.model_headline")

In [10]:
liar_plus_train['bi-gram-embeddings-statement'] = liar_plus_train['Statement'].apply(lambda x: get_ngram_embeddings(x, 2, model_headline))
liar_plus_train['quad-gram-embeddings-statement'] = liar_plus_train['Statement'].apply(lambda x: get_ngram_embeddings(x, 4, model_headline))

In [11]:
#LDA
documents = liar_plus_train['Statement'].apply(preprocess_text)
texts = [document.split() for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda_model_body = models.LdaModel(corpus, num_topics=30, id2word=dictionary, passes=15)

topic_distributions = [lda_model_body[doc] for doc in corpus]
dominant_topics = []

for topic_dist in topic_distributions:
# Sort topics by probability and get the most likely topic
    dominant_topic = max(topic_dist, key=lambda item: item[1])
    dominant_topics.append(dominant_topic)

# Extract the topic number and probability from the dominant topic
topic_numbers = [topic[0] for topic in dominant_topics]
topic_probabilities = [topic[1] for topic in dominant_topics]

# Add the topic information as new columns
liar_plus_train['DominantTopicBody'] = topic_numbers
liar_plus_train['TopicProbabilityBody'] = topic_probabilities

In [12]:
#tokenizing text columns
liar_plus_train['StatementTokenized'] = liar_plus_train['Statement'].apply(word_tokenize)

#creating word embedding models
StatementTokenizedModel = Word2Vec(liar_plus_train['StatementTokenized'], vector_size=35, window=4, min_count=1, workers=4)
liar_plus_train['embeddings_headline'] = liar_plus_train['StatementTokenized'].apply(lambda x: get_word_embeddings(x, StatementTokenizedModel))

In [13]:
X = liar_plus_train[['embeddings_headline', 'bi-gram-embeddings-statement','quad-gram-embeddings-statement',
                      'DominantTopicBody', 'TopicProbabilityBody', 
                      'Truth_Label', 'Statement']]#, 'subject1text_embeddings']]

# Flatten embedding columns
X['bi-gram-embeddings-statement'] = X['bi-gram-embeddings-statement'].apply(lambda x: x[0] if len(x) > 0 else x)
X['quad-gram-embeddings-statement'] = X['quad-gram-embeddings-statement'].apply(lambda x: x[0] if len(x) > 0 else x)
X['bi-gram-embeddings-statement'] = X['bi-gram-embeddings-statement'].apply(lambda x: x[0] if len(x) > 0 else x)
X['quad-gram-embeddings-statement'] = X['quad-gram-embeddings-statement'].apply(lambda x: x[0] if len(x) > 0 else x)
#X['subject1text_embeddings'] = X['subject1text_embeddings'].apply(lambda x: x[0] if len(x) > 0 else x)

#isolating n-gram values
for i in range(len(X['bi-gram-embeddings-statement'][0])):
    column_name = f'bi-gram-value-{i+1}' 
    X.loc[:, column_name] = X['bi-gram-embeddings-statement'].apply(lambda x: x[i] if i < len(x) else None)
X = X.drop(columns=['bi-gram-embeddings-statement'])

for i in range(len(X['quad-gram-embeddings-statement'][0])):
    column_name = f'quad-gram-value-{i+1}' 
    X.loc[:, column_name] = X['quad-gram-embeddings-statement'].apply(lambda x: x[i] if i < len(x) else None)
X = X.drop(columns=['quad-gram-embeddings-statement'])

for i in range(len(X['embeddings_headline'][0])):
    column_name = f'embeddings-headline-value-{i+1}' 
    X.loc[:, column_name] = X['embeddings_headline'].apply(lambda x: x[i] if i < len(x) else None)
X = X.drop(columns=['embeddings_headline'])

# for i in range(len(X['subject1text_embeddings'][10296])):
#     column_name = f'subject1embedding-{i+1}' 
#     X.loc[:, column_name] = X['subject1text_embeddings'].apply(lambda x: x[i] if i < len(x) else None)
# X = X.drop(columns=['subject1text_embeddings'])

#set up for training
X = X.dropna()
y = X['Truth_Label']
X = X.drop(columns=['Truth_Label'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_Stance = X_train.drop(columns = ['Statement'])
X_test_Stance = X_test.drop(columns = ['Statement'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['bi-gram-embeddings-statement'] = X['bi-gram-embeddings-statement'].apply(lambda x: x[0] if len(x) > 0 else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['quad-gram-embeddings-statement'] = X['quad-gram-embeddings-statement'].apply(lambda x: x[0] if len(x) > 0 else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

In [14]:
names = ["Random Forest"]
classifiers = [RandomForestClassifier(n_estimators=150, max_features=338)]
max_score = 0.0
max_class = ''
# iterate over classifiers
for name, clf in zip(names, classifiers):
    start_time = time.time()
    clf.fit(X_train_Stance, y_train)
    score = 100.0 * clf.score(X_test_Stance, y_test)
    probabilities = clf.predict_proba(X_test_Stance)
    print('Classifier = %s, Score (test, accuracy) = %.2f,' %(name, score), 'Training time = %.2f seconds' % (time.time() - start_time))
    
    if score > max_score:
        clf_best = clf
        max_score = score
        max_class = name
        best_probabilities = probabilities

print(80*'-' )
print('Best --> Classifier = %s, Score (test, accuracy) = %.2f' %(max_class, max_score))
#plot the output of the various algorithms
stance_probabilities = best_probabilities

Classifier = Random Forest, Score (test, accuracy) = 23.18, Training time = 114.91 seconds
--------------------------------------------------------------------------------
Best --> Classifier = Random Forest, Score (test, accuracy) = 23.18


In [15]:
# List of class labels
class_labels = ['barely-true', 'false', 'half-true', 'mostly-true', 'pants-fire', 'true']
predictions = []
# Iterate through each row and find the index of the maximum value
for i, row in enumerate(stance_probabilities):
    max_index = np.argmax(row)
    max_class = class_labels[max_index]
    predictions.append(max_class)

report = classification_report(y_test, predictions)
matrix = confusion_matrix(y_test, predictions)

In [16]:
print(report)

              precision    recall  f1-score   support

 barely-true       0.22      0.25      0.23       293
       false       0.24      0.27      0.26       393
   half-true       0.24      0.27      0.25       446
 mostly-true       0.24      0.31      0.27       378
  pants-fire       0.11      0.03      0.04       155
        true       0.23      0.13      0.16       371

    accuracy                           0.23      2036
   macro avg       0.21      0.21      0.20      2036
weighted avg       0.22      0.23      0.22      2036



Achieved a test score of 23.18% using n-grams and LDA on only the statement provided

## Using Previous Reputation of Speaker

In [17]:
with_ratings = pd.read_csv('Data/politifact_data_combined_prev_ratings.csv')

In [18]:
with_ratings.head()

Unnamed: 0,media,when/where,content,label,speaker,documented_time,percentages,check_nums,summaries,article
0,Instagram posts,"stated on October 28, 2023 in a screenshot sha...",“Haaretz investigation reveals discrepancies i...,FALSE,Madison Czopek,"October 31, 2023",['0%' '0%' '2%' '7%' '67%' '21%'],[ 5 3 16 54 473 152],"['Haaretz, an Israeli newspaper, said on X tha...",A viral Oct. 28 social media post claimed that...
1,Scott Walker,"stated on May 30, 2023 in Interview:",“Wisconsin has historically … and I think larg...,barely-true,Laura Schulte,"October 31, 2023",['12%' '21%' '18%' '19%' '21%' '5%'],[26 45 39 41 44 11],['Although Wisconsin has voted for more Democr...,"In 2016, Wisconsin helped to swing the preside..."
2,Instagram posts,"stated on October 27, 2023 in a post:","“The airport in Salzburg, Austria, has a count...",FALSE,Ciara O'Rourke,"October 30, 2023",['0%' '0%' '2%' '7%' '67%' '21%'],[ 5 3 16 54 473 152],[],A social media post poised to encourage people...
3,Viral image,"stated on October 27, 2023 in an Instagram post:",Video shows Palestinians pretending to be corp...,FALSE,Ciara O'Rourke,"October 30, 2023",['0%' '1%' '2%' '4%' '62%' '28%'],[ 4 13 35 53 745 336],['This video is 10 years old and shows student...,The Gaza Health Ministry has said the Palestin...
4,Facebook posts,"stated on September 25, 2023 in a Facebook post:",The life span of a wind tower generator lasts ...,FALSE,Loreben Tuquero,"October 30, 2023",['0%' '1%' '4%' '9%' '59%' '23%'],[ 24 50 108 247 1519 594],['A study by energy industry experts showed th...,Let’s clear the air. Do wind turbine component...


In [19]:
def extract_ratings(row):
    ratings = [int(x) for x in row['check_nums'].strip('[]').split()]
    return {
        'TRUE_counts': ratings[0],
        'mostly-true_counts': ratings[1],
        'half-true_counts': ratings[2],
        'barely-true_counts': ratings[3],
        'FALSE_counts': ratings[4],
        'pants-fire_counts': ratings[5]
    }

In [20]:
new_columns = with_ratings.apply(extract_ratings, axis=1, result_type='expand')
with_ratings = pd.concat([with_ratings, new_columns], axis=1)
with_ratings = with_ratings.drop('check_nums', axis=1)

In [21]:
def determine_label(row):
    max_column = row[['TRUE_counts', 'mostly-true_counts', 'half-true_counts',
                      'barely-true_counts', 'FALSE_counts', 'pants-fire_counts']].idxmax()
    return max_column.replace('_counts', '')

In [22]:
with_ratings['prediction'] = with_ratings.apply(determine_label, axis=1)

In [23]:
with_ratings.drop(with_ratings[with_ratings['label'] == 'full-flop'].index, inplace=True)
with_ratings.drop(with_ratings[with_ratings['label'] == 'half-flip'].index, inplace=True)
with_ratings.drop(with_ratings[with_ratings['label'] == 'no-flip'].index, inplace=True)

In [24]:
(with_ratings['label'] == with_ratings['prediction']).sum() / with_ratings.shape[0]

0.5102807433768288

In [25]:
report = classification_report(with_ratings['label'], with_ratings['prediction'])
print(report)

              precision    recall  f1-score   support

       FALSE       0.57      0.68      0.62      7285
        TRUE       0.50      0.49      0.49      2911
 barely-true       0.49      0.36      0.42      3961
   half-true       0.46      0.43      0.45      4124
 mostly-true       0.42      0.59      0.49      3772
  pants-fire       0.65      0.32      0.43      3237

    accuracy                           0.51     25290
   macro avg       0.52      0.48      0.48     25290
weighted avg       0.52      0.51      0.50     25290



51% accuracy when just taking most common rating from previous statements

In [26]:
def predict_label(confidence):
    if confidence < 0.166:
        return "pants-fire"
    elif confidence < 0.33:
        return "FALSE"
    elif confidence < 0.5:
        return "barely-true"
    elif confidence < 0.666:
        return "half-true"
    elif confidence < 0.833:
        return "mostly-true" 
    else:
        return "TRUE"

In [27]:
def calculate_confidence(row):
    weights = [1, 0.8, 0.6, 0.4, 0.2, 0]
    total_ratings = row['TRUE_counts'] + row['mostly-true_counts'] + row['half-true_counts'] + row['barely-true_counts'] + row['FALSE_counts'] + row['pants-fire_counts']

    try:
        confidence = sum(row[col] * weight for col, weight in zip(['TRUE_counts', 'mostly-true_counts', 'half-true_counts', 
                                                                   'barely-true_counts', 'FALSE_counts', 'pants-fire_counts'], weights))
        confidence /= total_ratings
        return predict_label(confidence)
    except (ValueError, TypeError, ZeroDivisionError):
        return 'error'

In [28]:
with_ratings['predicted_label_weighted'] = with_ratings.apply(calculate_confidence, axis=1)

In [29]:
(with_ratings['label'] == with_ratings['predicted_label_weighted']).sum() / with_ratings.shape[0]

0.44183471727955714

In [30]:
report = classification_report(with_ratings['label'], with_ratings['predicted_label_weighted'])
print(report)

              precision    recall  f1-score   support

       FALSE       0.63      0.56      0.60      7285
        TRUE       0.83      0.23      0.37      2911
 barely-true       0.32      0.37      0.34      3961
   half-true       0.29      0.60      0.39      4124
 mostly-true       0.44      0.36      0.39      3772
  pants-fire       0.66      0.33      0.44      3237

    accuracy                           0.44     25290
   macro avg       0.53      0.41      0.42     25290
weighted avg       0.52      0.44      0.45     25290



By weighting each percentage value respectively, so that not only the maximum value decided the truth label, we come out to a 44% accuracy.

## Using Perigon News API to Enrich Data

In [31]:
kw_model = KeyBERT()
column_name = 'Truth_Label'
num_samples_per_value = 55

In [None]:
#COMMENT THE FOLLOWING LINE BACK IN IF YOU HAVE YOUR OWN ACCESS TO PERIGON NEWS API REQUESTS
#THIS WILL ALLOW YOU TO CREATE YOUR OWN SUBSET OF THE DATA TO TRAIN AND TEST ON

#equal_random_sample = liar_plus_train.groupby(column_name, group_keys=False).apply(lambda x: x.sample(min(len(x), num_samples_per_value)))

In [32]:
## RUN THIS CELL TO GET DATA I USED FOR PERIGON NEWS API REQUESTS
equal_random_sample = pd.read_csv('Data/equal_random_sample_df.csv')
equal_random_sample = equal_random_sample.drop(columns=['Unnamed: 0'])
equal_random_sample.head(5)

Unnamed: 0,Json_File_ID,Truth_Label,Statement,Subject,Speaker,Speakers_Job,State,Party,Barely_True_Counts,False_Counts,...,Mostly_True_Counts,Pants_On_Fire_Counts,Context_Venue_Location,Justification,bi-gram-embeddings-statement,quad-gram-embeddings-statement,DominantTopicBody,TopicProbabilityBody,StatementTokenized,embeddings_headline
0,2251.json,barely-true,joe sestak even wants to bring back the death...,"message-machine,taxes",pat-toomey,Candidate for U.S. Senate,Pennsylvania,republican,3.0,2.0,...,1.0,0.0,a television commercial,"""Bridenstine said Obama """"spends 30 times as m...","[array([[-1.32675737e-01, 4.76077348e-01, 1....","[array([[-1.32675737e-01, 4.76077348e-01, 1....",8,0.2576,"['joe', 'sestak', 'even', 'wants', 'to', 'brin...",[ 0.11599965 -0.6195692 -1.1695052 0.296791...
1,1785.json,barely-true,sen robert bennett rutah cast votes for tarp t...,"economy,financial-regulation,health-care,pundits",george-will,Columnist,Maryland,columnist,7.0,6.0,...,5.0,1.0,a roundtable discussion on ABC's This Week,"""In some instances, it may come down to the di...","[array([[-0.09808729, 0.7075012 , 0.2448038 ...","[array([[-6.39656931e-02, 2.05633000e-01, 3....",5,0.447285,"['sen', 'robert', 'bennett', 'rutah', 'cast', ...",[-0.09433161 -0.6782185 -1.1311711 0.225834...
2,11975.json,barely-true,the every student succeeds act did away basica...,education,john-mccain,U.S. senator,Arizona,republican,31.0,39.0,...,37.0,8.0,a newspaper interview,"We could downgrade Merkley for his statement, ...","[array([[-5.46731353e-01, 6.02599502e-01, -3....","[array([[ 2.41033826e-02, 8.61556709e-01, 1....",15,0.218869,"['the', 'every', 'student', 'succeeds', 'act',...",[ 8.9300625e-02 -4.8466632e-01 -1.0525886e+00 ...
3,36.json,barely-true,sen clinton said the surge of troops in iraq w...,iraq,john-mccain,U.S. senator,Arizona,republican,31.0,39.0,...,37.0,8.0,a news release,"""Cruz said Trump has """"described Hillary Clint...","[array([[-9.80872884e-02, 7.07501173e-01, 2....","[array([[-9.80872884e-02, 7.07501173e-01, 2....",12,0.391637,"['sen', 'clinton', 'said', 'the', 'surge', 'of...",[ 0.02118887 -0.5304897 -1.2749678 0.133021...
4,10636.json,barely-true,according to the state of florida you are almo...,"animals,corrections-and-updates,crime,guns",florida-students-concealed-carry,,Florida,organization,1.0,0.0,...,0.0,0.0,Florida Senate higher education committee hearing,The GDP is considered a more accurate way to m...,"[array([[-1.01865396e-01, 3.75432402e-01, 4....","[array([[-1.01865396e-01, 3.75432402e-01, 4....",23,0.331951,"['according', 'to', 'the', 'state', 'of', 'flo...",[ 0.21315888 -0.613976 -1.4231781 0.341717...


### ONLY RUN THE NEXT 7 CELLS IF YOU HAVE YOUR OWN PERIGON API KEY AND RESOURCES

In [63]:
#ONLY RUN THIS CELL AND NEXT 6 CELLS IF YOU HAVE YOUR OWN PERIGON API KEY AND RESOURCES
for index, row in equal_random_sample.iterrows():
    statement = row['Statement']

    keywords = kw_model.extract_keywords(statement, top_n=3)
    doc_keywords = ' '.join([keyword for keyword, score in keywords if score > 0.01])

    try:
        tokenized_words = word_tokenize(doc_keywords)
        keyword_1 = tokenized_words[0]
        keyword_2 = tokenized_words[1]
        keyword_3 = tokenized_words[2]
    except IndexError:
        pass

    API_KEY = "89a2692d-ba03-4aee-9a16-ea725f927467"
    url = f"https://api.goperigon.com/v1/headlines?q={keyword_1} AND {keyword_2} AND {keyword_3}&from=2022-01-01&apiKey={API_KEY}"

    resp = requests.get(url)
    cluster_count = 0

    for cluster in resp.json().get('clusters', []):
        if cluster_count >= 3:
            break  # Stop iterating if we've reached the limit
        cluster_count += 1

        hit_count = 0
        for hit in cluster.get('hits', []):
            content_url = hit.get('url', '')
            content_content = hit.get('content', 'content')
            content_author = hit.get('authorsByline', '')
            content_source = hit.get('source', {}).get('domain', '')
            content_summary = hit.get('summary', '')

            equal_random_sample.at[index, f'content_url_{cluster_count}'] = content_url
            equal_random_sample.at[index, f'content_content_{cluster_count}'] = content_content
            equal_random_sample.at[index, f'content_author_{cluster_count}'] = content_author
            equal_random_sample.at[index, f'content_source_{cluster_count}'] = content_source
            equal_random_sample.at[index, f'content_summary_{cluster_count}'] = content_summary


In [10]:
equal_random_sample.fillna('', inplace=True)
shuffled_df = equal_random_sample.sample(frac=1)
shuffled_df.reset_index(drop=True, inplace=True)
equal_random_sample = shuffled_df

In [11]:
#creating ngram model for article headline
equal_random_sample['Statement'] = equal_random_sample['Statement'].apply(preprocess_text)
headline_split = [text.split() for text in equal_random_sample['Statement']] 
model_headline_news = Word2Vec(headline_split, vector_size=100, window=6, min_count=2)
model_headline_news.save("word2vec.model_headline_news")
equal_random_sample['bi-gram-embeddings-statement'] = equal_random_sample['Statement'].apply(lambda x: get_ngram_embeddings(x, 2, model_headline_news))
equal_random_sample['quad-gram-embeddings-statement'] = equal_random_sample['Statement'].apply(lambda x: get_ngram_embeddings(x, 4, model_headline_news))
#creating ngram model for similar article 1 content
equal_random_sample['content_content_1'] = equal_random_sample['content_content_1'].apply(preprocess_text)
headline_split = [text.split() for text in equal_random_sample['content_content_1']] 
model_content_1 = Word2Vec(headline_split, vector_size=100, window=6, min_count=2)
model_content_1.save("word2vec.model_content_1")
equal_random_sample['bi-gram-embeddings-content-1'] = equal_random_sample['content_content_1'].apply(lambda x: get_ngram_embeddings(x, 2, model_content_1))
equal_random_sample['quad-gram-embeddings-content-1'] = equal_random_sample['content_content_1'].apply(lambda x: get_ngram_embeddings(x, 4, model_content_1))
#creating ngram model for similar article 2 content
equal_random_sample['content_content_2'] = equal_random_sample['content_content_2'].apply(preprocess_text)
headline_split = [text.split() for text in equal_random_sample['content_content_2']] 
model_content_2 = Word2Vec(headline_split, vector_size=100, window=6, min_count=2)
model_content_2.save("word2vec.model_content_2")
equal_random_sample['bi-gram-embeddings-content-2'] = equal_random_sample['content_content_2'].apply(lambda x: get_ngram_embeddings(x, 2, model_content_2))
equal_random_sample['quad-gram-embeddings-content-2'] = equal_random_sample['content_content_2'].apply(lambda x: get_ngram_embeddings(x, 4, model_content_2))
#creating ngram model for similar article 3 content
equal_random_sample['content_content_3'] = equal_random_sample['content_content_3'].apply(preprocess_text)
headline_split = [text.split() for text in equal_random_sample['content_content_3']] 
model_content_3 = Word2Vec(headline_split, vector_size=100, window=6, min_count=2)
model_content_3.save("word2vec.model_content_3")
equal_random_sample['bi-gram-embeddings-content-3'] = equal_random_sample['content_content_3'].apply(lambda x: get_ngram_embeddings(x, 2, model_content_3))
equal_random_sample['quad-gram-embeddings-content-3'] = equal_random_sample['content_content_3'].apply(lambda x: get_ngram_embeddings(x, 4, model_content_3))

KeyError: 'content_content_1'

In [None]:
#LDA
documents = equal_random_sample['Statement'].apply(preprocess_text)
texts = [document.split() for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda_model_body = models.LdaModel(corpus, num_topics=100, id2word=dictionary, passes=15)

topic_distributions = [lda_model_body[doc] for doc in corpus]
dominant_topics = []

for topic_dist in topic_distributions:
# Sort topics by probability and get the most likely topic
    dominant_topic = max(topic_dist, key=lambda item: item[1])
    dominant_topics.append(dominant_topic)

# Extract the topic number (0-based index) and probability from the dominant topic
topic_numbers = [topic[0] for topic in dominant_topics]
topic_probabilities = [topic[1] for topic in dominant_topics]

# Add the topic information as new columns in your DataFrame
equal_random_sample['DominantTopicBody'] = topic_numbers
equal_random_sample['TopicProbabilityBody'] = topic_probabilities

In [67]:
#tokenizing text columns
equal_random_sample['StatementTokenized'] = equal_random_sample['Statement'].apply(word_tokenize)
equal_random_sample['Content1Tokenized'] = equal_random_sample['content_content_1'].apply(word_tokenize)
equal_random_sample['Content2Tokenized'] = equal_random_sample['content_content_2'].apply(word_tokenize)
equal_random_sample['Content3Tokenized'] = equal_random_sample['content_content_3'].apply(word_tokenize)

#creating word embedding models
#headline
StatementTokenizedModel = Word2Vec(equal_random_sample['StatementTokenized'], vector_size=35, window=4, min_count=1, workers=4)
equal_random_sample['embeddings_headline'] = equal_random_sample['StatementTokenized'].apply(lambda x: get_word_embeddings(x, StatementTokenizedModel))
#content 1
Content1TokenizedModel = Word2Vec(equal_random_sample['Content1Tokenized'], vector_size=35, window=4, min_count=1, workers=4)
equal_random_sample['embeddings_content1'] = equal_random_sample['Content1Tokenized'].apply(lambda x: get_word_embeddings(x, Content1TokenizedModel))
#content 2
Content2TokenizedModel = Word2Vec(equal_random_sample['Content2Tokenized'], vector_size=35, window=4, min_count=1, workers=4)
equal_random_sample['embeddings_content2'] = equal_random_sample['Content2Tokenized'].apply(lambda x: get_word_embeddings(x, Content2TokenizedModel))
#content 3
Content3TokenizedModel = Word2Vec(equal_random_sample['Content3Tokenized'], vector_size=35, window=4, min_count=1, workers=4)
equal_random_sample['embeddings_content3'] = equal_random_sample['Content3Tokenized'].apply(lambda x: get_word_embeddings(x, Content3TokenizedModel))

In [12]:
def check_list_lengths(row):
    for column in X.columns[:-2]: 
        if len(row[column]) <= 1:
            return False
    return True

In [79]:
X = equal_random_sample[['embeddings_headline', 'bi-gram-embeddings-statement','quad-gram-embeddings-statement',
                       'bi-gram-embeddings-content-1', 'bi-gram-embeddings-content-2', 'bi-gram-embeddings-content-3',
                        'quad-gram-embeddings-content-1', 'quad-gram-embeddings-content-2', 'quad-gram-embeddings-content-3', 
                      'Truth_Label', 'Statement', 'DominantTopicBody', 'TopicProbabilityBody']]
X_copy = X

# Flatten embedding columns
X['bi-gram-embeddings-statement'] = X['bi-gram-embeddings-statement'].apply(lambda x: x[0] if len(x) > 0 else x)
X['quad-gram-embeddings-statement'] = X['quad-gram-embeddings-statement'].apply(lambda x: x[0] if len(x) > 0 else x)
X['bi-gram-embeddings-statement'] = X['bi-gram-embeddings-statement'].apply(lambda x: x[0] if len(x) > 0 else x)
X['quad-gram-embeddings-statement'] = X['quad-gram-embeddings-statement'].apply(lambda x: x[0] if len(x) > 0 else x)

X['bi-gram-embeddings-content-1'] = X['bi-gram-embeddings-content-1'].apply(lambda x: x[0] if len(x) > 0 else x)
X['quad-gram-embeddings-content-1'] = X['quad-gram-embeddings-content-1'].apply(lambda x: x[0] if len(x) > 0 else x)
X['bi-gram-embeddings-content-1'] = X['bi-gram-embeddings-content-1'].apply(lambda x: x[0] if len(x) > 0 else x)
X['quad-gram-embeddings-content-1'] = X['quad-gram-embeddings-content-1'].apply(lambda x: x[0] if len(x) > 0 else x)

X['bi-gram-embeddings-content-2'] = X['bi-gram-embeddings-content-2'].apply(lambda x: x[0] if len(x) > 0 else x)
X['quad-gram-embeddings-content-2'] = X['quad-gram-embeddings-content-2'].apply(lambda x: x[0] if len(x) > 0 else x)
X['bi-gram-embeddings-content-2'] = X['bi-gram-embeddings-content-2'].apply(lambda x: x[0] if len(x) > 0 else x)
X['quad-gram-embeddings-content-2'] = X['quad-gram-embeddings-content-2'].apply(lambda x: x[0] if len(x) > 0 else x)

X['bi-gram-embeddings-content-3'] = X['bi-gram-embeddings-content-3'].apply(lambda x: x[0] if len(x) > 0 else x)
X['quad-gram-embeddings-content-3'] = X['quad-gram-embeddings-content-3'].apply(lambda x: x[0] if len(x) > 0 else x)
X['bi-gram-embeddings-content-3'] = X['bi-gram-embeddings-content-3'].apply(lambda x: x[0] if len(x) > 0 else x)
X['quad-gram-embeddings-content-3'] = X['quad-gram-embeddings-content-3'].apply(lambda x: x[0] if len(x) > 0 else x)

indices = X_copy[X_copy.apply(check_list_lengths, axis=1)].index

#isolating n-gram values
for i in range(len(X['bi-gram-embeddings-statement'].loc[indices[-1]])):
    column_name = f'bi-gram-value-{i+1}' 
    X.loc[:, column_name] = X['bi-gram-embeddings-statement'].apply(lambda x: x[i] if i < len(x) else None)
X = X.drop(columns=['bi-gram-embeddings-statement'])

for i in range(len(X['quad-gram-embeddings-statement'].loc[indices[-1]])):
    column_name = f'quad-gram-value-{i+1}' 
    X.loc[:, column_name] = X['quad-gram-embeddings-statement'].apply(lambda x: x[i] if i < len(x) else None)
X = X.drop(columns=['quad-gram-embeddings-statement'])

for i in range(len(X['embeddings_headline'].loc[indices[-1]])):
    column_name = f'embeddings-headline-value-{i+1}' 
    X.loc[:, column_name] = X['embeddings_headline'].apply(lambda x: x[i] if i < len(x) else None)
X = X.drop(columns=['embeddings_headline'])

for i in range(len(X['bi-gram-embeddings-content-1'].loc[indices[-1]])):
    column_name = f'bi-gram-content1-{i+1}' 
    X.loc[:, column_name] = X['bi-gram-embeddings-content-1'].apply(lambda x: x[i] if i < len(x) else None)
X = X.drop(columns=['bi-gram-embeddings-content-1'])

for i in range(len(X['quad-gram-embeddings-content-1'].loc[indices[-1]])):
    column_name = f'quad-gram-content1-{i+1}' 
    X.loc[:, column_name] = X['quad-gram-embeddings-content-1'].apply(lambda x: x[i] if i < len(x) else None)
X = X.drop(columns=['quad-gram-embeddings-content-1'])

for i in range(len(X['bi-gram-embeddings-content-2'].loc[indices[-1]])):
    column_name = f'bi-gram-content2-{i+1}' 
    X.loc[:, column_name] = X['bi-gram-embeddings-content-2'].apply(lambda x: x[i] if i < len(x) else None)
X = X.drop(columns=['bi-gram-embeddings-content-2'])

for i in range(len(X['quad-gram-embeddings-content-2'].loc[indices[-1]])):
    column_name = f'quad-gram-content2-{i+1}' 
    X.loc[:, column_name] = X['quad-gram-embeddings-content-2'].apply(lambda x: x[i] if i < len(x) else None)
X = X.drop(columns=['quad-gram-embeddings-content-2'])

for i in range(len(X['bi-gram-embeddings-content-3'].loc[indices[-1]])):
    column_name = f'bi-gram-content3-{i+1}' 
    X.loc[:, column_name] = X['bi-gram-embeddings-content-3'].apply(lambda x: x[i] if i < len(x) else None)
X = X.drop(columns=['bi-gram-embeddings-content-3'])

for i in range(len(X['quad-gram-embeddings-content-3'].loc[indices[-1]])):
    column_name = f'quad-gram-content3-{i+1}' 
    X.loc[:, column_name] = X['quad-gram-embeddings-content-3'].apply(lambda x: x[i] if i < len(x) else None)
X = X.drop(columns=['quad-gram-embeddings-content-3'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['bi-gram-embeddings-statement'] = X['bi-gram-embeddings-statement'].apply(lambda x: x[0] if len(x) > 0 else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['quad-gram-embeddings-statement'] = X['quad-gram-embeddings-statement'].apply(lambda x: x[0] if len(x) > 0 else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

### START AGAIN AT THIS CELL IF NOT USING YOUR OWN PERIGON API DATA

In [39]:
#START AGAIN AT THIS CELL IF NOT USING YOUR OWN PERIGON API DATA
X = pd.read_csv('Data/equal_random_sample_processed.csv')
shuffled_df = X.sample(frac=1)
shuffled_df.reset_index(drop=True, inplace=True)
X = shuffled_df

In [40]:
#set up for training
X.fillna(0, inplace=True)
y = X['Truth_Label']
X = X.drop(columns=['Truth_Label', 'Unnamed: 0'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_Stance = X_train.drop(columns = ['Statement'])
X_test_Stance = X_test.drop(columns = ['Statement'])

In [44]:
names = ["Random Forest"]
classifiers = [RandomForestClassifier(n_estimators=100, max_features=837, max_depth=5)]
max_score = 0.0
max_class = ''
# iterate over classifiers
for name, clf in zip(names, classifiers):
    start_time = time.time()
    clf.fit(X_train_Stance, y_train)
    score = 100.0 * clf.score(X_test_Stance, y_test)
    probabilities = clf.predict_proba(X_test_Stance)
    print('Classifier = %s, Score (test, accuracy) = %.2f,' %(name, score), 'Training time = %.2f seconds' % (time.time() - start_time))
    
    if score > max_score:
        clf_best = clf
        max_score = score
        max_class = name
        best_probabilities = probabilities

print(80*'-' )
print('Best --> Classifier = %s, Score (test, accuracy) = %.2f' %(max_class, max_score))
#plot the output of the various algorithms

Classifier = Random Forest, Score (test, accuracy) = 21.21, Training time = 2.56 seconds
--------------------------------------------------------------------------------
Best --> Classifier = Random Forest, Score (test, accuracy) = 21.21


In [45]:
# List of class labels
class_labels = ['barely-true', 'false', 'half-true', 'mostly-true', 'pants-fire', 'true']
predictions = []
# Iterate through each row and find the index of the maximum value
for i, row in enumerate(best_probabilities):
    max_index = np.argmax(row)
    max_class = class_labels[max_index]
    predictions.append(max_class)

In [46]:
report = classification_report(y_test, predictions)
matrix = confusion_matrix(y_test, predictions)

In [47]:
print(report)

              precision    recall  f1-score   support

 barely-true       0.50      0.27      0.35        15
       false       0.00      0.00      0.00         8
   half-true       0.11      0.22      0.14         9
 mostly-true       0.18      0.17      0.17        12
  pants-fire       0.25      0.27      0.26        11
        true       0.30      0.27      0.29        11

    accuracy                           0.21        66
   macro avg       0.22      0.20      0.20        66
weighted avg       0.25      0.21      0.22        66



In [48]:
matrix

array([[4, 4, 3, 1, 1, 2],
       [0, 0, 1, 3, 2, 2],
       [3, 0, 2, 1, 2, 1],
       [0, 1, 6, 2, 2, 1],
       [0, 1, 3, 3, 3, 1],
       [1, 0, 4, 1, 2, 3]])