In [1]:
import pandas as pd
from helpers import *
from sklearn.ensemble import RandomForestClassifier



In [2]:
# Read the TruthSeeker2023 traditional-ML feature table and discard the
# "Unnamed: 0" column (presumably a saved index left over from export -- confirm).
truth_seeker = (
    pd.read_csv("TruthSeeker2023/Features_For_Traditional_ML_Techniques.csv")
    .drop(columns=["Unnamed: 0"])
)

In [10]:
# Extracting all features which are derived based on the content.
# Each column is listed exactly once: selecting with a repeated label would
# return that column twice, giving the model a duplicated feature.
# (The original list named "ORG_percentage" twice; the repeat has been removed.)
# NOTE(review): "adverbs" and "adpositions" are part of truth_seeker_lexical but
# are not selected here -- confirm whether that omission is intentional.
truth_seeker_content = truth_seeker[["unique_count", "present_verbs", "total_count", "past_verbs", "ORG_percentage", "adjectives",
                                     "NORP_percentage", "pronouns", "GPE_percentage", "TOs", "PERSON_percentage", "determiners",
                                     "MONEY_percentage", "conjunctions", "DATE_percentage", "dots", "CARDINAL_percentage", "exclamation",
                                     "PERCENT_percentage", "questions", "ORDINAL_percentage", "ampersand", "FAC_percentage", "capitals",
                                     "LAW_percentage", "quotes", "PRODUCT_percentage", "digits", "EVENT_percentage", "long_word_freq",
                                     "TIME_percentage", "short_word_freq", "LOC_percentage", "WORK_OF_ART_percentage",
                                     "QUANTITY_percentage", "LANGUAGE_percentage", "Max word length", "Min word length", "Average word length"]]

# Column subset that the notebook evaluates as its "user-based" feature group.
truth_seeker_user = truth_seeker[[
    "followers_count", "friends_count", "favourites_count",
    "statuses_count", "listed_count", "following",
    "mentions", "quotes", "replies", "retweets",
    "favourites", "hashtags", "URLs",
]]

# Column subset that the notebook evaluates as its "credibility" feature group.
truth_seeker_cred = truth_seeker[[
    "BotScore",
    "BotScoreBinary",
    "cred",
    "normalize_influence",
]]

# Column subset that the notebook evaluates as its "lexical" feature group
# (features derived from the lexical structure of the tweet).
truth_seeker_lexical = truth_seeker[[
    "present_verbs", "past_verbs", "adjectives", "adverbs", "adpositions",
    "pronouns", "TOs", "determiners", "conjunctions",
    "dots", "exclamation", "questions", "ampersand",
    "capitals", "digits", "long_word_freq", "short_word_freq",
]]

# Extracting all features which are derived based on spaCy tags.
# Each *_percentage column is listed exactly once: selecting with a repeated
# label would return that column twice, giving the model a duplicated feature.
# (The original list named "ORG_percentage" twice; the repeat has been removed.)
truth_seeker_spaCy = truth_seeker[["ORG_percentage", "NORP_percentage", "GPE_percentage", "PERSON_percentage", "MONEY_percentage", "DATE_percentage",
                                   "CARDINAL_percentage", "PERCENT_percentage", "ORDINAL_percentage", "FAC_percentage", "LAW_percentage",
                                   "PRODUCT_percentage", "EVENT_percentage", "TIME_percentage", "LOC_percentage",
                                   "WORK_OF_ART_percentage", "QUANTITY_percentage", "LANGUAGE_percentage"]]

# Full feature matrix: every column except the two target columns, the two
# text columns and the "embeddings" column.
truth_seeker_features = truth_seeker.drop(
    columns=[
        "majority_target",
        "statement",
        "BinaryNumTarget",
        "tweet",
        "embeddings",
    ]
)

# Label vector used by every experiment in this notebook.
truth_seeker_output = truth_seeker["BinaryNumTarget"]

# Creating sklearn model instance.
# A fixed random_state makes the forest's internal randomness repeatable, so
# the metrics printed by the experiments can be reproduced on a fresh
# Restart & Run All instead of drifting from run to run.
# NOTE(review): any randomness inside train_and_evaluate_data (e.g. a
# train/test split) is not visible here and may need its own seed -- confirm.
RANDOM_STATE = 42
rf_model = RandomForestClassifier(random_state=RANDOM_STATE)


In [4]:
# Testing model with all features.
# Baseline experiment: passes the full feature matrix, the label column and the
# shared rf_model instance to train_and_evaluate_data (presumably provided by
# the `from helpers import *` import -- confirm). The recorded output of this
# cell shows accuracy, precision, recall and F1.
print("Metrics when using all features:")
train_and_evaluate_data(truth_seeker_features, truth_seeker_output, rf_model)

Metrics when using all features:
Model Accuracy: 0.6913263785394933
Precision: 0.7079397964779721
Recall: 0.6762120947558349
F1 Score: 0.6917123124553465


In [5]:
# Testing model with spaCy features.
# Same evaluation call as the other experiments, restricted to the
# truth_seeker_spaCy column subset. The same rf_model instance is reused;
# presumably the helper refits it on each call -- confirm in helpers.
print("Metrics when using spaCy features:")
train_and_evaluate_data(truth_seeker_spaCy, truth_seeker_output, rf_model)

Metrics when using spaCy features:
Model Accuracy: 0.6048286140089418
Precision: 0.6239964599532208
Recall: 0.5745300040742681
F1 Score: 0.5982424242424241


In [6]:
# Testing model with lexical features.
# Same evaluation call as the other experiments, restricted to the
# truth_seeker_lexical column subset. The same rf_model instance is reused;
# presumably the helper refits it on each call -- confirm in helpers.
print("Metrics when using lexical features:")
train_and_evaluate_data(truth_seeker_lexical, truth_seeker_output, rf_model)

Metrics when using lexical features:
Model Accuracy: 0.6312965722801789
Precision: 0.638957888048062
Recall: 0.64379256155055
F1 Score: 0.6413661138814798


In [7]:
# Testing model with credibility features.
# Same evaluation call as the other experiments, restricted to the
# truth_seeker_cred column subset. The same rf_model instance is reused;
# presumably the helper refits it on each call -- confirm in helpers.
print("Metrics when using credibility features:")
train_and_evaluate_data(truth_seeker_cred, truth_seeker_output, rf_model)

Metrics when using credibility features:
Model Accuracy: 0.620715350223547
Precision: 0.6289351851851852
Recall: 0.6325592223968337
F1 Score: 0.6307419982008647


In [8]:
# Testing model with user-based features.
# Same evaluation call as the other experiments, restricted to the
# truth_seeker_user column subset. The same rf_model instance is reused;
# presumably the helper refits it on each call -- confirm in helpers.
print("Metrics when using user-based features:")
train_and_evaluate_data(truth_seeker_user, truth_seeker_output, rf_model)

Metrics when using user-based features:


Model Accuracy: 0.6427719821162444
Precision: 0.6606281686657599
Recall: 0.6219079215412374
F1 Score: 0.6406835556888023


In [9]:
# Testing model with content-based features.
# Same evaluation call as the other experiments, restricted to the
# truth_seeker_content column subset. The same rf_model instance is reused;
# presumably the helper refits it on each call -- confirm in helpers.
print("Metrics when using content-based features:")
train_and_evaluate_data(truth_seeker_content, truth_seeker_output, rf_model)

Metrics when using content-based features:
Model Accuracy: 0.6825931445603577
Precision: 0.6920272812793979
Recall: 0.6850590768872592
F1 Score: 0.6885255491532365
