In [None]:
#!pip install pickle5
#!pip install --upgrade pandas

In [2]:
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


# Import Libraries & Data

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import plotly.express as px
import html
import re
import string
import spacy
import math

import wordcloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import nltk
from nltk import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import os

[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#print(pd.__version__)

In [2]:
# Load the labelled tweet DataFrame (cleaned and lemmatized, per the file name).
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle("../data/cleaned_lemmatized.pkl")

In [3]:
# Show the loaded frame; the recorded output lists the columns id, text and target.
df

Unnamed: 0,id,text,target
0,1010598407596072962,icymi learn mandate rooftop solar power cause ...,0
1,1035418425047298049,nationalhurricanecenter hype storm convince cl...,0
2,968236700794175488,hey theresistance altleft antifa waronwomen fe...,0
3,968239413770760194,de jaren ge het globaal koeling compilation ne...,0
4,1067506402543964161,lovely little skybastard idonotconsent wedonot...,0
...,...,...,...
181013,1034131779647467520,mayor yes zev bus city bus clean quieter help ...,1
181014,959464906121777154,late william freimuth daily thank climatechang...,1
181015,952744691035914240,finally happen year ago lnp obstruction denial...,1
181016,1044531203154616320,present goal objective state play partner year...,1


In [13]:
# read df
#with open('/content/drive/MyDrive/Thesis/training_dataset_undersampled', 'rb') as pickle_in:
#    data = pickle.load(pickle_in)
#    pickle_in.close()

In [4]:
# Model input is the text column; the label is the target column
# (0/1 in the recorded sample of the frame).
X, y = df['text'], df['target']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=100,
)

In [5]:
# make stopwords list

# Stance-revealing hashtags that the vectorizers below must ignore (they are
# passed in through `stop_words`). A few entries appear twice; that is harmless
# because the values are merged through a set before use.
# presumably these hashtags were also used to label the data - TODO confirm
definitive_hashtags = [
    'climatechangeisreal', 'actonclimate', 'extinctionrebellion', 'climateemergency',
    'climateactionnow', 'capitalism', 'public_health', 'climateaction', 'humanityextinction',
    'activism', 'noplanetb', 'savetheplanet', 'climateaction', 'climatechangeisfalse',
    'climatechangenotreal', 'climatechangehoax', 'globalwarminghoax', 'tcot', 'ccot', 'tlot',
    'pjnet', 'rednationrising', 'votered', 'libtard', 'libtards', 'maga', 'climatedeniers',
    'climatehoax', 'globalcooling', 'climatechangescam', 'climatehysteria', 'globalwarmingisahoax',
    'globalwarmingscam', 'globalcooling',
]

# NLTK's English stop words, kept as a set for cheap de-duplication.
stop_words = set(stopwords.words("english"))

# scikit-learn's text vectorizers validate `stop_words` as 'english', a list,
# or None (a set is rejected from scikit-learn 1.2 onwards), so the merged
# collection is materialised as a list. Sorting makes its order deterministic.
stopwords_all = sorted(stop_words.union(definitive_hashtags))

## Baseline Model (81.5% Accuracy)

In [7]:
from sklearn.naive_bayes import MultinomialNB

In [8]:
# Baseline: counts of the 1000 most frequent terms fed to Multinomial Naive Bayes.
vect = CountVectorizer(max_features=1000,stop_words=stopwords_all)

# Fit the vocabulary on the training split only; the test split is merely transformed.
xtrain_count = vect.fit_transform(X_train)
xtest_count = vect.transform(X_test)

# fit the training dataset on the NB classifier
baseline = MultinomialNB()
baseline.fit(xtrain_count, y_train)

# predict the labels on test set and get evaluation scores
baseline_pred = baseline.predict(xtest_count)

# All scores are percentages rounded to 3 decimals.
accuracy_baseline = round(100*accuracy_score(y_test, baseline_pred),3)
precision_baseline = round(100*precision_score(y_test, baseline_pred),3)
recall_baseline = round(100*recall_score(y_test, baseline_pred),3)
# F1 is computed from the predictions themselves. Recombining the already
# rounded precision/recall drifts in the last decimal and raises
# ZeroDivisionError when both are 0.
f1_baseline = round(100*f1_score(y_test, baseline_pred),3)

# print evaluation metrics
print('Accuracy:', accuracy_baseline)
print('Precision:',precision_baseline)
print('Recall:',recall_baseline)
print('F1:',f1_baseline)

Accuracy: 81.455
Precision: 82.763
Recall: 86.486
F1: 84.584


In [11]:
# Confusion matrix for the baseline: rows are true classes, columns are predicted
# classes, both ordered 1 then 0 because of labels=[1,0].
confusion_matrix(y_test, baseline_pred, labels = [1,0])

array([[18418,  2878],
       [ 3836, 11072]])

In [12]:
# Persist the fitted baseline classifier to the working directory.
# The context manager guarantees the file handle is flushed and closed,
# even if pickling raises.
# NOTE(review): the fitted vectorizer is not saved here; reloading this model
# elsewhere needs the same vocabulary.
filename = 'BaselineNB_81.5%'
with open(filename, 'wb') as model_file:
    pickle.dump(baseline, model_file)

## Logistic Regression + TF-IDF

In [57]:
# TF-IDF features over unigrams, bigrams and trigrams (author's note: accuracy 88.7%).
vect = TfidfVectorizer(
    stop_words=stopwords_all,
    ngram_range=(1, 3),
    min_df=3,    # ignore n-grams that occur in fewer than 3 documents
    max_df=0.8,  # ignore n-grams that occur in more than 80% of documents
)

# Logistic regression with a raised iteration cap.
model = LogisticRegression(max_iter=1000)

In [58]:
# Fit the vectorizer on the training split only, then reuse it on the test split.
X_train_vec = vect.fit_transform(X_train)
X_test_vec  = vect.transform(X_test)

# Train the classifier and predict hard labels for the held-out tweets.
model.fit(X_train_vec, y_train);
y_pred = model.predict(X_test_vec)

# Scores are percentages rounded to 3 decimals.
precision = round(100*precision_score(y_test, y_pred),3)
recall = round(100*recall_score(y_test, y_pred),3)
# F1 comes from f1_score rather than from the rounded precision/recall above:
# that avoids compounded rounding error and a ZeroDivisionError when both are 0.
f1 = round(100*f1_score(y_test, y_pred),3)
accuracy = round(100*accuracy_score(y_test, y_pred),3)

print('Accuracy:', accuracy)
print('Precision:',precision)
print('Recall:',recall)
print('F1:',f1)

Accuracy: 88.606
Precision: 88.478
Recall: 92.703
F1: 90.541


In [18]:
# TF-IDF features over bigrams and trigrams only, no unigrams
# (author's note: accuracy 80.3%).
vect = TfidfVectorizer(
    stop_words=stopwords_all,
    ngram_range=(2, 3),
    min_df=3,    # ignore n-grams that occur in fewer than 3 documents
    max_df=0.8,  # ignore n-grams that occur in more than 80% of documents
)

# Logistic regression with a raised iteration cap.
model = LogisticRegression(max_iter=1000)

In [19]:
# Fit the vectorizer on the training split only, then reuse it on the test split.
X_train_vec = vect.fit_transform(X_train)
X_test_vec  = vect.transform(X_test)

# Train the classifier and predict hard labels for the held-out tweets.
model.fit(X_train_vec, y_train);
y_pred = model.predict(X_test_vec)

# Scores are percentages rounded to 3 decimals.
precision = round(100*precision_score(y_test, y_pred),3)
recall = round(100*recall_score(y_test, y_pred),3)
# F1 comes from f1_score rather than from the rounded precision/recall above:
# that avoids compounded rounding error and a ZeroDivisionError when both are 0.
f1 = round(100*f1_score(y_test, y_pred),3)
accuracy = round(100*accuracy_score(y_test, y_pred),3)

print('Accuracy:', accuracy)
print('Precision:',precision)
print('Recall:',recall)
print('F1:',f1)

Accuracy: 80.323
Precision: 77.984
Recall: 92.726
F1: 84.718


## Logistic Regression + CountVectorizer

In [13]:
# Raw count features over bigrams and trigrams only
# (author's note: 80% accuracy).
vect = CountVectorizer(
    stop_words=stopwords_all,
    ngram_range=(2, 3),
    min_df=3,    # ignore n-grams that occur in fewer than 3 documents
    max_df=0.8,  # ignore n-grams that occur in more than 80% of documents
)

# Logistic regression with a raised iteration cap.
model = LogisticRegression(max_iter=1000)


In [14]:
# Fit the vectorizer on the training split only, then reuse it on the test split.
X_train_vec = vect.fit_transform(X_train)
X_test_vec  = vect.transform(X_test)

# Train the classifier and predict hard labels for the held-out tweets.
model.fit(X_train_vec, y_train);
y_pred = model.predict(X_test_vec)

# Scores are percentages rounded to 3 decimals.
precision = round(100*precision_score(y_test, y_pred),3)
recall = round(100*recall_score(y_test, y_pred),3)
# F1 comes from f1_score rather than from the rounded precision/recall above:
# that avoids compounded rounding error and a ZeroDivisionError when both are 0.
f1 = round(100*f1_score(y_test, y_pred),3)
accuracy = round(100*accuracy_score(y_test, y_pred),3)

print('Accuracy:', accuracy)
print('Precision:',precision)
print('Recall:',recall)
print('F1:',f1)

Accuracy: 80.422
Precision: 79.129
Recall: 90.618
F1: 84.485


In [6]:
# Raw count features over unigrams, bigrams and trigrams
# (author's note: BEST, 89.2% accuracy).
vect = CountVectorizer(
    stop_words=stopwords_all,
    ngram_range=(1, 3),
    min_df=3,    # ignore n-grams that occur in fewer than 3 documents
    max_df=0.8,  # ignore n-grams that occur in more than 80% of documents
)

# Logistic regression with a raised iteration cap.
model = LogisticRegression(max_iter=1000)

In [7]:
# Fit the vectorizer on the training split only, then reuse it on the test split.
X_train_vec = vect.fit_transform(X_train)
X_test_vec  = vect.transform(X_test)

# Train the classifier and predict hard labels for the held-out tweets.
model.fit(X_train_vec, y_train);
y_pred = model.predict(X_test_vec)

# Scores are percentages rounded to 3 decimals.
precision = round(100*precision_score(y_test, y_pred),3)
recall = round(100*recall_score(y_test, y_pred),3)
# F1 comes from f1_score rather than from the rounded precision/recall above:
# that avoids compounded rounding error and a ZeroDivisionError when both are 0.
f1 = round(100*f1_score(y_test, y_pred),3)
accuracy = round(100*accuracy_score(y_test, y_pred),3)

print('Accuracy:', accuracy)
print('Precision:',precision)
print('Recall:',recall)
print('F1:',f1)

Accuracy: 88.927
Precision: 89.622
Recall: 91.806
F1: 90.701


In [22]:
# Number of test-set rows per class label.
y_test.value_counts()

1    21296
0    14908
Name: target, dtype: int64

In [24]:
# Confusion matrix for the current y_pred: rows are true classes, columns are
# predicted classes, both ordered 1 then 0 because of labels=[1,0].
confusion_matrix(y_test,y_pred,labels=[1,0])

array([[19608,  1688],
       [ 2239, 12669]])

In [25]:
# Persist the fitted logistic-regression model to the working directory.
# The context manager guarantees the file handle is flushed and closed,
# even if pickling raises.
# NOTE(review): the fitted vectorizer `vect` is not saved alongside the model;
# reloading this model elsewhere needs the same vocabulary.
filename = 'LR_89.2%'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

## Logistic Regression Grid Search

In [22]:
from warnings import filterwarnings
# Installs a blanket 'ignore' filter: warnings raised by cells run after this
# one are no longer shown.
# NOTE(review): this also hides solver-convergence and deprecation warnings
# that would be useful during the grid search - consider narrowing the filter.
filterwarnings('ignore')

In [24]:
# GridSearch for best parameters

# Candidate settings: unpenalised vs. L2-penalised logistic regression over a
# log-spaced range of C (C has no effect when the penalty is 'none').
# NOTE(review): scikit-learn >= 1.2 deprecates the string 'none' in favour of
# None; the string is kept for the version this notebook was run on.
logit_params = {'penalty': ['none', 'l2'], 'solver': ['lbfgs'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000]}

# Give the searched estimator the same iteration cap (1000) as every other
# LogisticRegression in this notebook. With the library default of 100
# iterations, weakly regularised candidates can stop before converging and be
# ranked unfairly; the warning that would reveal this is silenced.
gs_logit = GridSearchCV(LogisticRegression(max_iter=1000), logit_params, cv=3, scoring='f1')

# X_train_vec / X_test_vec are the matrices produced by the most recently
# executed vectorisation cell.
gs_logit.fit(X_train_vec, y_train)
y_true, y_pred = y_test, gs_logit.predict(X_test_vec)

# Report the winning configuration and its scores on the held-out test split.
print('Best Parameters are:', gs_logit.best_params_)
print('accuracy =', accuracy_score(y_true, y_pred))
print('precision = ', precision_score(y_true, y_pred))
print('recall = ', recall_score(y_true, y_pred))
print('f1 score = ', f1_score(y_true, y_pred))

Best Parameters are: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
accuracy = 0.8536625787205834
precision =  0.8475408411539799
recall =  0.9159936138241923
f1 score =  0.8804387073478968


In [6]:
# Rebuild the 1-3 gram count features and a logistic regression configured
# with the parameters reported by the grid search (lbfgs, L2 penalty, C=0.01).
vect = CountVectorizer(
    stop_words=stopwords_all,
    ngram_range=(1, 3),
    min_df=3,    # ignore n-grams that occur in fewer than 3 documents
    max_df=0.8,  # ignore n-grams that occur in more than 80% of documents
)

model = LogisticRegression(
    C=0.01,
    penalty='l2',
    solver='lbfgs',
    max_iter=1000,
)

In [7]:
# Fit the vectorizer on the training split only, then reuse it on the test split.
X_train_vec = vect.fit_transform(X_train)
X_test_vec  = vect.transform(X_test)

# Train the classifier and predict hard labels for the held-out tweets.
model.fit(X_train_vec, y_train);
y_pred = model.predict(X_test_vec)

# Scores are percentages rounded to 3 decimals.
precision = round(100*precision_score(y_test, y_pred),3)
recall = round(100*recall_score(y_test, y_pred),3)
# F1 comes from f1_score rather than from the rounded precision/recall above:
# that avoids compounded rounding error and a ZeroDivisionError when both are 0.
f1 = round(100*f1_score(y_test, y_pred),3)
accuracy = round(100*accuracy_score(y_test, y_pred),3)

print('Accuracy:', accuracy)
print('Precision:',precision)
print('Recall:',recall)
print('F1:',f1)

Accuracy: 85.366
Precision: 84.754
Recall: 91.599
F1: 88.044


## Classifying Unseen Data

In [9]:
# load unseen dataset & assign to X
# NOTE(review): this rebinds X, which earlier held the text column of the
# labelled frame `df`; from here on X is the unseen tweets' text.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.

usa = pd.read_pickle("../usa_tweets_demo.pkl")
X = usa['text']

In [10]:
# vectorize X
# Only transform() is called, so the vocabulary of the already-fitted `vect`
# is reused; `vect` must be the vectorizer that `model` was trained with.

X_vec_pred = vect.transform(X)

In [11]:
# predict probabilities for the new data
# NOTE(review): y_pred is rebound here - earlier cells used it for hard test-set
# labels, now it holds the per-class probability rows returned by predict_proba.

y_pred = model.predict_proba(X_vec_pred)

In [12]:
# Turn each probability row into a hard 0/1 label using a 0.5 cut-off on the
# second column: below 0.5 becomes 0, anything else becomes 1.
# presumably the second column is P(target == 1) - verify against model.classes_
believer_denier_preds = [0 if proba_row[1] < 0.5 else 1 for proba_row in y_pred]

In [31]:
# add classifications to the dataset
# Writes the 0/1 predictions into a new column of `usa` (mutating it in place)
# and shows how many rows fall into each class.
# NOTE(review): this cell exists twice in the notebook with different recorded
# totals, so one of the two recorded outputs is presumably stale - confirm and drop it.

usa['believer_denier'] = believer_denier_preds
usa['believer_denier'].value_counts()

1    111470
0     57990
Name: believer_denier, dtype: int64

In [13]:
# add classifications to the dataset
# Writes the 0/1 predictions into a new column of `usa` (mutating it in place)
# and shows how many rows fall into each class.
# NOTE(review): this cell exists twice in the notebook with different recorded
# totals, so one of the two recorded outputs is presumably stale - confirm and drop it.

usa['believer_denier'] = believer_denier_preds
usa['believer_denier'].value_counts()

1    100749
0     52196
Name: believer_denier, dtype: int64

In [14]:
# Show each tweet's text next to its predicted class.
usa.loc[:, ['text', 'believer_denier']]

Unnamed: 0,text,believer_denier
0,ll juice left carrot tonight fresh juice morni...,0
1,climate fact course warm year concern learn cl...,1
2,well start plant gogreen eco optoutside actonc...,1
3,planet great winner world move ahead actonclim...,1
4,climatechange challenge urban challenge watch ...,1
...,...,...
152940,savage energy partner record break fiscal quar...,1
152941,hard tell snakeoil chemtrail globalwarmingisah...,0
152942,standard winter hurricane warn part florida yi...,1
152943,happy new yeaя fan globalwarmingisahoax hoax f...,0


In [39]:
# Sanity check: split the classified tweets by predicted class so individual
# examples can be read. Each subset gets a fresh 0..n-1 index.
believers = usa.loc[usa['believer_denier'] == 1].reset_index(drop=True)

deniers = usa.loc[usa['believer_denier'] == 0].reset_index(drop=True)

In [59]:
# Spot-check one tweet predicted as class 1 (row label 1).
believers.loc[1, 'text']

'bike firstnight firstnightmonterey mayor clyde roberson monterey commit renewable energy roberson say paris accord happynewyear actonclimate climatechange'

In [67]:
# Spot-check another tweet predicted as class 1 (row label 13).
believers.loc[13, 'text']

'climate change real start work solution instead get stick argue sciencehate conspiracylove family member aka holiday climatechangeisreal'

In [61]:
# Spot-check one tweet predicted as class 0 (row label 50).
deniers.loc[50, 'text']

'love fail tweet rest think global warming newsworthy frigid day brrrrrrr maga'

In [51]:
# Spot-check another tweet predicted as class 0 (row label 12).
deniers.loc[12, 'text']

'wowcow global warmingcoolingclimate changetcot climatechange hoax'

In [16]:
# save classified dataset
# Writes `usa`, including the believer_denier column added above, to a pickle
# under ../data (the input was read from ../ rather than ../data).

usa.to_pickle("../data/usa_classified_tweets.pkl")