## Import Libraries & Data

In [9]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import plotly.express as px
import html
import re
import string
import spacy
import math

import wordcloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import nltk
from nltk import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
import os

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from warnings import filterwarnings
filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the training DataFrame from its pickle file (path is relative to the notebook).
# NOTE: unpickling can execute arbitrary code, so only read pickles from a trusted source.
df = pd.read_pickle("../data/training_data.pkl")

In [3]:
# Preview the first rows to see which columns were loaded.
df.head()

Unnamed: 0,hashtags,favorite_count,id,lang,place,retweet_count,text,user_location,tweet_proc_length,target
0,"[tcot, tlot, climatechange, solar]",0,1010598407596072962,en,,0,icymi learn mandate rooftop solar power cause ...,"Washington, D.C.",19,0
1,"[nationalhurricanecenter, climatechange, hurri...",1,1035418425047298049,en,,0,nationalhurricanecenter hype storm convince cl...,USA,39,0
2,"[theresistance, altleft, antifa, waronwomen, f...",0,968236700794175488,en,,0,hey theresistance altleft antifa waronwomen fe...,,24,0
3,"[nos, eenvandaag, nieuwsuur, groenlinks, jinek...",0,968239413770760194,en,,0,de jaren ge het globaal koeling compilation ne...,,33,0
4,"[skybastard, idonotconsent, wedonotconsent, op...",1,1067506402543964161,en,,1,lovely little skybastard idonotconsent wedonot...,Right Here......,21,0


In [4]:
# Count how many rows carry each label to check the class balance.
df['target'].value_counts()

1    82735
0    74462
Name: target, dtype: int64

In [5]:
# Keep only the tweet text and its label; the other columns are not used for modelling.
col_list = ['text', 'target']
df = df[col_list]

In [6]:
# Show the reduced frame (only the `text` and `target` columns remain).
df

Unnamed: 0,text,target
0,icymi learn mandate rooftop solar power cause ...,0
1,nationalhurricanecenter hype storm convince cl...,0
2,hey theresistance altleft antifa waronwomen fe...,0
3,de jaren ge het globaal koeling compilation ne...,0
4,lovely little skybastard idonotconsent wedonot...,0
...,...,...
157192,tonight feel like canada strange turn learn ha...,1
157193,lead cause climate change thing far animal agr...,1
157194,curious date global air temperature climate ch...,1
157195,sign climatechange climateemergencynow,1


In [7]:
# Features are the tweet text; labels are the 0/1 `target` column.
X, y = df['text'], df['target']

# Hold out 20% for testing. Stratifying on y keeps the class ratio in both
# splits, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=100,
)

In [8]:
# Build the stop-word list that every vectorizer below receives.

# Stance-revealing hashtags that are removed from the features in addition to
# the regular English stop words.
# NOTE(review): presumably excluded because they give the label away - confirm.
definitive_hashtags = ['climatechangeisreal', 'actonclimate', 'extinctionrebellion', 'climateemergency',
                 'climateactionnow', 'capitalism', 'public_health', 'climateaction', 'humanityextinction',
                 'activism', 'noplanetb', 'savetheplanet', 'climatechangeisfalse',
                 'climatechangenotreal', 'climatechangehoax','globalwarminghoax', 'tcot', 'ccot', 'tlot',
                 'pjnet', 'rednationrising', 'votered','libtard', 'libtards', 'maga', 'climatedeniers',
                 'climatehoax', 'globalcooling','climatechangescam', 'climatehysteria', 'globalwarmingisahoax',
                 'globalwarmingscam']

stop_words = set(stopwords.words("english"))

# scikit-learn documents `stop_words` as 'english', a list, or None, and
# releases with parameter validation reject a set. Materialise the union as a
# (sorted, de-duplicated) list so the vectorizers accept it on any version.
stopwords_all = sorted(stop_words.union(definitive_hashtags))

## Baseline Model (81.5% Accuracy)

In [74]:
from sklearn.naive_bayes import MultinomialNB

In [75]:
# Baseline: bag-of-words counts limited to the 1000 most frequent terms,
# classified with multinomial Naive Bayes.
vect = CountVectorizer(stop_words=stopwords_all, max_features=1000)

# Learn the vocabulary on the training split only, then reuse it for the test split.
xtrain_count = vect.fit_transform(X_train)
xtest_count = vect.transform(X_test)

# fit() returns the estimator itself, so construction and training can be chained.
baseline = MultinomialNB().fit(xtrain_count, y_train)

# Score the held-out split.
baseline_pred = baseline.predict(xtest_count)
print(classification_report(y_test, baseline_pred))

              precision    recall  f1-score   support

           0       0.82      0.78      0.80     14893
           1       0.81      0.84      0.83     16547

    accuracy                           0.81     31440
   macro avg       0.81      0.81      0.81     31440
weighted avg       0.81      0.81      0.81     31440



In [76]:
# labels=[1, 0] lists class 1 first, so the first row/column refers to class 1
# (rows are the true labels, columns the predicted ones).
confusion_matrix(y_test, baseline_pred, labels = [1,0])

array([[13954,  2593],
       [ 3312, 11581]])

In [77]:
# Persist the fitted baseline classifier.
# NOTE(review): only the classifier is written; the fitted vectorizer (`vect`)
# is not saved, so it has to be rebuilt before this model can score new text.
filename = 'BaselineNB_81.5%'

# A context manager flushes and closes the file handle deterministically
# instead of leaving it to garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(baseline, model_file)

## Logistic Regression + TF-IDF

In [78]:
# Logistic regression on TF-IDF features, unigrams through trigrams (accuracy 88%).
# Terms appearing in more than 80% of documents or in fewer than 3 are dropped.
vect = TfidfVectorizer(
    ngram_range=(1, 3),
    max_df=0.8,
    min_df=3,
    stop_words=stopwords_all,
)
X_train_vec = vect.fit_transform(X_train)
X_test_vec = vect.transform(X_test)

# fit() returns the estimator, so training and prediction are chained.
model = LogisticRegression(max_iter=1000)
y_pred = model.fit(X_train_vec, y_train).predict(X_test_vec)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.86      0.87     14893
           1       0.88      0.90      0.89     16547

    accuracy                           0.88     31440
   macro avg       0.88      0.88      0.88     31440
weighted avg       0.88      0.88      0.88     31440



In [79]:
# Logistic regression on TF-IDF features, bigrams and trigrams only (accuracy 79%).
# Terms appearing in more than 80% of documents or in fewer than 3 are dropped.
vect = TfidfVectorizer(
    ngram_range=(2, 3),
    max_df=0.8,
    min_df=3,
    stop_words=stopwords_all,
)
X_train_vec = vect.fit_transform(X_train)
X_test_vec = vect.transform(X_test)

# fit() returns the estimator, so training and prediction are chained.
model = LogisticRegression(max_iter=1000)
y_pred = model.fit(X_train_vec, y_train).predict(X_test_vec)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.69      0.76     14893
           1       0.76      0.88      0.82     16547

    accuracy                           0.79     31440
   macro avg       0.80      0.79      0.79     31440
weighted avg       0.80      0.79      0.79     31440



## Logistic Regression + CountVectorizer

In [80]:
# Logistic regression on raw n-gram counts, bigrams and trigrams only (accuracy 80%).
# Terms appearing in more than 80% of documents or in fewer than 3 are dropped.
vect = CountVectorizer(
    ngram_range=(2, 3),
    max_df=0.8,
    min_df=3,
    stop_words=stopwords_all,
)
X_train_vec = vect.fit_transform(X_train)
X_test_vec = vect.transform(X_test)

# fit() returns the estimator, so training and prediction are chained.
model = LogisticRegression(max_iter=1000)
y_pred = model.fit(X_train_vec, y_train).predict(X_test_vec)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.83      0.80     14893
           1       0.84      0.78      0.81     16547

    accuracy                           0.80     31440
   macro avg       0.80      0.80      0.80     31440
weighted avg       0.81      0.80      0.80     31440



In [81]:
# Logistic regression on raw n-gram counts, unigrams through trigrams.
# Best configuration tried so far (about 89% accuracy).
# Terms appearing in more than 80% of documents or in fewer than 3 are dropped.
vect = CountVectorizer(
    ngram_range=(1, 3),
    max_df=0.8,
    min_df=3,
    stop_words=stopwords_all,
)
X_train_vec = vect.fit_transform(X_train)
X_test_vec = vect.transform(X_test)

# fit() returns the estimator, so training and prediction are chained.
model = LogisticRegression(max_iter=1000)
y_pred = model.fit(X_train_vec, y_train).predict(X_test_vec)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88     14893
           1       0.89      0.90      0.89     16547

    accuracy                           0.89     31440
   macro avg       0.89      0.88      0.89     31440
weighted avg       0.89      0.89      0.89     31440



In [82]:
# Confusion matrix for the most recently computed `y_pred`.
# labels=[1,0] lists class 1 first, so the first row/column refers to class 1
# (rows are the true labels, columns the predicted ones).
confusion_matrix(y_test,y_pred,labels=[1,0])

array([[14861,  1686],
       [ 1910, 12983]])

In [83]:
# Persist the logistic-regression model currently bound to `model`.
# NOTE(review): only the classifier is written; the fitted vectorizer (`vect`)
# is not saved, so it has to be rebuilt before this model can score new text.
filename = 'LR_89.2%'

# A context manager flushes and closes the file handle deterministically
# instead of leaving it to garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

## Logistic Regression Grid Search

In [12]:
# Grid search over the regularisation strength of the logistic regression,
# using the best feature set found above (counts of 1- to 3-grams).

vect = CountVectorizer(ngram_range=(1,3),max_df = 0.8, min_df = 3, stop_words=stopwords_all)

# NOTE(review): the vocabulary is learned from all of X_train before the
# cross-validation split, so each fold's validation part has already influenced
# the min_df/max_df filtering. Wrap vectorizer + classifier in a Pipeline if a
# strictly leak-free CV estimate is needed.
X_train_vec = vect.fit_transform(X_train)
X_test_vec  = vect.transform(X_test)

# The liblinear solver has no unpenalised mode, so every penalty='none'
# candidate failed to fit and was scored NaN (hidden by the global
# filterwarnings('ignore')). Only the supported l2 penalty is searched here.
params = {
    'penalty': ['l2'],
    'solver': ['liblinear'],
    'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
}

# max_iter=1000 matches the setting used for the final model, so the search
# evaluates the same estimator configuration that is trained afterwards.
grid = GridSearchCV(LogisticRegression(max_iter=1000), params, cv=3, scoring='f1')
grid.fit(X_train_vec, y_train)

# GridSearchCV refits the best candidate on the full training data, so
# grid.predict scores the held-out split with that refitted model.
y_true, y_pred = y_test, grid.predict(X_test_vec)

print('Best Parameters are:', grid.best_params_)
print(classification_report(y_true, y_pred))

Best Parameters are: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
              precision    recall  f1-score   support

           0       0.89      0.87      0.88     14893
           1       0.89      0.90      0.89     16547

    accuracy                           0.89     31440
   macro avg       0.89      0.88      0.89     31440
weighted avg       0.89      0.89      0.89     31440



In [13]:
# Final model: retrain with the parameters reported by the grid search
# (liblinear solver, l2 penalty, C=1) on 1- to 3-gram counts.
vect = CountVectorizer(
    ngram_range=(1, 3),
    max_df=0.8,
    min_df=3,
    stop_words=stopwords_all,
)
X_train_vec = vect.fit_transform(X_train)
X_test_vec = vect.transform(X_test)

# fit() returns the estimator, so training and prediction are chained.
model = LogisticRegression(
    max_iter=1000,
    solver='liblinear',
    penalty='l2',
    C=1,
)
y_pred = model.fit(X_train_vec, y_train).predict(X_test_vec)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88     14893
           1       0.89      0.90      0.89     16547

    accuracy                           0.89     31440
   macro avg       0.89      0.88      0.89     31440
weighted avg       0.89      0.89      0.89     31440



## Classifying Unseen Data

In [14]:
# Load the unseen dataset and take its `text` column as the model input.
# NOTE: unpickling can execute arbitrary code, so only read pickles from a trusted source.
# NOTE(review): this rebinds `X`, which earlier held the training text; re-running
# the train/test split cell afterwards would need `X` and `y` to be rebuilt first.

usa = pd.read_pickle("../data/usa_tweets_demo.pkl")
X = usa['text']

In [15]:
# Vectorize X with the already-fitted vectorizer (transform only, no refit),
# so the feature columns line up with those `model` was trained on.
# Assumes `vect` is still the one from the final-model cell, i.e. cells ran in order.

X_vec_pred = vect.transform(X)

In [16]:
# Predict class probabilities for the new data.
# The thresholding cell below reads column index 1 of each row as the class-1 probability.
# NOTE(review): `y_pred` previously held hard test-set labels; from here on it holds probabilities.

y_pred = model.predict_proba(X_vec_pred)

In [17]:
# Turn probabilities into hard labels: a class-1 probability below 0.5 maps to 0,
# anything else (including exactly 0.5) maps to 1.

believer_denier_preds = [0 if row[1] < 0.5 else 1 for row in y_pred]

In [52]:
# add classifications to the dataset
# NOTE(review): this cell is duplicated verbatim by the next one. Its execution
# count (52) is out of sequence and its stored output differs from the rerun
# below, so the counts shown here appear to be stale - consider deleting this cell.

usa['believer_denier'] = believer_denier_preds
usa['believer_denier'].value_counts()

1    84965
0    65833
Name: believer_denier, dtype: int64

In [18]:
# Add the thresholded predictions to the dataset as a new column, then count
# how many tweets fall into each predicted class.
# This writes into `usa` in place; the list must have one entry per row of `usa`.

usa['believer_denier'] = believer_denier_preds
usa['believer_denier'].value_counts()

1    90224
0    60574
Name: believer_denier, dtype: int64

In [19]:
# Show each tweet's text next to its predicted label.
usa[['text','believer_denier']]

Unnamed: 0,text,believer_denier
0,ll juice left carrot tonight fresh juice morni...,1
1,climate fact course warm year concern learn cl...,1
2,planet great winner world move ahead actonclim...,1
3,teen activist meet staff ve lose faith humanit...,1
4,rescue refugee land sea fleeing conflict need ...,1
...,...,...
150793,savage energy partner record break fiscal quar...,1
150794,hard tell snakeoil chemtrail globalwarmingisah...,0
150795,standard winter hurricane warn part florida yi...,1
150796,happy new yeaя fan globalwarmingisahoax hoax f...,0


In [20]:
# Sanity check: split the classified tweets by predicted label
# (1 -> believers, 0 -> deniers) and renumber each subset from 0 so
# individual tweets can be looked up by position.

believers = usa.loc[usa['believer_denier'] == 1].reset_index(drop=True)

deniers = usa.loc[usa['believer_denier'] == 0].reset_index(drop=True)

In [21]:
# Spot-check one tweet that was labelled 1 (believer).
believers['text'][5]

'suggest read thread climatechangeisreal ignore'

In [22]:
# Spot-check another tweet that was labelled 1 (believer).
believers['text'][50]

'prevent bad consequence climate change ahead mean action climatechange actonclimate peoplesclimate rochesterny roc'

In [27]:
# Spot-check one tweet that was labelled 0 (denier).
deniers['text'][3]

'globalwarminghoax believer repent link video climatebarbie head south'

In [47]:
# Spot-check another tweet that was labelled 0 (denier).
deniers['text'][140]

'conservative say environmentalist want clean air water global warming hoax story maga'

In [48]:
# Save the classified dataset, including the new `believer_denier` column,
# as a pickle next to the input data.

usa.to_pickle("../data/usa_classified_tweets.pkl")