# Importing Libraries & Data

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import plotly.express as px
import html
import re
import string
import spacy
import math

import wordcloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import nltk
from nltk import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import os

# import random undersampling and other necessary libraries 
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_pickle("../full_dataset.pkl")

# Text Cleaning #
- remove special characters, hashtags
- turn everything to lowercase
- replace contradictions
- remove mentions, newlines, emojis, URL's, encoding characters, numbers, punctuation
- remove retweets
- remove extra spaces
- remove rows with 3 or less words
- remove duplicates
- lemmatize & tokenize text

In [3]:
df['text']

0         In less than 3 days we will #Hack4Climate at @...
1         "The best way to predict the future is to crea...
2         Accepting the world for the way it is just mig...
3         Landfills Are Significant Sources of Methane E...
4         "Sea level expected to rise by one (1) metre b...
                                ...                        
940178    @DrJenGunter It’s hard to tell if our #snakeoi...
940180    HAPPY NEW YEAЯ, fans!! 🇺🇸🎊🇺🇸🎊🇺🇸🎊🇺🇸🎊🇺🇸🎊#globalw...
940181    It's September 23 and it's almost 90 F in Buff...
940182    We are in the middle of a real planet crisis s...
Name: text, Length: 940183, dtype: object

In [4]:
#remove special characters
df['text'] = df['text'].str.replace(r'[\",]*', '',regex=True)

#remove hashtags
df['text'] = df['text'].replace('([#])','', regex=True)
#df['text'] = df['text'].replace('([A-Z][a-z]+)', r' \1', regex=True) 

#lowercase
df['text'] = df['text'].str.lower()

In [5]:
#replace contradictions

contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}

# Regular expression for finding contractions
contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))

# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

# Expanding Contractions in the reviews
df['text']=df['text'].apply(lambda x:expand_contractions(x))

In [6]:
#remove mentions
df['text'] = df['text'].replace("@[A-Za-z0-9_]+", "", regex=True)

#remove newlines
df['text'] = df['text'].replace("(\r\n|\r|\n)", "", regex=True)

#remove encoding characters
df['text'] = df['text'].replace(r'\b[a-zA-Z]\b', '', regex=True)
df['text'] = df['text'].str.replace(r'[\'\",]*', '',regex=True)

#remove URL's
df['text'] = df['text'].replace(r"(?:\@|http?\://|https?\://|www)\S+", "", regex=True)

#remove emojis
df['text'] = df['text'].replace("[(\U0001F600-\U0001F92F|\U0001F300-\U0001F5FF|\U0001F680-\U0001F6FF|\U0001F190-\U0001F1FF|\U00002702-\U000027B0|\U0001F926-\U0001FA9F|\u200d|\u2640-\u2642|\u2600-\u2B55|\u23cf|\u23e9|\u231a|\ufe0f)]", "", regex=True)

#remove numbers
df['text'] = df['text'].replace("[0-9]", "", regex=True)

# remove punctuation
df['text'] = df['text'].str.replace(r'[^\w\s]',r'',regex=True)

#remove Retweets
df = df[~df['text'].str.startswith('RT')]

#remove spaces at the front and back
df['text'] = df['text'].str.strip()

# remove extra spaces
df['text'] = df['text'].str.replace(r'\s\s+',r' ',regex=True)

In [7]:
df['text'][0:30]

0     in less than days we will hackclimate at looki...
1     the best way to predict the future is to creat...
2     accepting the world for the way it is just mig...
3     landfills are significant sources of methane e...
4     sea level expected to rise by one metre by due...
5     be kind to everything that lives climatechange...
6     extreme storms to multiply intensify across ne...
7     good to your words of wisdom on thedrum tonigh...
8     microgrids allow houses to share energy they h...
9     thank you the endorsement together we can make...
10    arctic wellbeing essential for earth climatech...
11    why is the sun the only really safe nuclear re...
12    scotland yard investigates officers shown danc...
13    the alarm has been sounded is anyone listening...
14    what are some of the many reasons to protect f...
15    often wonder whether we as the people could br...
16    why forests are our planet lungs climatechange...
17    the eu aims to decarbonise the power secto

In [8]:
df.shape

(940183, 10)

In [9]:
#https://xiangyutang2.github.io/tweet-classification/
#drop rows with shorter than 3 words tweets

df['tweet_proc_length'] = [len(text.split(' ')) for text in df['text']]
df = df[df['tweet_proc_length']>3]
df.shape

(919058, 11)

In [10]:
df.isna().sum()

hashtags             212988
favorite_count            0
id                        0
lang                      0
place                625648
retweet_count             0
text                      0
user_location        112038
city                 703567
state                703567
tweet_proc_length         0
dtype: int64

In [11]:
#drop duplicates
df = df.drop_duplicates(subset=['text']).reset_index(drop=True)
df.shape

(610671, 11)

In [12]:
# Lemmatization & Tokenization

import spacy

tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

# Loading model
nlp = spacy.load('en_core_web_sm',disable=['parser', 'ner'])

def lemmatization(text):
    return [lemmatizer.lemmatize(x) for x in tokenizer.tokenize(text)]
    
df['text'] = df['text'].apply(lambda x: ' '.join([token.lemma_ for token in list(nlp(x)) if (token.is_stop == False)]))

In [13]:
df['text']

0         day hackclimate look forward exciting event bl...
1         good way predict future create abraham lincoln...
2         accept world way power stand change want prote...
3         landfill significant source methane emission c...
4         sea level expect rise metre melt ice wwf clima...
                                ...                        
610666    savage energy partner record break fiscal quar...
610667    hard tell snakeoil chemtrail globalwarmingisah...
610668    standard winter hurricane warn part florida yi...
610669    happy new yeaя fan globalwarmingisahoax hoax f...
610670               september buffalo globalwarmingisahoax
Name: text, Length: 610671, dtype: object

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 610671 entries, 0 to 610670
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   hashtags           423604 non-null  object
 1   favorite_count     610671 non-null  int64 
 2   id                 610671 non-null  int64 
 3   lang               610671 non-null  object
 4   place              256219 non-null  object
 5   retweet_count      610671 non-null  int64 
 6   text               610671 non-null  object
 7   user_location      511865 non-null  object
 8   city               169460 non-null  object
 9   state              169460 non-null  object
 10  tweet_proc_length  610671 non-null  int64 
dtypes: int64(4), object(7)
memory usage: 51.2+ MB


## Separate into Training (non-USA) and Prediction (USA) tweets

In [15]:
# extract tweets that have city and state column
usa = df[~df['city'].isnull()]
usa

Unnamed: 0,hashtags,favorite_count,id,lang,place,retweet_count,text,user_location,city,state,tweet_proc_length
441211,climatechange climateaction sustainability fb,0,930123883520413697,en,,0,go warning people climatechange climateaction ...,"Holmdel, NJ",Holmdel,NJ,15
441212,juice reuse reduce actonclimate,1,965587085993365505,en,,1,ll juice left carrot tonight fresh juice morni...,"Miami, FL",Miami,FL,19
441213,FirstNight FirstNightMonterey Mayor Monterey H...,2,947957104576315392,en,,0,bike firstnight firstnightmonterey mayor clyde...,"Monterey County, CA, USA",Monterey County,CA,26
441214,climatechange ClimateChangeIsReal ClimateAction,0,930124070238040066,en,,0,climate fact course warm year concern learn cl...,"Spokane Valley, WA",Spokane Valley,WA,18
441215,ActOnClimate,4,942159912121102336,en,,1,planet great winner world move ahead actonclim...,"Washington, DC",Washington,DC,23
...,...,...,...,...,...,...,...,...,...,...,...
610666,frostythesnowman moistsnowballs globalwarming ...,0,1210060038380740608,en,"Houston, TX",0,savage energy partner record break fiscal quar...,,Houston,TX,23
610667,snakeoil chemtrails GlobalWarmingIsAHoax meteo...,2,996746444211011586,en,"Eugene, OR",0,hard tell snakeoil chemtrail globalwarmingisah...,Lands of the Kalapuya,Eugene,OR,27
610668,yikes everythingisnormal globalwarmingisahoax,0,948537051418214400,en,"Aspen, CO",0,standard winter hurricane warn part florida yi...,Aspen CO,Aspen,CO,13
610669,globalwarmingisahoax hoax fakenews love happyn...,1,947614408108335104,en,"Manhattan, NY",0,happy new yeaя fan globalwarmingisahoax hoax f...,"New York, USA",Manhattan,NY,10


In [None]:
usa.info()

In [17]:
cols = [0,1,3,4,5,7,10]
usa = usa.drop(usa.columns[cols], axis=1, inplace=True)

IndexError: index 4 is out of bounds for axis 0 with size 4

In [18]:
usa = usa.reset_index(drop=True)
usa

Unnamed: 0,id,text,city,state
0,930123883520413697,go warning people climatechange climateaction ...,Holmdel,NJ
1,965587085993365505,ll juice left carrot tonight fresh juice morni...,Miami,FL
2,947957104576315392,bike firstnight firstnightmonterey mayor clyde...,Monterey County,CA
3,930124070238040066,climate fact course warm year concern learn cl...,Spokane Valley,WA
4,942159912121102336,planet great winner world move ahead actonclim...,Washington,DC
...,...,...,...,...
169455,1210060038380740608,savage energy partner record break fiscal quar...,Houston,TX
169456,996746444211011586,hard tell snakeoil chemtrail globalwarmingisah...,Eugene,OR
169457,948537051418214400,standard winter hurricane warn part florida yi...,Aspen,CO
169458,947614408108335104,happy new yeaя fan globalwarmingisahoax hoax f...,Manhattan,NY


In [20]:
# save file

usa.to_pickle("usa_tweets.pkl")

In [19]:
# take them out of the main dataframe
condition = df['id'].isin(usa['id'])
df.drop(df[condition].index, inplace = True)
df

Unnamed: 0,hashtags,favorite_count,id,lang,place,retweet_count,text,user_location,city,state,tweet_proc_length
0,Hack4Climate Blockchain ClimateAction ClimateC...,4,928605716159574016,en,,1,day hackclimate look forward exciting event bl...,"Zürich, Schweiz",,,17
1,climatechange climateaction,1,954288658361802752,en,,1,good way predict future create abraham lincoln...,"Tampere, Finland",,,15
2,protectwhatyoulove climatechangeisreal AnimalR...,0,1012691887440814086,en,,0,accept world way power stand change want prote...,,,,31
3,climatechange climateaction,0,942162022145380352,en,,0,landfill significant source methane emission c...,Globally l Planet Earth,,,9
4,climatechange climateaction,0,954283347399847936,en,,1,sea level expect rise metre melt ice wwf clima...,Online l Globally l Earth,,,16
...,...,...,...,...,...,...,...,...,...,...,...
441206,WarOnError AGW ACC CO2 GlobalWarmingScam Propa...,0,951222777125724160,en,,0,waronerror episode promote illusive fear sell ...,,,,13
441207,GlobalWarmingSCAM,1,945668218496192512,en,,0,new study find cosmic ray solar activity great...,"Acapulco, Mexico",,,19
441208,GlobalWarming ClimateChange ClimateScam Global...,0,953134860922744832,en,,0,global warming expert think melt ice cause sea...,,,,19
441209,blizzard2018 noreaster GlobalWarmingScam,0,948942488751886336,en,,0,blizzard noreaster oh day al gore democrats gl...,USA,,,22


In [21]:
# Drop NA's in hashtags column

df = df.dropna(how='any', subset=['hashtags'])
df.shape

(339349, 11)

In [22]:
# Turn hashtags into lists with all lowercase

df['hashtags'] = df['hashtags'].str.lower()
df['hashtags'] = df.apply(lambda row:  row['hashtags'].replace(' ', ',').split(','), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hashtags'] = df['hashtags'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hashtags'] = df.apply(lambda row:  row['hashtags'].replace(' ', ',').split(','), axis=1)


In [23]:
# Create a list of all hashtags

hashtags_all = []
for row in df['hashtags']:
    for item in row:
        hashtags_all.append(item.lower())
        
# see hashtag count and sort by frequency
unique_hashtags = list(pd.Series(hashtags_all).value_counts().index)
# view top 5 for reference
print(unique_hashtags[:5])

['climatechange', 'climateaction', 'actonclimate', 'climatechangeisreal', 'climatehoax']


In [24]:
# Believer Hashtags
believer_tags = ['climatechangeisreal', 'actonclimate', 'extinctionrebellion', 'climateemergency', 
                 'climateactionnow', 'capitalism', 'public_health', 'climateaction', 'humanityextinction',
                 'activism', 'noplanetb', 'savetheplanet', 'climateaction']

# Denier Hashtags
denier_tags = ['climatechangeisfalse', 'climatechangenotreal', 'climatechangehoax', 
               'globalwarminghoax', 'tcot', 'ccot', 'tlot', 'pjnet', 'rednationrising', 'votered', 
               'libtard', 'libtards', 'maga', 'climatedeniers', 'climatehoax', 'globalcooling',
              'climatechangescam', 'climatehysteria', 'globalwarmingisahoax', 'globalwarmingscam', 'globalcooling']

In [25]:
# assign believer status to each row in the dataset
believe_series = []

for idx, row in df['hashtags'].iteritems():
    # set a count for matching tags in row
    believe = 0
    deny = 0 
    for tag in row:
        if tag.lower() in denier_tags:
            deny += 1
        elif tag.lower() in believer_tags:
            believe += 1
    
    # check that only one type of tag appears
    if (believe > 0) and (deny == 0):
        believe_series.append(1)
    elif (believe == 0) and (deny > 0):
        believe_series.append(0)
    else: # <-- if no matching tag OR if both appear, label as unknown using nonetype 
        believe_series.append(None)

In [26]:
# create target column

df = df.assign(target = believe_series)

# Remove NA's

df_believer_status = df.dropna(subset=['target'])

# subset dataframe to only include positive and negative cases

df = df.dropna(subset = ['target'])

df['target'].value_counts()

1.0    202518
0.0     74537
Name: target, dtype: int64

In [27]:
# Turn target column into target labels of 0 and 1

df['target'] = df['target'].astype(int)

In [28]:
df['target'].value_counts()

1    202518
0     74537
Name: target, dtype: int64

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 277055 entries, 0 to 441210
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   hashtags           277055 non-null  object
 1   favorite_count     277055 non-null  int64 
 2   id                 277055 non-null  int64 
 3   lang               277055 non-null  object
 4   place              7780 non-null    object
 5   retweet_count      277055 non-null  int64 
 6   text               277055 non-null  object
 7   user_location      219182 non-null  object
 8   city               0 non-null       object
 9   state              0 non-null       object
 10  tweet_proc_length  277055 non-null  int64 
 11  target             277055 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 27.5+ MB


In [30]:
cols = [0,1,3,4,5,7,8,9,10]
df.drop(df.columns[cols], axis=1, inplace=True)

In [31]:
df

Unnamed: 0,id,text,target
0,928605716159574016,day hackclimate look forward exciting event bl...,1
1,954288658361802752,good way predict future create abraham lincoln...,1
2,1012691887440814086,accept world way power stand change want prote...,1
3,942162022145380352,landfill significant source methane emission c...,1
4,954283347399847936,sea level expect rise metre melt ice wwf clima...,1
...,...,...,...
441206,951222777125724160,waronerror episode promote illusive fear sell ...,0
441207,945668218496192512,new study find cosmic ray solar activity great...,0
441208,953134860922744832,global warming expert think melt ice cause sea...,0
441209,948942488751886336,blizzard noreaster oh day al gore democrats gl...,0


# Undersampling for Class Imbalance

In [32]:
# Separating the independent variables from dependent variables
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [33]:
X

Unnamed: 0,id,text
0,928605716159574016,day hackclimate look forward exciting event bl...
1,954288658361802752,good way predict future create abraham lincoln...
2,1012691887440814086,accept world way power stand change want prote...
3,942162022145380352,landfill significant source methane emission c...
4,954283347399847936,sea level expect rise metre melt ice wwf clima...
...,...,...
441206,951222777125724160,waronerror episode promote illusive fear sell ...
441207,945668218496192512,new study find cosmic ray solar activity great...
441208,953134860922744832,global warming expert think melt ice cause sea...
441209,948942488751886336,blizzard noreaster oh day al gore democrats gl...


In [34]:
y

0         1
1         1
2         1
3         1
4         1
         ..
441206    0
441207    0
441208    0
441209    0
441210    0
Name: target, Length: 277055, dtype: int64

In [35]:
# Random Undersampling

# summarize class distribution
print("Before undersampling: ", Counter(y))

# define undersampling strategy
undersample = RandomUnderSampler(sampling_strategy=0.7)

# fit and apply the transform
X_under, y_under = undersample.fit_resample(X, y)

# summarize class distribution
print("After undersampling: ", Counter(y_under))

Before undersampling:  Counter({1: 202518, 0: 74537})
After undersampling:  Counter({1: 106481, 0: 74537})


In [36]:
believer_percent = round(106481 * 100 / 181018,1)
denier_percent = round(74537 * 100 / 181018,1)

print('class distribution:')
print('  believers:', believer_percent,'%')
print('  deniers:', denier_percent,'%')

class distribution:
  believers: 58.8 %
  deniers: 41.2 %


In [37]:
df = pd.concat([X_under, y_under],axis = 1, join = 'outer', 
                 ignore_index=False, sort=False)

In [38]:
df['target'].value_counts()

1    106481
0     74537
Name: target, dtype: int64

In [39]:
# save processed dataset

df.to_pickle("../data/cleaned_lemmatized.pkl")