In [1]:
import numpy as np
import os
import pandas as pd
import re
import string
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer as sia
from wordcloud import WordCloud,STOPWORDS
from nltk.util import ngrams
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import spacy as sp

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/nszoni/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Import data

In [2]:
rc = pd.read_csv('../data/reddit_ct.csv')
rc.head()

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp
0,What Shape is The Earth? PYRAMID!,1,b94br6,https://www.youtube.com/watch?v=DmMl7xwqTWA,0,1554329000.0,,2019-04-04 00:58:13
1,"Bitcoin & XRP Conspiracy, CIA & New World Orde...",1,a6ly3q,https://www.youtube.com/attribution_link?a=YdV...,0,1544932000.0,,2018-12-16 05:47:19
2,Is this a thing yet?,2,bbrgsd,https://www.reddit.com/r/ConspiracyTheory/comm...,0,1554931000.0,[https://sputniknews.com/science/2019041010740...,2019-04-11 00:13:57
3,Comment,1,du4rdjb,,0,1518452000.0,"In a Swedish city, the same pattern: \n""Truth ...",2018-02-12 18:06:02
4,"Jimmy Fallon might be in the closet gay, or Bi...",13,f2r2s5,https://www.reddit.com/r/ConspiracyTheory/comm...,21,1581513000.0,Recently me and some of my friends have been n...,2020-02-12 15:17:50


## Text Processing

In [12]:
# Separate to title and body
title_data = rc[['title','timestamp']].copy()
body_data = rc[['body','timestamp']].copy()
body_data = body_data.dropna()
title_data = title_data.dropna()

#Cast timestamp to date
title_data.timestamp = pd.to_datetime(title_data.timestamp).dt.date
body_data.timestamp = pd.to_datetime(body_data.timestamp).dt.date

In [22]:
lemmatizer = WordNetLemmatizer()

def text_preprocess_lemmatize(text: str) -> str:
    
    '''
    - remove handlers (eg. @username)
    - remove urls
    - remove any non-word elements (inc. punctuation)
    - remove single letters
    - remove digits
    - replace multiple spaces with a single space
    - tokenize text
    - filter out stopwords
    - lemmatize tokens
    - filter out tokens with less than 3 characters
    '''
    garbage = re.compile('@[^\s]+|http\S+|\W|\s+[a-zA-Z]\s+|\d+|\s+')
    
    text = text.lower()
    text= re.sub(garbage,' ', text)
    
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in set(stopwords.words('english'))]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    tokens = [word for word in tokens if len(word)>=3]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text 

In [13]:
body_data.head()

Unnamed: 0,body,timestamp
2,[https://sputniknews.com/science/2019041010740...,2019-04-11
3,"In a Swedish city, the same pattern: \n""Truth ...",2018-02-12
4,Recently me and some of my friends have been n...,2020-02-12
6,"I have marked a rudimentary chart, each new in...",2020-02-08
12,[https://www.youtube.com/watch?v=Jx9VWZdryjE]...,2020-01-29


In [24]:
body_data['cleaned'] = body_data['body'].apply(text_preprocess_lemmatize)

In [39]:
body_data.head()

Unnamed: 0,body,timestamp,cleaned,sentiments,positive,neutral,negative
3,"In a Swedish city, the same pattern: \n""Truth ...",2018-02-12,swedish city pattern truth freemason call shot...,"{'neg': 0.144, 'neu': 0.682, 'pos': 0.174, 'co...",0.174001,0.682001,0.144001
4,Recently me and some of my friends have been n...,2020-02-12,recently friend noticing jimmy fallon look men...,"{'neg': 0.0, 'neu': 0.606, 'pos': 0.394, 'comp...",0.394001,0.606001,1e-06
6,"I have marked a rudimentary chart, each new in...",2020-02-08,marked rudimentary chart new infection date on...,"{'neg': 0.067, 'neu': 0.904, 'pos': 0.029, 'co...",0.029001,0.904001,0.067001
16,Anyone have any theories they’d wanna propose ...,2020-01-27,anyone theory wan propose regarding outbreak p...,"{'neg': 0.261, 'neu': 0.739, 'pos': 0.0, 'comp...",1e-06,0.739001,0.261001
17,Am I the only one wondering if the Coronavirus...,2020-01-25,one wondering coronavirus china biological lau...,"{'neg': 0.328, 'neu': 0.672, 'pos': 0.0, 'comp...",1e-06,0.672001,0.328001


In [38]:
#drop rows where cleaned is null
body_data['cleaned'].replace('', np.nan, inplace=True)
body_data = body_data[body_data['cleaned'].notna()]

## Feature Engineering

In [51]:
# Apply SIA from Vader package

#TODO: do it on raw text
sid = sia()
body_data['sentiments'] = body_data['body'].apply(lambda x: sid.polarity_scores(' '.join(re.findall(r'\w+',x.lower()))))

In [52]:
body_data.head()

Unnamed: 0,body,timestamp,cleaned,sentiments,positive,neutral,negative,num_words,num_stopwords,avg_word_length,compound
3,"In a Swedish city, the same pattern: \n""Truth ...",2018-02-12,swedish city pattern truth freemason call shot...,"{'neg': 0.073, 'neu': 0.838, 'pos': 0.089, 'co...",0.174,0.682,0.144,63,27,9.111111,0.296
4,Recently me and some of my friends have been n...,2020-02-12,recently friend noticing jimmy fallon look men...,"{'neg': 0.018, 'neu': 0.788, 'pos': 0.194, 'co...",0.394,0.606,0.0,97,48,5.244898,0.9661
6,"I have marked a rudimentary chart, each new in...",2020-02-08,marked rudimentary chart new infection date on...,"{'neg': 0.051, 'neu': 0.935, 'pos': 0.014, 'co...",0.029,0.904,0.067,1192,184,1.719246,-0.8555
16,Anyone have any theories they’d wanna propose ...,2020-01-27,anyone theory wan propose regarding outbreak p...,"{'neg': 0.145, 'neu': 0.855, 'pos': 0.0, 'comp...",0.0,0.739,0.261,35,11,6.291667,-0.6124
17,Am I the only one wondering if the Coronavirus...,2020-01-25,one wondering coronavirus china biological lau...,"{'neg': 0.163, 'neu': 0.837, 'pos': 0.0, 'comp...",0.0,0.672,0.328,22,10,5.083333,-0.5994


In [53]:
# Parse out dict elements

body_data['positive'] = body_data['sentiments'].apply(lambda x: x['pos']) 
body_data['neutral'] = body_data['sentiments'].apply(lambda x: x['neu'])
body_data['negative'] = body_data['sentiments'].apply(lambda x: x['neg'])
body_data['compound'] = body_data['sentiments'].apply(lambda x: x['compound'])

In [54]:
body_data.head()

Unnamed: 0,body,timestamp,cleaned,sentiments,positive,neutral,negative,num_words,num_stopwords,avg_word_length,compound
3,"In a Swedish city, the same pattern: \n""Truth ...",2018-02-12,swedish city pattern truth freemason call shot...,"{'neg': 0.073, 'neu': 0.838, 'pos': 0.089, 'co...",0.089,0.838,0.073,63,27,9.111111,0.296
4,Recently me and some of my friends have been n...,2020-02-12,recently friend noticing jimmy fallon look men...,"{'neg': 0.018, 'neu': 0.788, 'pos': 0.194, 'co...",0.194,0.788,0.018,97,48,5.244898,0.9598
6,"I have marked a rudimentary chart, each new in...",2020-02-08,marked rudimentary chart new infection date on...,"{'neg': 0.051, 'neu': 0.935, 'pos': 0.014, 'co...",0.014,0.935,0.051,1192,184,1.719246,-0.9451
16,Anyone have any theories they’d wanna propose ...,2020-01-27,anyone theory wan propose regarding outbreak p...,"{'neg': 0.145, 'neu': 0.855, 'pos': 0.0, 'comp...",0.0,0.855,0.145,35,11,6.291667,-0.4404
17,Am I the only one wondering if the Coronavirus...,2020-01-25,one wondering coronavirus china biological lau...,"{'neg': 0.163, 'neu': 0.837, 'pos': 0.0, 'comp...",0.0,0.837,0.163,22,10,5.083333,-0.5994


We will consider posts with a compound value greater than 0.2 as positive and less than -0.2 as negative. There's some testing and experimentation that goes with choosing these ranges, and there is a trade-off to be made here. If you choose a higher value, you might get more compact results (less false positives and false negatives), but the size of the results will decrease significantly.

Let's create a positive label of 1 if the compound is greater than 0.2, and a label of -1 if compound is less than -0.2. Everything else will be 0.

In [55]:
THRESHOLD = 0.2

conditions = [
    (body_data['compound'] <= -THRESHOLD),
    (body_data['compound'] > -THRESHOLD) & (body_data['compound'] < THRESHOLD),
    (body_data['compound'] >= THRESHOLD),
    ]

values = ["neg", "neu", "pos"]
body_data['label'] = np.select(conditions, values)

In [56]:
body_data.head()

Unnamed: 0,body,timestamp,cleaned,sentiments,positive,neutral,negative,num_words,num_stopwords,avg_word_length,compound,label
3,"In a Swedish city, the same pattern: \n""Truth ...",2018-02-12,swedish city pattern truth freemason call shot...,"{'neg': 0.073, 'neu': 0.838, 'pos': 0.089, 'co...",0.089,0.838,0.073,63,27,9.111111,0.296,pos
4,Recently me and some of my friends have been n...,2020-02-12,recently friend noticing jimmy fallon look men...,"{'neg': 0.018, 'neu': 0.788, 'pos': 0.194, 'co...",0.194,0.788,0.018,97,48,5.244898,0.9598,pos
6,"I have marked a rudimentary chart, each new in...",2020-02-08,marked rudimentary chart new infection date on...,"{'neg': 0.051, 'neu': 0.935, 'pos': 0.014, 'co...",0.014,0.935,0.051,1192,184,1.719246,-0.9451,neg
16,Anyone have any theories they’d wanna propose ...,2020-01-27,anyone theory wan propose regarding outbreak p...,"{'neg': 0.145, 'neu': 0.855, 'pos': 0.0, 'comp...",0.0,0.855,0.145,35,11,6.291667,-0.4404,neg
17,Am I the only one wondering if the Coronavirus...,2020-01-25,one wondering coronavirus china biological lau...,"{'neg': 0.163, 'neu': 0.837, 'pos': 0.0, 'comp...",0.0,0.837,0.163,22,10,5.083333,-0.5994,neg


### Feature Extraction

In [47]:
body_data['num_words'] = body_data['body'].apply(lambda x: len(x.split(' ')))
body_data['num_stopwords'] = body_data['body'].apply(lambda x: len([word for word in x.split(' ') if word in list(STOPWORDS)]))
body_data['avg_word_length'] = body_data['body'].apply(lambda x: np.mean(np.array([len(va) for va in x.split(' ') if va not in list(STOPWORDS)])))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  body_data['num_words'] = body_data['body'].apply(lambda x: len(x.split(' ')))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  body_data['num_stopwords'] = body_data['body'].apply(lambda x: len([word for word in x.split(' ') if word in list(STOPWORDS)]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
 

In [48]:
body_data.head()

Unnamed: 0,body,timestamp,cleaned,sentiments,positive,neutral,negative,num_words,num_stopwords,avg_word_length
3,"In a Swedish city, the same pattern: \n""Truth ...",2018-02-12,swedish city pattern truth freemason call shot...,"{'neg': 0.144, 'neu': 0.682, 'pos': 0.174, 'co...",0.174,0.682,0.144,63,27,9.111111
4,Recently me and some of my friends have been n...,2020-02-12,recently friend noticing jimmy fallon look men...,"{'neg': 0.0, 'neu': 0.606, 'pos': 0.394, 'comp...",0.394,0.606,0.0,97,48,5.244898
6,"I have marked a rudimentary chart, each new in...",2020-02-08,marked rudimentary chart new infection date on...,"{'neg': 0.067, 'neu': 0.904, 'pos': 0.029, 'co...",0.029,0.904,0.067,1192,184,1.719246
16,Anyone have any theories they’d wanna propose ...,2020-01-27,anyone theory wan propose regarding outbreak p...,"{'neg': 0.261, 'neu': 0.739, 'pos': 0.0, 'comp...",0.0,0.739,0.261,35,11,6.291667
17,Am I the only one wondering if the Coronavirus...,2020-01-25,one wondering coronavirus china biological lau...,"{'neg': 0.328, 'neu': 0.672, 'pos': 0.0, 'comp...",0.0,0.672,0.328,22,10,5.083333


## Sentiment Analysis

## Topic Modelling