In [None]:
# packages

# standard numerical
import numpy as np
import pandas as pd

# visualisation
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline
from wordcloud import WordCloud, ImageColorGenerator

# to process text
import re 
import nltk
import json
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import operator
import datetime
import unidecode
from pprint import pprint
from collections import defaultdict
import collections
import os

# to carry out topic modeling
import gensim
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import ldamodel as lda

# to carry out sentiment analysis
nltk.download("vader_lexicon")
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# To install a package that is missing from your environment, uncomment the two lines below and run this cell

# import sys
# !{sys.executable} -m pip install gensim

In [None]:
# Load the dataset. "Class4.csv" is read from the current working directory
# (course file available from Moodle; on Google Colab it has to be uploaded to the session first).
# The file is semicolon-delimited and its 'id' column becomes the DataFrame index.

data = pd.read_csv("Class4.csv", sep=';').set_index('id')

# Preview the first rows to check that the file was parsed as expected
data.head()

In [None]:
# Inspect the data: summary of the index, the columns, their non-null counts and dtypes

data.info()

In [None]:
# Extract the 'text' column as a plain Python list, which is easier to iterate over
# in the pre-processing steps that follow

data1 = data['text'].values.tolist()

# Note: 'pprint' ("pretty print") is a more readable version of print;
# only the first three entries are shown to keep the output short

pprint(data1[:3])

In [None]:
# Basic text pre-processing.
# This block works with any list of raw texts and is a recommended starting point
# for cleaning text before tokenisation.

# A timer wraps the loop to give an idea of how long the processing takes;
# the elapsed time is printed once the loop has finished.

start = datetime.datetime.now()

data2 = []

for tweet in data1:

    # Replace every non-word character (punctuation, symbols) with a space.
    # str() keeps the loop from failing on non-string entries (e.g. a missing value).
    p_data = re.sub(r'\W', ' ', str(tweet))
    # Transliterate the remaining non-ASCII characters to their closest ASCII form
    p_data = unidecode.unidecode(p_data)

    # Remove single letters that are surrounded by whitespace
    p_data = re.sub(r'\s+[a-zA-Z]\s+', ' ', p_data)
    # Remove a single letter at the very start of the text.
    # The caret has to be unescaped to anchor at the start of the string: the previous
    # pattern r'\^...' looked for a literal '^', which cannot occur after the \W
    # substitution above, so this step never matched anything.
    p_data = re.sub(r'^[a-zA-Z]\s+', ' ', p_data)

    # Remove all numbers
    p_data = re.sub(r'\d+', '', p_data)

    # Collapse runs of whitespace into a single space
    p_data = re.sub(r'\s+', ' ', p_data)

    # Remove a prefixed 'b' (presumably a leftover b'...' bytes-literal marker)
    p_data = re.sub(r'^b\s+', '', p_data)

    data2.append(p_data)


print('Basic pre-processing of dataset took %s' % str(datetime.datetime.now() - start))

pprint(data2[:3])

In [None]:
# Pre-processing: Change tweets to bag-of-words

def sent_to_words(tweets):
    """Lazily tokenise each tweet into a list of lowercase word tokens.

    Every item of ``tweets`` is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess``; one token list is yielded per item,
    in input order.
    """
    for raw_tweet in tweets:
        # deacc=True strips accent marks from the tokens
        tokens = gensim.utils.simple_preprocess(str(raw_tweet), deacc=True)
        yield tokens

# Materialise the generator: one list of tokens per cleaned tweet
data_words = list(sent_to_words(data2))

# Total number of tokens over the whole collection
count = sum(map(len, data_words))
print('Total number of terms across all tweets is: ', count)

# Show the tokens of the first three tweets as a sanity check
print(data_words[0:3])

In [None]:
# Remove stopwords: build the stopword list

start = datetime.datetime.now()

nltk.download('stopwords')

# NLTK's English stopwords (about 180 words) followed by personal stopwords
# chosen by inspecting the text
stop_words = stopwords.words('english') + [
    'rt', 'gt', 'group', 'june', 'years', 'right', 'another', 'emini', 'say', 'gnus',
    'join', 'link', 'nq', 'agnes', 'per', 'lakshmi_',
]

# define a function to remove stopwords

def remove_stopwords(texts):
    """Return the documents in ``texts`` with every stopword removed.

    Each document is converted with ``str()`` and re-tokenised with
    ``simple_preprocess`` before filtering, so both raw strings and lists of
    tokens are accepted.

    Parameters
    ----------
    texts : iterable
        Documents to filter.

    Returns
    -------
    list of list of str
        One token list per input document, in input order, without any token
        that appears in the module-level ``stop_words``.
    """
    # Set membership is constant-time, whereas scanning the stop_words list for
    # every token is linear in its length. The set is built on each call so that
    # later edits to stop_words are still honoured.
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in blocked]
            for doc in texts]

# Apply the stopword filter to the tokenised tweets

data_words_nostops = remove_stopwords(data_words)

# Elapsed time since the timer was started at the top of this cell
print('Remove stopwords took %s' % str(datetime.datetime.now() - start))

# Token total after filtering, for comparison with the earlier count
count = sum(map(len, data_words_nostops))
print('Total number of terms across all tweets is: ', count)

print(data_words_nostops[0:3])

In [None]:
# Build the document-term representation (bag-of-words corpus)

# Dictionary: maps every distinct token to an integer id
data_dict = Dictionary(data_words_nostops)
print("Length of initial dictionary is: ", len(data_dict))

# Drop tokens found in fewer than 5 documents or in more than 20% of all documents
data_dict.filter_extremes(no_above=0.2, no_below=5)
print("Length of reduced dictionary is: ", len(data_dict))

# Then drop the 100 most frequent tokens that are still left
data_dict.filter_n_most_frequent(100)
print("Length of reduced dictionary is: ", len(data_dict))

# Texts used for the corpus (shallow copy of the filtered token lists)
data_texts = list(data_words_nostops)

# Term-document frequency: one bag-of-words vector per text
corpus = list(map(data_dict.doc2bow, data_texts))

In [None]:
# Find low-term documents and remove them (a tweet must keep at least two distinct terms).
# Tweets are related to each other through the terms they share, so each one needs enough terms.

# Convert the corpus to a dense numpy matrix (one row per dictionary term, one column per document).
# num_terms is taken from the dictionary itself so that it stays correct whenever the
# filtering in the previous cell changes the vocabulary size.
numpy_matrix_eg = gensim.matutils.corpus2dense(corpus, num_terms=len(data_dict))

# Tally the word count per document (total count -> number of documents); kept for inspection
column_sums = numpy_matrix_eg.sum(axis=0).tolist()
counter = collections.Counter(column_sums)
od = collections.OrderedDict(sorted(counter.items()))

# Create a filtered matrix that keeps only the documents (columns) with at least two distinct terms
n_matrix_eg2 = numpy_matrix_eg[:, (numpy_matrix_eg != 0).sum(axis=0) > 1]

# Same tally on the filtered matrix, to check that the short documents are gone
column_sums2 = n_matrix_eg2.sum(axis=0).tolist()
counter2 = collections.Counter(column_sums2)
od2 = collections.OrderedDict(sorted(counter2.items()))

# Convert the matrix back to a corpus
corpus_eg = gensim.matutils.Dense2Corpus(n_matrix_eg2)

# Compare document counts before and after filtering
print(len(corpus)) # original corpus
print(len(corpus_eg)) # new corpus

# you don't want to lose more than about 10% of documents

In [None]:
# A basic LDA test model

# The key choice is the number of topics that best describes the documents;
# 10 is an initial guess

start = datetime.datetime.now()

lda10 = lda.LdaModel(corpus_eg, num_topics=10, id2word=data_dict,
                     random_state=20, eval_every=None)


print('Run single LDA model took %s' % str(datetime.datetime.now() - start))

# Show Topics
pprint(lda10.show_topics(formatted=False))

# Compute Coherence Score
# a statistical estimate of whether a human reader would consider the topics to be 'coherent'
# varies between 0 (no coherence) and 1 (fully coherent)

# Restart the timer so the duration printed below covers the coherence calculation only,
# not the model training and topic printing above
start = datetime.datetime.now()

coherence_model_lda10 = CoherenceModel(model=lda10, texts=data_texts,
                                       dictionary=data_dict, coherence='c_v')
coherence_lda10 = coherence_model_lda10.get_coherence()

print('Calculate coherence score took %s' % str(datetime.datetime.now() - start))

print('\nCoherence Score: ', coherence_lda10)

In [20]:
# Finding the best LDA Model

# Start the wall-clock timer for the grid search; the elapsed time is printed
# after the compute_coherence_values call further down in this cell
start = datetime.datetime.now()

def compute_coherence_values(dictionary, corpus, texts, id2word, limit, start=2, step=2,
                             alphas=(0.1, 1, 5, 10), random_states=(10, 15, 20)):
    """
    Train one LDA model per (num_topics, alpha, random_state) combination and
    compute its c_v coherence.

    Parameters:
    ----------
    dictionary : Dictionary passed to the coherence model
    corpus : Corpus the LDA models are trained on
    texts : List of input texts passed to the coherence model
    id2word : Dictionary passed to the LDA model
    limit : Upper bound (exclusive) on the number of topics
    start : First number of topics to try
    step : Increment between successive numbers of topics
    alphas : Values of the LDA ``alpha`` argument to try
    random_states : Random seeds to try

    Returns:
    -------
    model_list : List of LDA topic models, in the order they were trained
    coherence_values : Coherence value of each model, aligned with model_list
    output_list : One dict per model with the keys 'alpha', 'random_state',
    'coherence' and 'num_topics', aligned with model_list
    """
    coherence_values = []
    model_list = []
    output_list = []
    for num_topics in range(start, limit, step):
        for a in alphas:
            for r in random_states:
                model = lda.LdaModel(corpus=corpus, num_topics=num_topics, id2word=id2word,
                                     random_state=r, alpha=a, eval_every=None)
                model_list.append(model)
                coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary,
                                                coherence='c_v')
                # Score the model once and reuse the value; get_coherence() used to be
                # called a second time just to fill the summary dict
                coherence = coherencemodel.get_coherence()
                coherence_values.append(coherence)
                output_list.append({'alpha': a, 'random_state': r,
                                    'coherence': coherence, 'num_topics': num_topics})

    return model_list, coherence_values, output_list

# Grid search over 6, 9, ..., 21 topics (limit is exclusive) on the filtered corpus
model_list, coherence_values, output_list = compute_coherence_values(
    dictionary=data_dict,
    id2word=data_dict,
    corpus=corpus_eg,
    texts=data_texts,
    limit=24,
    start=6,
    step=3,
)

print('Calculate multiple coherence scores took %s' % str(datetime.datetime.now() - start))

# Rank the runs from highest to lowest coherence
output_list_descending = sorted(output_list, key=operator.itemgetter('coherence'), reverse=True)
pprint(output_list_descending)

Calculate multiple coherence scores took 0:08:12.463445
[{'alpha': 1,
  'coherence': 0.5957766550086534,
  'num_topics': 21,
  'random_state': 20},
 {'alpha': 1,
  'coherence': 0.5929419545833838,
  'num_topics': 21,
  'random_state': 10},
 {'alpha': 0.1,
  'coherence': 0.5799671781774622,
  'num_topics': 15,
  'random_state': 10},
 {'alpha': 1,
  'coherence': 0.579236604533144,
  'num_topics': 21,
  'random_state': 15},
 {'alpha': 1,
  'coherence': 0.5783097427524253,
  'num_topics': 18,
  'random_state': 10},
 {'alpha': 5,
  'coherence': 0.5777347382355575,
  'num_topics': 21,
  'random_state': 10},
 {'alpha': 10,
  'coherence': 0.575042502052432,
  'num_topics': 21,
  'random_state': 10},
 {'alpha': 5,
  'coherence': 0.5729505828084405,
  'num_topics': 21,
  'random_state': 15},
 {'alpha': 5,
  'coherence': 0.5727892239830569,
  'num_topics': 6,
  'random_state': 20},
 {'alpha': 10,
  'coherence': 0.5727892239830569,
  'num_topics': 6,
  'random_state': 20},
 {'alpha': 1,
  'coheren

In [None]:
# Best LDA model: retrain with the top-ranked settings from the grid search
# (21 topics, alpha=1, random_state=20)

start = datetime.datetime.now()

lda21 = lda.LdaModel(corpus_eg, num_topics=21, alpha=1, id2word=data_dict,
                     random_state=20, eval_every=None)


print('Run best LDA model took %s' % str(datetime.datetime.now() - start))

# Show Topics
pprint(lda21.show_topics(formatted=False))

# Compute Coherence Score
# a statistical estimate of whether a human reader would consider the topics to be 'coherent'
# varies between 0 (no coherence) and 1 (fully coherent)

# Restart the timer so the duration printed below covers the coherence calculation only,
# not the model training and topic printing above
start = datetime.datetime.now()

coherence_model_lda21 = CoherenceModel(model=lda21, texts=data_texts,
                                       dictionary=data_dict, coherence='c_v')
coherence_lda21 = coherence_model_lda21.get_coherence()

print('Calculate coherence score took %s' % str(datetime.datetime.now() - start))

print('\nCoherence Score: ', coherence_lda21)

In [None]:
# Sentiment analysis

# Create the VADER analyser once; it is reused by the cells below.
# It relies on the "vader_lexicon" resource downloaded in the imports cell.
sent_analyzer = SentimentIntensityAnalyzer()


In [None]:
# Test the analyser on a single hand-written sentence.
# The printed result includes a 'compound' entry, which is the score thresholded
# into positive / neutral / negative later in the notebook.

bayes = "ugh. This is too much machine learning in a single day. Please make it stop!!!"
print(sent_analyzer.polarity_scores(bayes))

In [None]:
# Turn our list of cleaned tweets back into a dataframe with a single 'tweet_text' column.
# The new frame gets a default integer index, not the 'id' index of `data`.
# NOTE(review): data2 had every non-word character replaced by a space during pre-processing,
# so punctuation and emoticons are gone by the time the text is scored. VADER's documentation
# describes these as sentiment cues - consider scoring the raw text instead (to be confirmed).

sent_df = pd.DataFrame(data2)
sent_df.columns =['tweet_text']
sent_df.head()

In [None]:
# Apply sentiment to the dataset
def polarity(data):
  """Translate a score dictionary into a sentiment label.

  Reads ``data['compound']`` and returns "positive" when it is at least 0.05,
  "negative" when it is at most -0.05, and "neutral" otherwise.
  """
  compound = data['compound']
  if compound >= 0.05:
    return "positive"
  if compound <= -0.05:
    return "negative"
  return "neutral"


def predict_sentiment(text):
  """Score ``text`` with the shared analyser and return its sentiment label.

  The scores come from ``sent_analyzer.polarity_scores`` and are converted to
  "positive", "negative" or "neutral" by ``polarity``.
  """
  return polarity(sent_analyzer.polarity_scores(text))


# Label every tweet: predict_sentiment is called once per row of 'tweet_text'
sent_df["sent_prediction"] = sent_df["tweet_text"].map(predict_sentiment)

# Preview the text next to its predicted label
sent_df.head()

In [None]:
# Number of tweets per predicted sentiment label
sent_df['sent_prediction'].value_counts()