In [35]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import random
import nltk
import re
import string
from bs4 import BeautifulSoup

In [60]:
data = pd.read_csv('AfterRona.csv')
data.columns

Index(['Date', 'Text', 'Unnamed: 2', 'Unnamed: 3'], dtype='object')

In [61]:
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3'])
data.head()

Unnamed: 0,Date,Text
0,2020-05-29 05:29:40,"b'Today is tomorrow\xe2\x80\x99s history, wond..."
1,2020-05-29 01:29:43,"b""Can't wait to see other tech that'll come up..."
2,2020-05-28 23:07:14,"b""RT @Silvia_Wangeci: A woman shows her middle..."
3,2020-05-28 21:05:02,b'Pigs freak me out but I\xe2\x80\x99m still d...
4,2020-05-28 20:29:12,b'RT @Bernardkiprop3: #KOT Help RT this until ...


In [62]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2641 entries, 0 to 2640
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    2641 non-null   object
 1   Text    2641 non-null   object
dtypes: object(2)
memory usage: 41.4+ KB


Data Preprocessing

In [63]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [64]:
space_replace = re.compile('[/(){}\[\]\|@,;]')
bad_symbols = re.compile('[^0-9a-z #+_]')
stopwords = ['brt', 'rt']
urls = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
            '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' 'rt')
usernames = re.compile("@[A-Za-z0-9]+")
def text_cleaning(text):
    text = usernames.sub(' ', text)#removing usernames
    text = BeautifulSoup(text, "lxml").text #removing any html decoding
    text = text.lower() #removing capitalization
    text = space_replace.sub(' ', text)#replacing symbols with a space
    text = bad_symbols.sub('', text) #deleting symbols from the text
    text = ' '.join(word for word in text.split() if word not in stopwords) #removing stopwords
    text = urls.sub('', text)#removing urls
    text  = "".join([char for char in text if char not in string.punctuation])
    return text
data['cleaned_text'] = data['Text'].apply(text_cleaning)

In [65]:
data.head()

Unnamed: 0,Date,Text,cleaned_text
0,2020-05-29 05:29:40,"b'Today is tomorrow\xe2\x80\x99s history, wond...",btoday is tomorrowxe2x80x99s history wondering...
1,2020-05-29 01:29:43,"b""Can't wait to see other tech that'll come up...",bcant wait to see other tech thatll come up af...
2,2020-05-28 23:07:14,"b""RT @Silvia_Wangeci: A woman shows her middle...",wangeci a woman shows her middle finger to tru...
3,2020-05-28 21:05:02,b'Pigs freak me out but I\xe2\x80\x99m still d...,bpigs freak me out but ixe2x80x99m still down ...
4,2020-05-28 20:29:12,b'RT @Bernardkiprop3: #KOT Help RT this until ...,kot help this until recognises this beautiful ...


# CREATING A VOCUBLARY FROM THE DATA USING COUNT VECTORIZER

In [66]:
count_vect = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')
term_matrix =  count_vect.fit_transform(data['cleaned_text'].values.astype('U')) #including words that occur less than 80% of the time in the document
'''stop words have also been removed since they barely contribute significantly to the vocabulary'''
term_matrix

<2641x1307 sparse matrix of type '<class 'numpy.int64'>'
	with 25034 stored elements in Compressed Sparse Row format>

#every word in the document is represented by a 1307 dimensional vector. i.e we have a vocabulary of 1307 words


In [69]:
#we now use LDA to create topics based on the probability of each word in the document
lda = LatentDirichletAllocation(n_components=5, random_state=42) #we set n = 5 as our initial guess of topics in the data
lda.fit(term_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [70]:
#top 50 words in the vocubulary
for i in range(51):
    random_word = random.randint(0, len(count_vect.get_feature_names()))
    print(count_vect.get_feature_names()[random_word])

despite
spread
girlnafterrona
delegationxf0x9fx92xa5xf0x9fx92xa5xf0x9fx92xa5nnpogbabrunomarcus
bme
start
officers
washing
ntiktok
bafter
club
son
globalsundaysexe2x80xa6
better
nini
breathtaking
niggas
forge
startxe2x80xa6
weaknessxe2x80xa6
reasons
retweet
virus
xe2x99xa5xefxb8x8f
mother
stayed
ounce
4songs
coward
cancel
account
issue
isolationxf0x9fx98x82xf0x9fx98x82xf0x9fx98x82nafterronaneidmubarak
joy
food
photos
acting
churchillshow
amp
deapstate
cocacola
really
rice
strength
neidmubarak
qmbbvkm4re
imagine
gonna
afterronannthis
great
metres


In [73]:
#displaying the first topic
first_topic = lda.components_[0]
first_topic# the output is a vector. from the vector we can then obtain the words from the count_vectorizer feature

array([ 0.20002213,  0.20069079,  2.19997897, ...,  2.19998035,
        5.19998942, 10.19999216])

In [74]:
#obtaining the top words in the first topic
top_topic_words = first_topic.argsort()[-10:]
for i in top_topic_words:
    print(count_vect.get_feature_names()[i])

amp
people
retweet
like
kenya
miguna
bafterrona
afterrona
https
tco


In [77]:
#displaying the top 20 words in each of the topics
for i,topic in enumerate(lda.components_):
    print(f'Top 10 words for topic #{i}:')
    print([count_vect.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

Top 10 words for topic #0:
['monitor', 'donxe2x80x99t', 'just', 'owino', 'timelinenni', 'lotxf0x9fx99x8fxf0x9fx8fxbfnmy', 'client', 'efficient', 'means', 'offer', 'amp', 'people', 'retweet', 'like', 'kenya', 'miguna', 'bafterrona', 'afterrona', 'https', 'tco']


Top 10 words for topic #1:
['trend', 'little', 'let', 'girl', 'cocacola', 'make', 'herxe2x80xa6', 'quenching', 'ifikie', 'recognises', 'help', 'photo', 'kot', 'kenyan', 'beautiful', 'child', 'young', 'afterrona', 'tco', 'https']


Top 10 words for topic #2:
['nkot', 'young', 'face', 'kid', 'hey', 'retweet', 'deserves', 'baringo', 'possesses', 'research', 'researchers', 'intelligence', 'stones', 'thanxe2x80xa6', 'bucket', 'institute', 'lulli', 'medical', 'feeling', 'kenya']


Top 10 words for topic #3:
['nthe', 'park', 'number', 'places', 'bexe2x80xa6', 'months', 'future', 'place', 'know', 'trees', 'motorcadenrutowantedtokillxe2x80xa6', 'wangeci', 'trumps', 'middle', 'motorcade', 'uhurus', 'finger', 'woman', 'shows', 'afterrona'

# TOPIC TRACKING USING NON-NEGATIVE MATRIX FACTORIZATION

In [78]:
tfidf_vect = TfidfVectorizer(max_df=0.8, min_df=2, stop_words='english')
term_matrix_tfidf = tfidf_vect.fit_transform(data['cleaned_text'].values.astype('U'))

In [79]:
term_matrix_tfidf

<2641x1307 sparse matrix of type '<class 'numpy.float64'>'
	with 25034 stored elements in Compressed Sparse Row format>

In [80]:
nmf = NMF(n_components=5, random_state=42)
nmf.fit(term_matrix_tfidf)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=5, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [81]:
#obtaining random words from our new vocab
import random

for i in range(10):
    random_id = random.randint(0,len(tfidf_vect.get_feature_names()))
    print(tfidf_vect.get_feature_names()[random_id])

nduale
hands
nexe2x80xa6
possesses
streets
mburru
xe2x99xa5xefxb8x8f
piece
end
oil


In [83]:
#top words in the first topic
first_topic = nmf.components_[0]
top_topic_words = first_topic.argsort()[-20:]
for i in top_topic_words:
    print(tfidf_vect.get_feature_names()[i])

retweet
kid
thirst
nkot
tasting
cocacolaxe2x80xa6
face
deserves
feeling
thxe2x80xa6
young
quenching
help
herxe2x80xa6
kenyan
child
beautiful
kot
photo
recognises


In [85]:
#obtaining top 20 words for each of the topics
for i,topic in enumerate(nmf.components_):
    print(f'Top 20 words for topic #{i}:')
    print([tfidf_vect.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

Top 20 words for topic #0:
['retweet', 'kid', 'thirst', 'nkot', 'tasting', 'cocacolaxe2x80xa6', 'face', 'deserves', 'feeling', 'thxe2x80xa6', 'young', 'quenching', 'help', 'herxe2x80xa6', 'kenyan', 'child', 'beautiful', 'kot', 'photo', 'recognises']


Top 20 words for topic #1:
['entire', 'amazing', 'sees', 'lets', 'invest', 'lessons', 'qmtft1tcj5', 'covid19', 'know', 'https', 'tco', 'afterrona', 'ifikie', 'make', 'cocacola', 'trend', 'girl', 'let', 'little', 'aunouwdicb']


Top 20 words for topic #2:
['lessons', 'qmtft1tcj5', 'going', 'finish', 'hillary', 'xf0x9fx8exa5xe2x80xa6', 'nmigunanafterrona', 'decides', 'courtesy', 'nvideo', 'afterrona', 'woman', 'shows', 'motorcadenrutowantedtokillxe2x80xa6', 'wangeci', 'motorcade', 'finger', 'middle', 'trumps', 'uhurus']


Top 20 words for topic #3:
['lessons', 'qmtft1tcj5', 'covid19', 'nvideo', 'decides', 'xf0x9fx8exa5xe2x80xa6', 'nmigunanafterrona', 'courtesy', 'hillary', 'finish', 'going', 'https', 'tco', 'ifikie', 'nrentakikuyu', 'ljl82m

The purpose of this notebook was to attempt to find a means of identifying the main topics under a given twitter hashtag. 
I decided to use the #AfterROna hashtag for this exercise, as well as text clustering using SKlearn's Latent DirichletAllocation and Non-negative matrix factorization. 
The results indicate that local Kenyan politics and the Baringo girl were the most discussed topics under the hashtag. 
The main challenge experienced in this exercise was to properly prepare the dataset. Various unreadable words were captured as the top words and this indicates that a significant portion of the data was not clean. 
This will be improved on in future exercises. 