In [None]:
import pandas as pd
import numpy as np
from string import printable
import re 
import random
import string
import datetime
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from pprint import pprint
import pickle
from collections import Counter
from sklearn.decomposition import LatentDirichletAllocation as LDA
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import os
from gensim.models.wrappers import LdaMallet

In [None]:
# Load the raw tweet export; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="../Twitterdata.csv")

In [None]:
# Keep tweets created on or after 2020-03-24. The comparison is done on the raw
# string column, so it relies on the timestamps being ISO-like ('YYYY-MM-DD ...').
# .copy() makes data1 an independent frame, so the column assignments below write
# to data1 itself rather than to an ambiguous slice of `data`
# (avoids pandas' SettingWithCopyWarning).
data1 = data[data["tweetcreatedts"] >= "2020-03-24"].copy()
# Parse the timestamp and keep only the calendar date (datetime.date objects).
data1["date"] = pd.to_datetime(data1["tweetcreatedts"], format='%Y-%m-%d %H:%M:%S').dt.date

In [None]:
# Remove @handles. A raw string is used because '\s' inside a plain literal is an
# invalid escape sequence (DeprecationWarning / SyntaxWarning on newer Pythons).
data1["text_clean"] = data1["text"].replace(r'@[^\s]+', '', regex=True)
# Replace every character outside string.printable with a space so that the
# surrounding words do not get glued together.
st = set(printable)
data1["text_clean"] = data1["text_clean"].apply(lambda x: ''.join(ch if ch in st else " " for ch in x))
# Remove URLs (anything starting with http or www up to the next whitespace).
data1["text_clean"] = data1["text_clean"].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
# Lower-case everything.
data1["text_clean"] = data1["text_clean"].str.lower()

In [None]:
# Concatenate all cleaned tweets into one string, then count whitespace-separated
# tokens that begin with '#'.
tweet_combined = ' '.join(data1['text_clean'].values.tolist())
list_hashtag = [token for token in tweet_combined.split() if token[:1] == '#']
final_ht = Counter(list_hashtag)

In [None]:
# Top 25 most tweeted hashtags: turn the counter into a two-column frame,
# discard the bare '#' token, and order by frequency (highest first).
df = pd.DataFrame.from_dict(final_ht, orient='index').reset_index()
df.columns = ['hashtag', 'count']
df = df[df['hashtag'] != '#'].sort_values(by=['count'], ascending=False)
hashtag = df.iloc[:25]

In [None]:
# Replace every character that is not a letter, digit or space with a space, so
# that neighbouring words do not get merged. The digit range is 0-9: the previous
# class '1-9' silently turned every '0' into a space (e.g. '2020' -> '2 2').
data1["text_clean"] = data1['text_clean'].replace('[^a-zA-Z0-9 ]', ' ', regex=True)
# English stop-word list used by the following cells.
stop_words = stopwords.words('english')

In [None]:
# Remove stop words. Membership is tested against a set built once, instead of
# scanning the stop-word list for every single token; the result is unchanged.
stop_word_set = set(stop_words)
data1["text_clean"] = data1["text_clean"].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_word_set)
)

In [None]:
# Drop tweets whose cleaned text ended up empty.
data_check = data1.loc[data1["text_clean"] != '']

In [None]:
#some cleaning of words
def sent_to_words(sentences):
    """Normalise domain-specific spellings in each sentence.

    Despite the name, this yields one cleaned *string* per input sentence (not a
    list of words). Replacements are plain substring substitutions applied in the
    order listed below; the order matters because later rules act on the output
    of earlier ones (e.g. 'coronalockdown' -> 'coronavirus lockdown' ->
    'covid lockdown').

    Parameters
    ----------
    sentences : iterable of str
        Lower-cased, cleaned sentences.

    Yields
    ------
    str
        The sentence with run-together hashtags split and spelling variants
        unified.
    """
    replacements = (
        ('domesticviolence', 'domestic violence'),
        ('domesticabuse', 'domestic abuse'),
        ('intimatepartnerviolence', 'intimate partner violence'),
        ('sexualviolence', 'sexual violence'),
        ('genderbasedviolence', 'gender based violence'),
        ('migrantworker', 'migrant worker'),
        ('policebrutality', 'police brutality'),
        ('casteviolence', 'caste violence'),
        ('genderdata', 'gender data'),
        ('domesticwork', 'domestic work'),
        ('unpaidlabour', 'unpaid labour'),
        ('unpaidlabor', 'unpaid labour'),
        ('unpaidwork', 'unpaid work'),
        ('labor', 'labour'),
        ('coronaviruspandemic', 'coronavirus pandemic'),
        ('coronaviruslockdown', 'coronavirus lockdown'),
        ('coronalockdown', 'coronavirus lockdown'),
        ('covidlockdown', 'covid lockdown'),
        ('coronavirus', 'covid'),
        ('corona', 'covid'),
        ('covid19', 'covid'),
        ('covid 19', 'covid'),
        # 'indians' must be handled before 'indian'; in the opposite order the
        # shorter rule fires first and leaves 'indias' behind.
        ('indians', 'india'),
        ('indian', 'india'),
        ('violenceagainstwomen', 'violence against women'),
        ('pulitz', ' pulitz'),
    )
    for sent in sentences:
        for old, new in replacements:
            sent = sent.replace(old, new)
        # Pad 'kashmir' / 'kashmiri' with spaces in a single pass. Doing this as
        # two sequential replaces breaks 'kashmiri' into ' kashmir i'.
        sent = re.sub(r'kashmiri?', lambda m: ' ' + m.group(0) + ' ', sent)
        yield sent

In [None]:
stemmer = SnowballStemmer('english')


def lemmatize_stemming(text):
    """Lemmatise a single token with WordNet, then stem it with the Snowball stemmer."""
    lemma = WordNetLemmatizer().lemmatize(text)
    return stemmer.stem(lemma)


def process_words(texts, stop_words=stop_words):
    """Tokenise documents, drop stop words, merge bigrams, then lemmatise and stem.

    Relies on the module-level ``bigram_mod`` (a trained phraser) and
    ``lemmatize_stemming``; both must be defined before this is called.

    Parameters
    ----------
    texts : iterable
        Documents; each one is coerced with ``str()`` before tokenising.
    stop_words : collection of str
        Tokens to discard before bigram detection.

    Returns
    -------
    list of list of str
        One list of normalised tokens per input document.
    """
    # Build the lookup set once instead of scanning a list for every token.
    stop_set = set(stop_words)
    # The previous code called a bare ``simple_preprocess`` that is never imported
    # (NameError); the function is reachable as gensim.utils.simple_preprocess.
    tokenised = [
        [word for word in gensim.utils.simple_preprocess(str(doc), max_len=30) if word not in stop_set]
        for doc in texts
    ]
    phrased = [bigram_mod[doc] for doc in tokenised]

    texts_out = []
    for tokens in phrased:
        # Re-tokenise with min_len=3 to drop very short tokens. Joining with spaces
        # yields the same tokens as the former str(list) round-trip, without
        # depending on the list repr.
        sent = ' '.join(tokens)
        texts_out.append(
            [lemmatize_stemming(token) for token in gensim.utils.simple_preprocess(sent, min_len=3, max_len=30)]
        )
    return texts_out

In [None]:
# Cleaned tweets as a plain list of strings.
data2 = data_check["text_clean"].values.tolist()

# Corpus-wide word frequencies.
c = Counter()
for tweet in data2:
    c.update(tweet.split())
# Keep only words that occur more than once in the whole corpus, then apply the
# domain-specific spelling normalisation.
texts = [' '.join(tok for tok in tweet.split() if c[tok] > 1) for tweet in data2]
data_words = list(sent_to_words(texts))

In [None]:
# Build the bigram model.
# Phrases expects an iterable of token lists. `data_words` holds plain strings, and
# feeding those in directly makes gensim iterate each string character by
# character, so no word-level bigram is ever learned. Tokenise first, using the
# same tokenisation and stop-word filter that process_words applies before it
# calls bigram_mod.
bigram_sentences = [
    [word for word in gensim.utils.simple_preprocess(str(doc), max_len=30) if word not in stop_words]
    for doc in data_words
]
bigram = gensim.models.Phrases(bigram_sentences, min_count=20, threshold=100)  # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)

In [None]:
# Tokenise, phrase and normalise every document.
data_ready = process_words(data_words)

# Build the vocabulary, then prune it in place with the given document-frequency
# bounds (no_below=15, no_above=0.8).
id2word = corpora.Dictionary(documents=data_ready)
id2word.filter_extremes(no_below=15, no_above=0.8)

# Bag-of-words representation of every document.
corpus = list(map(id2word.doc2bow, data_ready))

In [None]:
# Location of the local MALLET installation (Windows layout). MALLET_HOME is
# exported for the MALLET launcher; mallet_path is handed to gensim's LdaMallet.
# Note: the raw string keeps the doubled backslashes literally.
os.environ.update({'MALLET_HOME': 'C:\\mallet'})
mallet_path = r'C:\\mallet\\bin\\mallet.bat'

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start, step):
    """Train one LdaMallet model per topic count and score each with c_v coherence.

    Parameters
    ----------
    dictionary : gensim.corpora.Dictionary
        Vocabulary used both to train the models and to score them.
    corpus : list
        Bag-of-words corpus the models are trained on.
    texts : list of list of str
        Tokenised documents used by the coherence measure.
    limit, start, step : int
        Topic counts are taken from ``range(start, limit, step)``.

    Returns
    -------
    (list, list)
        The trained models and their coherence scores, in the same order as the
        topic counts.
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        # Use the arguments that were passed in. The previous version ignored
        # `dictionary` and `texts` and read the globals id2word / data_ready.
        model = LdaMallet(
            mallet_path,
            corpus=corpus,
            num_topics=num_topics,
            id2word=dictionary,
            iterations=300,
            random_seed=12345,
        )
        model_list.append(model)
        coherence_model_ldamallet = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherence_model_ldamallet.get_coherence())

    return model_list, coherence_values

In [None]:
# Sweep the number of topics and plot the coherence score for each candidate.
# The bounds are defined once and shared by the sweep and the x-axis so the two
# cannot drift apart.
limit = 40
start = 10
step = 1
model_list, coherence_values = compute_coherence_values(
    dictionary=id2word, corpus=corpus, texts=data_ready, limit=limit, start=start, step=step
)
# Show graph
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score")
# The labels must be a sequence: ("coherence_values") is just a string, which
# matplotlib iterates character by character, labelling the line "c".
plt.legend(["coherence_values"], loc='best')

In [None]:
# Final model: 18 topics, with the same iteration count and seed as the sweep.
ldamallet = LdaMallet(
    mallet_path,
    corpus=corpus,
    num_topics=18,
    id2word=id2word,
    iterations=300,
    random_seed=12345,
)

In [None]:
# c_v coherence of the final model.
coherence = CoherenceModel(
    model=ldamallet,
    texts=data_ready,
    dictionary=id2word,
    coherence='c_v',
)
print(coherence.get_coherence())

In [None]:
# Show the 30 highest-weighted words of the topics as (word, weight) pairs.
pprint(
    ldamallet.show_topics(num_words=30, formatted=False)
)

In [None]:
# Persist the trained model. The context manager guarantees the file handle is
# flushed and closed; the bare open() used before was never closed.
with open("../lda.pkl", "wb") as model_file:
    pickle.dump(ldamallet, model_file)

In [None]:
def format_topics_sentences(ldamallet, corpus=corpus, texts=data_ready):
    """Build a per-document table of the dominant topic and its keywords.

    Parameters
    ----------
    ldamallet : topic model
        Must support ``ldamallet[corpus]`` (yielding, per document, a list of
        ``(topic_id, proportion)`` pairs) and ``show_topic(topic_id, num_words=...)``.
    corpus : list
        Bag-of-words corpus to score.
    texts : list
        Per-document content appended as the last column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution``, ``Topic_Keywords``
        plus one unnamed column (label ``0``) holding ``texts``.
    """
    # Rows are collected in a list and the frame is built once. Growing a frame
    # with DataFrame.append inside the loop is quadratic, and the method was
    # removed in pandas 2.0.
    records = []
    keywords_by_topic = {}  # show_topic() output depends only on the topic id
    for row in ldamallet[corpus]:
        if not row:
            # Keep one output row per document so rows stay aligned with `texts`.
            records.append((None, None, ""))
            continue
        # Highest-proportion topic; on ties the first one wins, exactly as with
        # the former stable descending sort.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            wp = ldamallet.show_topic(topic_num, num_words=30)
            keywords_by_topic[topic_num] = ", ".join(word for word, _ in wp)
        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    sent_topics_df = pd.DataFrame(records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

In [None]:
# Dominant topic, its share and keywords for every document.
df_topic_sents_keywords = format_topics_sentences(
    ldamallet=ldamallet,
    corpus=corpus,
    texts=data_ready,
)

In [None]:
# Turn the row index into a document-number column and give every column a
# readable name.
df_dominant_topic = df_topic_sents_keywords.reset_index(drop=False)
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Text',
]

In [None]:
# Pull the topic columns and the matching tweet metadata out as plain Python
# lists; the two frames are combined by position afterwards.
a, b, c = (
    df_dominant_topic[col].to_numpy().tolist()
    for col in ("Dominant_Topic", "Topic_Perc_Contrib", "Keywords")
)
d, e, f, g, h = (
    data_check[col].to_numpy().tolist()
    for col in ("text", "likescount", "retweetcount", "username", "date")
)



In [None]:
# One tuple per tweet: (topic, share, keywords, text, likes, retweets, user, date).
data_tuples = [record for record in zip(a, b, c, d, e, f, g, h)]

In [None]:
# Final table: topic assignment joined with the original tweet fields.
df_dominant_topic = pd.DataFrame(
    data_tuples,
    columns=[
        'Dominant_Topic',
        'Topic_Perc_Contrib',
        'Keywords',
        'text',
        'likescount',
        'retweetcount',
        'username',
        'date',
    ],
)

In [None]:
# Write the result next to the input data (the row index is written as the first column).
df_dominant_topic.to_csv(path_or_buf="../topicsandtweets.csv")