In [None]:
#Imports:

import pandas as pd
import unicodedata
import re
import contractions
import string
#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
#spacy
import spacy
from nltk.corpus import stopwords
#vis
import pyLDAvis
import pyLDAvis.gensim_models

In [None]:
# Data Loading

def load_data(path):
    """Read the CSV file at ``path`` and return its contents as a DataFrame."""
    frame = pd.read_csv(path)
    return frame
# NOTE(review): `path` is not assigned anywhere in the visible notebook — it must be
# set to the location of the tweets CSV before this cell can run on a fresh kernel.
tweets_df=load_data(path)

In [None]:
# Data Cleaning and Processing
def to_lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered
# lower-case the text of every row in the 'Tweets' column
tweets_df["Tweets"] = tweets_df["Tweets"].apply(to_lowercase)

In [None]:
# Standardizing Accent Characters
def standardize_accented_chars(text):
    """Replace accented characters in ``text`` with their plain ASCII base letters.

    The text is decomposed with NFKD normalization so that an accented letter
    becomes its base letter followed by combining marks; encoding to ASCII with
    ``errors='ignore'`` then drops the marks (and any other non-ASCII character).

    Args:
        text: The string to clean.

    Returns:
        An ASCII-only string.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
# quick demonstration of the function on a single sample sentence
print(standardize_accented_chars('Sómě words such as résumé, café, prótest, divorcé, coördinate, exposé, latté.'))
# strip the accents from every row of the 'Tweets' column
tweets_df["Tweets"] = tweets_df["Tweets"].apply(standardize_accented_chars)

In [None]:
# Removing URLs
def get_number_of_urls(documents):
    """Print the percentage of documents that contain the substring 'http'.

    Args:
        documents: An iterable of strings with a length (e.g. a pandas Series
            or a list).

    Returns:
        None. The result is reported with ``print``.
    """
    total = len(documents)
    # Membership test instead of ``find(...) > 0``: a document that *starts*
    # with a URL has the match at index 0 and must be counted as well.
    with_url = sum('http' in doc for doc in documents)
    # Guard against an empty collection to avoid a ZeroDivisionError.
    share = with_url / total * 100 if total else 0.0
    print("{:.2f}% of documents contain urls".format(share))
# Passing the 'Tweets' column of the dataframe as the argument.
# The function prints its own report and returns None, so it is not wrapped
# in print() (that would emit an extra "None" line).
get_number_of_urls(tweets_df.Tweets)

In [None]:
def remove_url(text):
    """Remove every ``http:``/``https:`` URL from ``text``.

    A URL is matched from its scheme up to the next whitespace character and
    replaced by the empty string; surrounding spaces are left untouched.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all matched URLs removed.
    """
    return re.sub(r'https?:\S*', '', text)
# quick demonstration of the function on a single sample sentence
print(remove_url('using https://www.google.com/ as an example'))
# strip the URLs from every row of the 'Tweets' column
tweets_df["Tweets"] = tweets_df["Tweets"].apply(remove_url)

In [None]:
# Expanding Contractions
def expand_contractions(text):
    """Expand the contractions in ``text`` word by word.

    The text is split on whitespace, every word is passed through
    ``contractions.fix`` and the results are joined back with single spaces
    (so runs of whitespace collapse to one space).

    Args:
        text: The string to process.

    Returns:
        The re-joined string with each word replaced by the result of
        ``contractions.fix``.
    """
    return ' '.join(contractions.fix(word) for word in text.split())
# quick demonstration of the function on a single sample sentence
print(expand_contractions("Don't is same as do not"))
# expand the contractions in every row of the 'Tweets' column
tweets_df["Tweets"] = tweets_df["Tweets"].apply(expand_contractions)

In [None]:
# Removing Mentions and Hashtags
def remove_mentions_and_tags(text):
    """Remove @mentions and #hashtags from ``text``.

    Everything from an ``@`` or ``#`` up to the next whitespace character is
    deleted; the surrounding spaces are kept.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without mentions and hashtags.
    """
    without_mentions = re.sub(r'@\S*', '', text)
    return re.sub(r'#\S*', '', without_mentions)
# quick demonstration of the function on a single sample sentence
print(remove_mentions_and_tags('Some random @abc and #def'))
# removing mentions and tags from every row
# (the function is named remove_mentions_and_tags; `remove_tags` does not exist)
tweets_df.Tweets=tweets_df.Tweets.apply(remove_mentions_and_tags)


In [None]:
# Keeping only Alphabet
def keep_only_alphabet(text):
    """Replace every character that is not a lower-case ASCII letter with a space.

    Upper-case letters are replaced too, so the text should already be
    lower-cased when it reaches this function.

    Args:
        text: The string to clean.

    Returns:
        A string of the same length containing only ``a``-``z`` and spaces.
    """
    return re.sub(r'[^a-z]', ' ', text)
# quick demonstration of the function on a single sample sentence
print(keep_only_alphabet('Just a bit more $$processing required.Just a bit!!!'))
# keep only the alphabet characters in every row of the 'Tweets' column
tweets_df["Tweets"] = tweets_df["Tweets"].apply(keep_only_alphabet)


In [None]:
# Removing Stopwords(Default+Custom) and Removing Short Words

def remove_stopwords(text, nlp, custom_stop_words=None,
                     remove_small_tokens=True, min_len=2):
    """Drop stop words (and optionally short tokens) from ``text``.

    Args:
        text: The string to filter.
        nlp: A callable pipeline that turns ``text`` into tokens exposing
            ``.text`` and ``.is_stop``, and that carries the default stop
            word set at ``nlp.Defaults.stop_words``.
        custom_stop_words: Optional iterable of extra stop words. They are
            added to ``nlp.Defaults.stop_words`` in place, so they stay
            registered for later calls as well.
        remove_small_tokens: When true, also drop tokens whose length is not
            strictly greater than ``min_len``.
        min_len: Length threshold used when ``remove_small_tokens`` is true.

    Returns:
        The surviving tokens joined by single spaces, or ``None`` when no
        token is left.
    """
    # if custom stop words are provided, then add them to the default stop words
    if custom_stop_words:
        nlp.Defaults.stop_words.update(custom_stop_words)

    kept_tokens = [
        token.text
        for token in nlp(text)
        if not token.is_stop
        # the length check only applies when small tokens have to be removed
        and (not remove_small_tokens or len(token.text) > min_len)
    ]
    # callers receive None (not an empty string) when nothing survives
    return " ".join(kept_tokens) if kept_tokens else None
# creating a spaCy pipeline object; the parser and NER components are disabled
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
# removing stop-words (default + custom) and short words from every row
tweets_df.Tweets=tweets_df.Tweets.apply(lambda x:remove_stopwords(x,nlp,{"elon","musk",}))
# remove_stopwords returns None when nothing is left of a tweet; such rows
# would make the later lemmatization step fail, so they are dropped here and
# the index is renumbered from 0.
tweets_df = tweets_df.dropna(subset=["Tweets"]).reset_index(drop=True)

In [None]:
# Lemmatization
def lemmatize(text, nlp):
    """Replace every token of ``text`` with its lemma.

    Args:
        text: The string to lemmatize.
        nlp: A callable pipeline that turns ``text`` into tokens exposing
            ``.lemma_``.

    Returns:
        The lemmas joined by single spaces (an empty string when there are
        no tokens).
    """
    return " ".join(token.lemma_ for token in nlp(text))
# quick demonstration of the function on a single sample sentence
print(lemmatize('Reading NLP blog is fun.', nlp))
# lemmatize every row of the 'Tweets' column
tweets_df["Tweets"] = tweets_df["Tweets"].apply(lambda tweet: lemmatize(tweet, nlp))

In [None]:
# Generating Document Matrix and Dictionary
def generate_tokens(tweet):
    """Split ``tweet`` on single spaces and return the non-empty pieces.

    Empty pieces are discarded because the earlier cleaning steps replace
    characters with spaces and therefore leave runs of consecutive spaces.

    Args:
        tweet: The cleaned tweet text.

    Returns:
        A list of the words in ``tweet``, in order.
    """
    return [word for word in tweet.split(' ') if word != '']
# store the token list of every tweet in a new column named 'tokens'
tweets_df['tokens'] = tweets_df["Tweets"].apply(generate_tokens)

In [None]:
# Creating the Dictionary (maps each unique token to an integer id)

def create_dictionary(words):
    """Build and return a ``corpora.Dictionary`` from ``words``."""
    dictionary = corpora.Dictionary(words)
    return dictionary
# build the dictionary from the token lists stored in the dataframe
id2word = create_dictionary(tweets_df["tokens"])
print(id2word)

In [None]:
# document matrix
def create_document_matrix(tokens,id2word):
    """Convert every token list into its bag-of-words representation.

    Args:
        tokens: An iterable of token lists, one per document.
        id2word: An object exposing ``doc2bow(token_list)``.

    Returns:
        A list holding ``id2word.doc2bow(text)`` for each document, in order.
    """
    return [id2word.doc2bow(text) for text in tokens]
#passing the dataframe column having tokens and dictionary
corpus=create_document_matrix(tweets_df.tokens,id2word)
# show the first document next to its bag-of-words encoding; .iloc is
# positional, so it matches corpus[0] even if the index does not start at 0
print(tweets_df.tokens.iloc[0])
print(corpus[0])

In [None]:
# Implementing LDA
# train a 10-topic LDA model on the bag-of-words corpus with a fixed random_state
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,
)

In [None]:
# Generating LDA Topics
def get_lda_topics(model, num_topics, top_n_words):
    """Tabulate the top words of the first ``num_topics`` topics of ``model``.

    Args:
        model: An object exposing ``show_topic(topic_id, topn=...)`` that
            returns a sequence whose items have the word at position 0.
        num_topics: Number of topics to include (ids ``0 .. num_topics - 1``).
        top_n_words: Number of words to list per topic.

    Returns:
        A DataFrame with one column per topic, labelled ``'Topic # 01'``,
        ``'Topic # 02'``, ..., each holding that topic's words.
    """
    word_dict = {}
    for topic_id in range(num_topics):
        # labels are 1-based and zero-padded to two digits
        label = 'Topic # ' + '{:02d}'.format(topic_id + 1)
        word_dict[label] = [entry[0] for entry in model.show_topic(topic_id, topn=top_n_words)]
    return pd.DataFrame(word_dict)
# display the ten highest-ranked words for each of the ten topics
get_lda_topics(lda_model, num_topics=10, top_n_words=10)

In [None]:
# Visualizing Topics
# render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()
# prepare the interactive view of the trained model
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

In [None]:
#https://medium.com/mlearning-ai/topic-modelling-with-lda-on-the-tweets-mentioning-elon-musk-687076a2c86b

In [1]:
# BERTopic: a better alternative than just LDA

from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer

# NOTE(review): `content` is not defined in the visible notebook — presumably
# the list of documents to model; confirm and define it before this cell.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
topic_model = BERTopic(ctfidf_model=ctfidf_model)

# fit once — the original called fit_transform twice in a row, repeating the training
topics, probs = topic_model.fit_transform(content)
# only the last expression of a cell is displayed, so print this one explicitly
print(topic_model.get_representative_docs(0))
topic_model.visualize_barchart()

In [None]:
# NOTE(review): `content` is not defined in the visible notebook — presumably
# the list of documents to model; confirm and define it before this cell.
topic_model = BERTopic()
# fit once — the original called fit_transform twice in a row, repeating the training
topics, probs = topic_model.fit_transform(content)
# only the last expression of a cell is displayed, so print this one explicitly
print(topic_model.get_representative_docs(0))
topic_model.visualize_barchart()

In [None]:
# lda
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

# NOTE(review): `papers` is not defined in the visible notebook — presumably a
# DataFrame with a 'preprocessed_text' column; confirm before running.
count_vectorizer = CountVectorizer(stop_words='english')
count_data = count_vectorizer.fit_transform(papers['preprocessed_text'])

number_topics = 5

# fixed seed (same value as the gensim LDA model above) so the topics are
# reproducible between runs
lda = LDA(n_components=number_topics, random_state=100)
lda.fit(count_data)


In [None]:
# lsi Latent Semantic Analysis

from gensim import corpora
from gensim.models import LsiModel

def create_gensim_lsa_model(doc_clean,number_of_topics,words):
    """Train an LSI (LSA) model on ``doc_clean`` and print its topics.

    The dictionary and the document-term matrix are built from ``doc_clean``
    inside the function, so it does not depend on a global
    ``doc_term_matrix``.

    Args:
        doc_clean: An iterable of token lists, one per document
            (assumed from the ``doc2bow`` usage — confirm against the caller).
        number_of_topics: Number of topics to extract.
        words: Number of words to show per topic in the printed summary.

    Returns:
        The trained ``LsiModel``.
    """
    # materialize once: the documents are traversed twice below
    documents = list(doc_clean)
    dictionary = corpora.Dictionary(documents)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in documents]
    # id2word lets print_topics show the words instead of their integer ids
    lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word=dictionary)
    print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
    return lsamodel

number_of_topics=6
words=10
# NOTE(review): the original called load_data("", "corpus.txt") and passed an
# undefined `clean_text`; load_data in this notebook takes a single path and
# returns a DataFrame, so that call cannot work here. The tokenized tweets
# prepared above are used as the cleaned documents instead — confirm this is
# the intended input.
model=create_gensim_lsa_model(tweets_df.tokens,number_of_topics,words)


In [None]:
# Non Negative Matrix Factorization
from sklearn.feature_extraction.text import TfidfVectorizer
 
from sklearn.decomposition import NMF

# NOTE(review): `data` is not defined in the visible notebook — presumably the
# collection of raw documents; confirm and define it before this cell.
vectorizer = TfidfVectorizer(max_features=2000, min_df=10, stop_words='english')

vectorized_data = vectorizer.fit_transform(data)

# fixed seed so the factorization is reproducible between runs
nmf = NMF(n_components=20, solver="mu", random_state=100)

W = nmf.fit_transform(vectorized_data)
