In [None]:
# Let's read in our document-term matrix
import pandas as pd
import pickle
from gensim import matutils, models
import scipy.sparse

# Load the pickled document-term matrix. As displayed below, it has one row per
# category (Environment, Science, Business) and one column per term.
# NOTE(review): the path is hard-coded, and unpickling can execute code, so
# only load files from a trusted source.
data = pd.read_pickle('/content/sample_data/dtm_stop.pkl')
data

Unnamed: 0,aa,ab,abdul,abdullah,abdullahalmahmud,abdur,abdus,abedin,abedinsamples,abh,abide,ability,abiotic,able,abm,abnormal,abnormalitythis,abnormally,abode,abolish,abort,abr,abroad,abroadfailing,absence,absolute,absolutely,abu,abul,abundance,abundant,abundantly,abuse,academic,academics,academy,accelerate,accelerated,accelerator,acceptable,...,yieldread,yields,yin,york,youll,young,younger,youngsters,younus,yousuf,youth,youths,yunus,zafar,zahid,zainul,zakaria,zakir,zambia,zero,zia,ziaul,zika,zimbabwe,zinc,zincenhanced,zincfortified,zobair,zone,zones,zonethe,zonewhile,zoo,zoogorai,zoological,zoology,zoom,zoos,zootaxaaccording,zusammenarbeit
Environment,0,1,13,5,1,1,7,3,1,1,1,1,1,8,3,1,1,0,1,1,2,2,1,0,2,1,2,5,2,1,8,1,4,1,1,1,1,1,0,2,...,1,2,1,7,1,12,1,0,1,1,6,3,1,1,0,1,1,1,0,3,1,1,0,0,2,0,0,1,8,8,1,1,16,1,1,5,0,3,1,0
Science,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Business,0,0,0,0,0,0,0,0,0,0,0,1,0,2,0,0,0,1,0,0,0,0,0,1,3,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,...,0,4,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,1,2,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1


In [None]:
# The topic model below is fed a term-document matrix (terms as rows, documents
# as columns), so flip the document-term matrix loaded above.
tdm = data.T
tdm.head(5)

Unnamed: 0,Environment,Science,Business
aa,0,3,0
ab,1,0,0
abdul,13,0,0
abdullah,5,0,0
abdullahalmahmud,1,0,0


In [None]:
# Convert in two hops: DataFrame -> SciPy CSR sparse matrix -> gensim corpus.
# Each column of the sparse matrix is treated as one document; that is gensim's
# default and is spelled out here for the reader.
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts, documents_columns=True)

In [None]:
# Load the pickled vectorizer (presumably the CountVectorizer fitted when the
# document-term matrix was built -- confirm against the notebook that wrote it).
# The context manager closes the file handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code; only unpickle trusted files.
with open("/content/sample_data/cv_stop.pkl", "rb") as cv_file:
    cv = pickle.load(cv_file)

# vocabulary_ maps term -> column index; gensim needs the inverse (index -> term).
id2word = {index: term for term, index in cv.vocabulary_.items()}
id2word

In [None]:
# Now that we have the corpus (term-document matrix) and id2word (dictionary of location: term),
# we need to specify two other parameters as well - the number of topics and the number of passes.
# LDA training is stochastic, so random_state is pinned to keep the topics (and
# their numbering) stable across re-runs.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10, random_state=42)
lda.print_topics()

[(0,
  '0.006*"forest" + 0.005*"environment" + 0.004*"sundarbans" + 0.004*"doe" + 0.003*"water" + 0.003*"people" + 0.003*"fish" + 0.003*"river" + 0.003*"change" + 0.003*"lakh"'),
 (1,
  '0.002*"research" + 0.002*"nuclear" + 0.002*"plant" + 0.002*"port" + 0.002*"mongla" + 0.002*"dengue" + 0.001*"award" + 0.001*"student" + 0.001*"won" + 0.001*"university"')]

In [None]:
# Refit on the same corpus and dictionary, this time asking for 3 topics.
# LDA training is stochastic, so random_state is pinned to keep the topics (and
# their numbering) stable across re-runs.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10, random_state=42)
lda.print_topics()

[(0,
  '0.007*"cent" + 0.004*"according" + 0.004*"bank" + 0.004*"year" + 0.004*"new" + 0.003*"countrys" + 0.003*"mechanisation" + 0.003*"workers" + 0.003*"banks" + 0.002*"women"'),
 (1,
  '0.007*"forest" + 0.006*"environment" + 0.005*"sundarbans" + 0.004*"doe" + 0.004*"water" + 0.003*"people" + 0.003*"fish" + 0.003*"river" + 0.003*"change" + 0.003*"energy"'),
 (2,
  '0.001*"forest" + 0.001*"environment" + 0.000*"doe" + 0.000*"sundarbans" + 0.000*"today" + 0.000*"fish" + 0.000*"change" + 0.000*"river" + 0.000*"water" + 0.000*"land"')]

In [None]:
# Refit on the same corpus and dictionary, this time asking for 4 topics.
# LDA training is stochastic, so random_state is pinned to keep the topics (and
# their numbering) stable across re-runs.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10, random_state=42)
lda.print_topics()

[(0,
  '0.008*"cent" + 0.004*"bank" + 0.004*"countrys" + 0.004*"new" + 0.004*"year" + 0.004*"according" + 0.004*"mechanisation" + 0.003*"workers" + 0.003*"banks" + 0.003*"women"'),
 (1,
  '0.000*"forest" + 0.000*"environment" + 0.000*"doe" + 0.000*"river" + 0.000*"sundarbans" + 0.000*"land" + 0.000*"energy" + 0.000*"people" + 0.000*"water" + 0.000*"fish"'),
 (2,
  '0.003*"research" + 0.003*"nuclear" + 0.003*"plant" + 0.003*"port" + 0.003*"mongla" + 0.003*"dengue" + 0.002*"award" + 0.002*"student" + 0.002*"won" + 0.002*"university"'),
 (3,
  '0.008*"forest" + 0.006*"environment" + 0.005*"sundarbans" + 0.004*"doe" + 0.004*"water" + 0.003*"people" + 0.003*"fish" + 0.003*"river" + 0.003*"change" + 0.003*"energy"')]

# Attempt 2 (Nouns Only)

In [None]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Return the nouns found in ``text`` as one space-separated string.

    The text is split with ``word_tokenize`` and tagged with ``pos_tag``;
    a token is kept when its tag begins with 'NN'. Kept tokens stay in their
    original order. An input with no such tokens yields an empty string.
    '''
    tagged_tokens = pos_tag(word_tokenize(text))
    kept = []
    for token, tag in tagged_tokens:
        # Any tag starting with 'NN' counts as a noun.
        if tag.startswith('NN'):
            kept.append(token)
    return ' '.join(kept)

In [None]:
import nltk

# Fetch only what nouns() relies on: a tokenizer model for word_tokenize and a
# POS-tagger model for pos_tag. nltk.download('all') pulls every corpus and
# model NLTK ships, which is far more than this notebook uses.
# Both the legacy and the current resource ids are requested because NLTK
# renamed them; an id unknown to the installed version is skipped, not raised.
for resource in ('punkt', 'punkt_tab',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource, quiet=True)

In [None]:
# Read in the cleaned data, before the CountVectorizer step.
# As displayed below, the frame has one row per category and a single
# `Headlines` text column.
# NOTE(review): the path is hard-coded, and unpickling can execute code, so
# only load files from a trusted source.
data_clean = pd.read_pickle('/content/sample_data/clean_data.pkl')
data_clean

Unnamed: 0,Headlines
Environment,govt bans fishing in the bay of bengal for da...
Science,rooppur power plant unloading of key machines ...
Business,bondhu chula for healthy living bangladesh bon...


In [None]:
# Reduce each category's headline text to its nouns, keeping the result as a
# one-column DataFrame that is still named `Headlines`.
data_nouns = data_clean['Headlines'].apply(nouns).to_frame()
data_nouns

Unnamed: 0,Headlines
Environment,govt bans bay bengal days government ban bay g...
Science,rooppur power plant unloading machines begins ...
Business,bondhu chula living bondhu foundation organisa...


In [None]:
# Create a new document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ['said','bangladesh','climate','department','government','tk','country','power']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns.
# Recent scikit-learn releases validate `stop_words` and accept a list but not
# a frozenset, so pass a (sorted, deterministic) list.
cvn = CountVectorizer(stop_words=sorted(stop_words))
data_cvn = cvn.fit_transform(data_nouns.Headlines)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older installs.
if hasattr(cvn, 'get_feature_names_out'):
    feature_names = cvn.get_feature_names_out()
else:
    feature_names = cvn.get_feature_names()

# Rows keep the category labels of data_nouns; columns are the vocabulary terms.
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=feature_names, index=data_nouns.index)
data_dtmn

Unnamed: 0,aa,ab,abdul,abdullah,abdullahalmahmud,abdur,abdus,abedin,abedinsamples,ability,abm,abode,absence,absolute,abu,abundance,abuse,academics,academy,accelerator,acceptance,access,accessibility,accessories,accolades,accord,account,accountability,accountants,accounts,accumulation,accusations,acf,achhia,achievement,aci,acids,acis,acps,acquisition,...,yearsthis,yearthe,yearus,yee,yeo,yesterday,yesterdaydemand,yesterdaynazim,yesterdaynews,yesterdayrafihath,yield,yieldread,yields,york,youll,youngsters,younus,youth,youths,yunus,zafar,zahid,zainul,zambia,zero,zia,ziaul,zimbabwe,zinc,zincenhanced,zobair,zone,zones,zoo,zoogorai,zoology,zoom,zoos,zootaxaaccording,zusammenarbeit
Environment,0,1,7,5,1,1,2,3,1,1,1,1,2,1,3,1,3,1,1,0,0,15,2,1,1,1,2,4,1,3,1,1,1,1,0,0,4,0,0,2,...,1,2,1,1,1,15,1,1,1,0,1,1,2,7,1,0,1,6,3,1,1,0,1,0,1,1,1,0,2,0,1,7,8,16,1,5,0,3,1,0
Science,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Business,0,0,0,0,0,0,0,0,0,1,0,0,3,0,0,0,0,0,0,0,1,5,0,0,0,0,2,1,0,0,0,0,0,0,0,9,0,1,3,0,...,0,0,0,0,0,13,0,0,0,0,2,0,4,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,2,1,0,0,0,0,0,0,1,0,0,1


In [None]:
# gensim corpus built from the noun-only counts, transposed so that terms are
# rows and documents are columns.
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.T))

# Invert the vectorizer vocabulary (term -> index) into the index -> term
# lookup that gensim uses to label topics.
id2wordn = {index: term for term, index in cvn.vocabulary_.items()}

In [None]:
# Let's start with 2 topics.
# LDA training is stochastic, so random_state is pinned to keep the topics (and
# their numbering) stable across re-runs.
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10, random_state=42)
ldan.print_topics()

[(0,
  '0.009*"environment" + 0.008*"sundarbans" + 0.007*"water" + 0.006*"people" + 0.006*"doe" + 0.005*"energy" + 0.005*"river" + 0.005*"pollution" + 0.005*"change" + 0.005*"land"'),
 (1,
  '0.010*"cent" + 0.005*"bank" + 0.005*"year" + 0.005*"countrys" + 0.004*"mechanisation" + 0.004*"workers" + 0.004*"banks" + 0.004*"women" + 0.003*"fund" + 0.003*"research"')]

In [None]:
# Now try 3 topics on the noun-only corpus.
# LDA training is stochastic, so random_state is pinned to keep the topics (and
# their numbering) stable across re-runs.
ldan = models.LdaModel(corpus=corpusn, num_topics=3, id2word=id2wordn, passes=10, random_state=42)
ldan.print_topics()

[(0,
  '0.013*"cent" + 0.007*"bank" + 0.006*"year" + 0.006*"countrys" + 0.006*"mechanisation" + 0.005*"workers" + 0.005*"banks" + 0.005*"women" + 0.004*"fund" + 0.004*"factories"'),
 (1,
  '0.010*"environment" + 0.008*"sundarbans" + 0.007*"water" + 0.006*"people" + 0.006*"doe" + 0.005*"river" + 0.005*"energy" + 0.005*"pollution" + 0.005*"change" + 0.005*"land"'),
 (2,
  '0.005*"research" + 0.004*"plant" + 0.004*"dengue" + 0.003*"student" + 0.003*"university" + 0.003*"port" + 0.003*"award" + 0.003*"robot" + 0.003*"scientists" + 0.003*"reactor"')]



* Topic 0: Business
* Topic 1: Environment
* Topic 2: Science




