In [1]:
import pandas as pd
import pickle

# Load the pickled document-term matrix: one row per report file, one column per term
# (presumably already stop-word filtered, judging by the file name -- confirm upstream).
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
data = pd.read_pickle('pickles/ignitisdtm_stop.pkl')
data

Unnamed: 0,ab,abbr,abbreviation,ability,able,abovementioned,abroad,accelerated,accepted,access,...,year,yearly,years,yes,yield,young,yoy,zero,zone,zones
ignitis2020_q1_en_eur_con_ias.txt,52,2,1,0,1,0,2,1,1,1,...,36,0,14,0,2,1,0,0,1,2
ignitis2020_q2_en_eur_con_ias.txt,134,0,1,2,3,0,0,0,0,1,...,55,0,26,100,2,4,2,0,1,0
ignitis2020_q2_en_eur_con_ias_00.txt,133,2,1,0,6,1,1,0,1,1,...,122,0,29,100,2,4,2,0,2,15
ignitis2020_q3_en_eur_con_ias.txt,185,2,0,0,4,1,1,0,3,0,...,70,2,29,0,2,1,4,1,1,14


In [2]:
# Import the necessary modules for LDA with gensim
# Terminal / Anaconda Navigator: conda install -c conda-forge gensim
from gensim import matutils, models
import scipy.sparse


In [3]:
# gensim wants terms as rows and documents as columns, so flip the document-term matrix
tdm = data.T
tdm.head()

Unnamed: 0,ignitis2020_q1_en_eur_con_ias.txt,ignitis2020_q2_en_eur_con_ias.txt,ignitis2020_q2_en_eur_con_ias_00.txt,ignitis2020_q3_en_eur_con_ias.txt
ab,52,134,133,185
abbr,2,0,2,2
abbreviation,1,1,1,0
ability,0,2,0,0
able,1,3,6,4


In [4]:
# Convert the term-document counts to a SciPy sparse matrix, then wrap it as a gensim corpus.
# documents_columns=True (the default) states that each column of the matrix is one document.
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts, documents_columns=True)

In [5]:
# Load the pickled vectorizer (presumably the CountVectorizer that built the matrix above).
# The context manager closes the file handle as soon as loading finishes.
# NOTE: pickle can execute arbitrary code on load -- only unpickle files you created yourself.
with open("pickles/ignitiscv_stop.pkl", "rb") as cv_file:
    cv = pickle.load(cv_file)

# gensim needs an {id: term} mapping; vocabulary_ maps term -> column index, so invert it.
id2word = {idx: term for term, idx in cv.vocabulary_.items()}

In [6]:
# LDA for num_topics = 2 on the full vocabulary.
# A fixed random_state makes the fitted topics reproducible across kernel restarts.
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=2,
    passes=10,
    random_state=42,
)
lda.print_topics()

[(0,
  '0.009*"june" + 0.007*"ab" + 0.007*"yes" + 0.006*"board" + 0.005*"shares" + 0.004*"code" + 0.004*"total" + 0.004*"companys" + 0.004*"supervisory" + 0.004*"energy"'),
 (1,
  '0.007*"board" + 0.006*"ab" + 0.006*"companys" + 0.005*"non" + 0.005*"income" + 0.005*"total" + 0.005*"customers" + 0.005*"revenue" + 0.005*"generation" + 0.005*"expenses"')]

In [7]:
# LDA for num_topics = 3
# A fixed random_state makes the fitted topics reproducible across kernel restarts.
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=3,
    passes=10,
    random_state=42,
)
lda.print_topics()

[(0,
  '0.008*"september" + 0.007*"ab" + 0.007*"companys" + 0.006*"months" + 0.005*"consolidated" + 0.005*"expenses" + 0.005*"non" + 0.005*"income" + 0.005*"https" + 0.005*"shares"'),
 (1,
  '0.009*"board" + 0.006*"june" + 0.006*"ab" + 0.005*"report" + 0.005*"generation" + 0.005*"total" + 0.005*"customers" + 0.005*"companys" + 0.005*"non" + 0.005*"energy"'),
 (2,
  '0.001*"board" + 0.001*"related" + 0.001*"ab" + 0.001*"companys" + 0.001*"customers" + 0.001*"june" + 0.001*"shares" + 0.001*"ebitda" + 0.001*"expenses" + 0.001*"energy"')]

In [8]:
# LDA for num_topics = 4
# A fixed random_state makes the fitted topics reproducible across kernel restarts.
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    passes=10,
    random_state=42,
)
lda.print_topics()

[(0,
  '0.010*"september" + 0.008*"companys" + 0.008*"ab" + 0.006*"months" + 0.006*"https" + 0.005*"non" + 0.005*"shares" + 0.005*"income" + 0.005*"en" + 0.005*"expenses"'),
 (1,
  '0.009*"board" + 0.006*"revenue" + 0.006*"generation" + 0.006*"companys" + 0.005*"customers" + 0.005*"apm" + 0.005*"non" + 0.005*"income" + 0.005*"march" + 0.005*"member"'),
 (2,
  '0.008*"board" + 0.008*"june" + 0.007*"ab" + 0.005*"total" + 0.005*"energy" + 0.005*"report" + 0.005*"yes" + 0.005*"shares" + 0.005*"customers" + 0.005*"activities"'),
 (3,
  '0.002*"board" + 0.002*"ab" + 0.001*"companys" + 0.001*"customers" + 0.001*"total" + 0.001*"june" + 0.001*"business" + 0.001*"generation" + 0.001*"revenue" + 0.001*"member"')]

In [9]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Tokenize `text`, POS-tag the tokens and return only the nouns, joined by single spaces.'''
    tagged_tokens = pos_tag(word_tokenize(text))
    kept = []
    for token, tag in tagged_tokens:
        # Noun tags are the ones that begin with "NN"
        if tag.startswith('NN'):
            kept.append(token)
    return ' '.join(kept)

In [10]:
# Read in the cleaned data, before the CountVectorizer step:
# one row per report file with a single `text` column holding the cleaned report text.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
data_clean = pd.read_pickle('pickles/ignitisdata_clean.pkl')
data_clean

Unnamed: 0,text
ignitis2020_q1_en_eur_con_ias.txt,interim report consolidated interim report for...
ignitis2020_q2_en_eur_con_ias.txt,interim report consolidated interim report for...
ignitis2020_q2_en_eur_con_ias_00.txt,interim report consolidated interim report for...
ignitis2020_q3_en_eur_con_ias.txt,interim report first nine months consolidated ...


In [11]:

# Run the nouns() filter over each report's cleaned text, keeping the result as a one-column frame
data_nouns = data_clean['text'].apply(nouns).to_frame()
data_nouns

Unnamed: 0,text
ignitis2020_q1_en_eur_con_ias.txt,report report quarter statements period march ...
ignitis2020_q2_en_eur_con_ias.txt,report report year statements period june acco...
ignitis2020_q2_en_eur_con_ias_00.txt,report report year statements period june acco...
ignitis2020_q3_en_eur_con_ias.txt,report months report statements period accorda...


In [16]:
# Create a new document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ['www','interim','report','statements','period','ab','abbr','yes','žvejų','žaneta','žilvista','živilė','šalys','šarūnas','žukauskas','žūb','įmonės','świnoujście','šiaulių']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns.
# Recent scikit-learn releases validate `stop_words` and accept only 'english', a list or
# None, so the frozenset is converted to a list at the call site.
cvn = CountVectorizer(stop_words=list(stop_words))
data_cvn = cvn.fit_transform(data_nouns.text)

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old name only on releases that predate it.
if hasattr(cvn, 'get_feature_names_out'):
    feature_names_n = cvn.get_feature_names_out()
else:
    feature_names_n = cvn.get_feature_names()

data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=feature_names_n)
data_dtmn.index = data_nouns.index
data_dtmn

Unnamed: 0,abbreviation,ability,access,accident,accidents,accompanying,accordance,account,accountability,accountants,...,workforce,working,works,world,worth,year,years,yield,zone,zones
ignitis2020_q1_en_eur_con_ias.txt,1,0,1,1,1,1,14,7,0,0,...,0,1,3,4,0,36,14,1,1,2
ignitis2020_q2_en_eur_con_ias.txt,1,2,1,0,0,1,23,5,0,2,...,4,1,4,4,1,55,26,1,1,0
ignitis2020_q2_en_eur_con_ias_00.txt,1,0,1,2,0,1,22,5,0,0,...,4,1,4,4,1,122,29,1,2,14
ignitis2020_q3_en_eur_con_ias.txt,0,0,0,2,0,1,21,12,1,0,...,2,2,5,3,0,70,29,1,1,13


In [17]:
# Build the gensim corpus from the noun-only matrix. The frame is transposed so that
# each column is one document, which is what documents_columns=True (the default) expects.
corpusn = matutils.Sparse2Corpus(
    scipy.sparse.csr_matrix(data_dtmn.T),
    documents_columns=True,
)

# Invert the vectorizer vocabulary (term -> column index) into the {id: term} mapping
id2wordn = {idx: term for term, idx in cvn.vocabulary_.items()}

In [18]:
# Let's start with 2 topics (nouns only).
# A fixed random_state makes the fitted topics reproducible across kernel restarts.
ldan = models.LdaModel(
    corpus=corpusn,
    num_topics=2,
    id2word=id2wordn,
    passes=10,
    random_state=42,
)
ldan.print_topics()

[(0,
  '0.023*"company" + 0.020*"group" + 0.016*"electricity" + 0.015*"ignitis" + 0.010*"gas" + 0.010*"board" + 0.010*"eur" + 0.009*"management" + 0.008*"assets" + 0.008*"lithuania"'),
 (1,
  '0.009*"company" + 0.006*"group" + 0.005*"electricity" + 0.003*"management" + 0.003*"gas" + 0.003*"board" + 0.003*"eur" + 0.003*"ignitis" + 0.003*"energy" + 0.003*"member"')]

In [19]:
# Let's try topics = 3 (nouns only).
# A fixed random_state makes the fitted topics reproducible across kernel restarts.
ldan = models.LdaModel(
    corpus=corpusn,
    num_topics=3,
    id2word=id2wordn,
    passes=10,
    random_state=42,
)
ldan.print_topics()

[(0,
  '0.017*"company" + 0.015*"group" + 0.011*"electricity" + 0.007*"gas" + 0.006*"ignitis" + 0.006*"vilnius" + 0.005*"assets" + 0.005*"generation" + 0.005*"eur" + 0.005*"board"'),
 (1,
  '0.023*"company" + 0.020*"group" + 0.016*"electricity" + 0.015*"ignitis" + 0.010*"gas" + 0.010*"board" + 0.010*"eur" + 0.009*"management" + 0.008*"assets" + 0.008*"lithuania"'),
 (2,
  '0.003*"company" + 0.003*"electricity" + 0.003*"group" + 0.002*"ignitis" + 0.002*"board" + 0.002*"lithuania" + 0.002*"eur" + 0.001*"gas" + 0.001*"investments" + 0.001*"business"')]

In [20]:

# Let's try 4 topics (nouns only).
# A fixed random_state makes the fitted topics reproducible across kernel restarts.
ldan = models.LdaModel(
    corpus=corpusn,
    num_topics=4,
    id2word=id2wordn,
    passes=10,
    random_state=42,
)
ldan.print_topics()

[(0,
  '0.019*"group" + 0.016*"september" + 0.015*"months" + 0.012*"electricity" + 0.012*"company" + 0.012*"ignitis" + 0.011*"qtr" + 0.009*"assets" + 0.009*"statistics" + 0.008*"cash"'),
 (1,
  '0.004*"group" + 0.003*"company" + 0.003*"electricity" + 0.002*"ignitis" + 0.002*"cash" + 0.002*"eur" + 0.002*"assets" + 0.002*"gas" + 0.002*"management" + 0.002*"board"'),
 (2,
  '0.024*"company" + 0.022*"group" + 0.017*"electricity" + 0.015*"ignitis" + 0.012*"gas" + 0.012*"board" + 0.010*"eur" + 0.009*"management" + 0.008*"generation" + 0.008*"revenue"'),
 (3,
  '0.024*"company" + 0.019*"group" + 0.015*"electricity" + 0.014*"ignitis" + 0.010*"gas" + 0.009*"eur" + 0.009*"board" + 0.009*"management" + 0.009*"lithuania" + 0.008*"assets"')]

In [21]:
def nouns_adj(text):
    '''Tokenize `text`, POS-tag the tokens and return only the nouns and adjectives, space-joined.'''
    wanted_prefixes = ('NN', 'JJ')  # tag prefixes for nouns and adjectives respectively
    tagged_tokens = pos_tag(word_tokenize(text))
    kept = [token for token, tag in tagged_tokens if tag.startswith(wanted_prefixes)]
    return ' '.join(kept)

In [22]:
# Run the nouns_adj() filter over each report's cleaned text to keep nouns and adjectives only
data_nouns_adj = data_clean['text'].apply(nouns_adj).to_frame()
data_nouns_adj

Unnamed: 0,text
ignitis2020_q1_en_eur_con_ias.txt,interim report interim report first quarter fi...
ignitis2020_q2_en_eur_con_ias.txt,interim report interim report first half year ...
ignitis2020_q2_en_eur_con_ias_00.txt,interim report interim report first half year ...
ignitis2020_q3_en_eur_con_ias.txt,interim report months interim report consolida...


In [24]:

# Create a new document-term matrix using only nouns and adjectives, also remove common words with max_df.
# Recent scikit-learn releases validate `stop_words` and accept only 'english', a list or
# None, so the stop-word collection is converted to a list at the call site.
cvna = CountVectorizer(stop_words=list(stop_words), max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.text)

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old name only on releases that predate it.
if hasattr(cvna, 'get_feature_names_out'):
    feature_names_na = cvna.get_feature_names_out()
else:
    feature_names_na = cvna.get_feature_names()

data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=feature_names_na)
data_dtmna.index = data_nouns_adj.index
data_dtmna

Unnamed: 0,abbreviation,ability,abovementioned,accelerated,access,accessible,accident,accidents,accountability,accountable,...,winter,withdrawn,women,won,wording,worker,workforce,worth,yearly,zones
ignitis2020_q1_en_eur_con_ias.txt,1,0,0,1,1,0,1,1,0,0,...,1,1,0,0,0,0,0,0,0,2
ignitis2020_q2_en_eur_con_ias.txt,1,2,0,0,1,0,0,0,0,0,...,0,1,22,1,0,2,4,1,0,0
ignitis2020_q2_en_eur_con_ias_00.txt,1,0,1,0,1,0,2,0,0,0,...,0,1,22,1,1,2,4,1,0,14
ignitis2020_q3_en_eur_con_ias.txt,0,0,1,0,0,1,2,0,1,1,...,0,0,2,0,1,0,2,0,1,13


In [25]:
# Build the gensim corpus from the noun+adjective matrix. The frame is transposed so that
# each column is one document, which is what documents_columns=True (the default) expects.
corpusna = matutils.Sparse2Corpus(
    scipy.sparse.csr_matrix(data_dtmna.T),
    documents_columns=True,
)

# Invert the vectorizer vocabulary (term -> column index) into the {id: term} mapping
id2wordna = {idx: term for term, idx in cvna.vocabulary_.items()}

In [26]:
# Let's start with 2 topics (nouns and adjectives).
# A fixed random_state makes the fitted topics reproducible across kernel restarts.
ldana = models.LdaModel(
    corpus=corpusna,
    num_topics=2,
    id2word=id2wordna,
    passes=10,
    random_state=42,
)
ldana.print_topics()

[(0,
  '0.038*"qtr" + 0.013*"reclassifications" + 0.013*"land" + 0.010*"methodology" + 0.008*"discount" + 0.008*"servitudes" + 0.007*"methods" + 0.007*"november" + 0.007*"sustainability" + 0.006*"coefficient"'),
 (1,
  '0.027*"half" + 0.013*"gri" + 0.011*"women" + 0.011*"men" + 0.010*"ungc" + 0.007*"land" + 0.007*"reclassifications" + 0.007*"sasb" + 0.007*"sdg" + 0.007*"claim"')]

In [27]:
# Let's try 3 topics (nouns and adjectives).
# A fixed random_state makes the fitted topics reproducible across kernel restarts.
ldana = models.LdaModel(
    corpus=corpusna,
    num_topics=3,
    id2word=id2wordna,
    passes=10,
    random_state=42,
)
ldana.print_topics()

[(0,
  '0.041*"qtr" + 0.014*"reclassifications" + 0.014*"land" + 0.010*"methodology" + 0.008*"servitudes" + 0.008*"discount" + 0.008*"methods" + 0.008*"november" + 0.007*"sustainability" + 0.007*"coefficient"'),
 (1,
  '0.028*"half" + 0.014*"gri" + 0.012*"women" + 0.011*"men" + 0.010*"ungc" + 0.008*"sasb" + 0.008*"sdg" + 0.007*"land" + 0.007*"reclassifications" + 0.007*"claim"'),
 (2,
  '0.001*"half" + 0.001*"land" + 0.001*"qtr" + 0.001*"reclassifications" + 0.001*"gri" + 0.001*"ungc" + 0.001*"men" + 0.001*"women" + 0.001*"sustainability" + 0.001*"esg"')]

In [28]:
# Let's try 4 topics (nouns and adjectives).
# A fixed random_state makes the fitted topics reproducible across kernel restarts.
ldana = models.LdaModel(
    corpus=corpusna,
    num_topics=4,
    id2word=id2wordna,
    passes=10,
    random_state=42,
)
ldana.print_topics()

[(0,
  '0.020*"explanatory" + 0.011*"acquisitions" + 0.011*"corrections" + 0.008*"translation" + 0.008*"delays" + 0.008*"treatment" + 0.006*"condensed" + 0.006*"errors" + 0.006*"decline" + 0.005*"males"'),
 (1,
  '0.044*"qtr" + 0.015*"reclassifications" + 0.014*"land" + 0.011*"methodology" + 0.009*"servitudes" + 0.009*"discount" + 0.008*"methods" + 0.008*"november" + 0.008*"sustainability" + 0.007*"subsequent"'),
 (2,
  '0.037*"half" + 0.014*"land" + 0.014*"reclassifications" + 0.013*"gri" + 0.011*"women" + 0.011*"men" + 0.010*"methodology" + 0.010*"ungc" + 0.010*"captions" + 0.009*"table"'),
 (3,
  '0.023*"half" + 0.018*"gri" + 0.015*"women" + 0.014*"men" + 0.013*"ungc" + 0.013*"condensed" + 0.010*"sdg" + 0.010*"sasb" + 0.008*"esg" + 0.008*"gj"')]

In [29]:
# Our final LDA model (for now): 4 topics, with more passes over the corpus.
# A fixed random_state makes the fitted topics -- and the document-to-topic
# assignment derived from this model -- reproducible across kernel restarts.
ldana = models.LdaModel(
    corpus=corpusna,
    num_topics=4,
    id2word=id2wordna,
    passes=80,
    random_state=42,
)
ldana.print_topics()

[(0,
  '0.001*"course" + 0.001*"partnership" + 0.001*"allocated" + 0.001*"smooth" + 0.001*"announced" + 0.001*"seat" + 0.001*"certificates" + 0.001*"half" + 0.001*"care" + 0.001*"geographical"'),
 (1,
  '0.020*"explanatory" + 0.011*"corrections" + 0.011*"acquisitions" + 0.008*"treatment" + 0.008*"delays" + 0.008*"translation" + 0.006*"errors" + 0.006*"condensed" + 0.006*"decline" + 0.005*"owners"'),
 (2,
  '0.044*"qtr" + 0.015*"reclassifications" + 0.014*"land" + 0.011*"methodology" + 0.009*"servitudes" + 0.009*"discount" + 0.008*"november" + 0.008*"methods" + 0.008*"sustainability" + 0.007*"calculation"'),
 (3,
  '0.034*"half" + 0.017*"gri" + 0.014*"women" + 0.013*"men" + 0.012*"ungc" + 0.009*"sdg" + 0.009*"sasb" + 0.009*"land" + 0.009*"reclassifications" + 0.008*"claim"')]

In [30]:
# Look up the topic distribution of every document under the final model.
corpus_transformed = ldana[corpusna]

# Each entry is a list of (topic_id, probability) pairs and may hold several topics, so
# pick the most probable one rather than assuming exactly one pair per document
# (unpacking a single pair raises ValueError as soon as a document mixes topics).
dominant_topics = [
    max(doc_topics, key=lambda pair: pair[1])[0] if doc_topics else None
    for doc_topics in corpus_transformed
]
list(zip(dominant_topics, data_dtmna.index))

[(1, 'ignitis2020_q1_en_eur_con_ias.txt'),
 (3, 'ignitis2020_q2_en_eur_con_ias.txt'),
 (3, 'ignitis2020_q2_en_eur_con_ias_00.txt'),
 (2, 'ignitis2020_q3_en_eur_con_ias.txt')]