In [23]:
import pandas as pd
import pickle

data = pd.read_pickle('pickles/merkodtm_stop.pkl')
data

Unnamed: 0,aaspere,ab,abated,able,abroad,accept,acceptance,accepted,accepting,accommodation,...,worthiness,written,www,year,years,yield,zakusala,üksnurme,ādaži,šperbergs
merko2020_q1_en_eur_con_00.txt,2,1,1,0,1,1,1,1,0,1,...,0,0,3,17,17,2,1,1,0,2
merko2020_q2_en_eur_con_00.txt,2,1,0,2,1,1,1,1,1,0,...,1,1,4,33,11,2,1,1,0,1
merko2020_q3_en_eur_con_00.txt,2,1,0,1,1,1,1,1,0,0,...,0,1,0,19,15,2,1,0,2,1
merko2020_q4_en_eur_con_00.txt,0,1,0,1,1,1,1,1,0,0,...,0,1,0,30,17,3,1,0,4,1


In [24]:
# Import the necessary modules for LDA with gensim
# Terminal / Anaconda Navigator: conda install -c conda-forge gensim
from gensim import matutils, models
import scipy.sparse


In [25]:
tdm = data.transpose()
tdm.head()

Unnamed: 0,merko2020_q1_en_eur_con_00.txt,merko2020_q2_en_eur_con_00.txt,merko2020_q3_en_eur_con_00.txt,merko2020_q4_en_eur_con_00.txt
aaspere,2,2,2,0
ab,1,1,1,1
abated,1,0,0,0
able,0,2,1,1
abroad,1,1,1,1


In [26]:
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [27]:
cv = pickle.load(open("pickles/merkocv_stop.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [28]:
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
lda.print_topics()

[(0,
  '0.003*"general" + 0.002*"lithuania" + 0.002*"net" + 0.002*"term" + 0.002*"dividends" + 0.002*"period" + 0.002*"joint" + 0.002*"financial" + 0.002*"apartments" + 0.002*"liabilities"'),
 (1,
  '0.006*"apartments" + 0.006*"term" + 0.006*"dividends" + 0.006*"general" + 0.006*"shares" + 0.006*"related" + 0.006*"groups" + 0.006*"financial" + 0.006*"period" + 0.005*"euros"')]

In [7]:
# LDA topics = 3
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10)
lda.print_topics()

[(0,
  '0.006*"apartments" + 0.006*"term" + 0.006*"dividends" + 0.006*"general" + 0.006*"shares" + 0.006*"related" + 0.006*"groups" + 0.006*"financial" + 0.006*"period" + 0.006*"lithuania"'),
 (1,
  '0.001*"general" + 0.001*"financial" + 0.001*"period" + 0.001*"term" + 0.001*"related" + 0.001*"lithuania" + 0.001*"projects" + 0.001*"dividends" + 0.001*"apartments" + 0.001*"members"'),
 (2,
  '0.002*"net" + 0.002*"lithuania" + 0.002*"liabilities" + 0.002*"dividends" + 0.002*"related" + 0.002*"euros" + 0.002*"term" + 0.002*"shares" + 0.002*"apartments" + 0.002*"general"')]

In [8]:
# LDA topics = 4
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10)
lda.print_topics()

[(0,
  '0.001*"term" + 0.001*"lithuania" + 0.001*"financial" + 0.001*"period" + 0.001*"general" + 0.001*"groups" + 0.001*"related" + 0.001*"apartments" + 0.001*"euros" + 0.001*"latvia"'),
 (1,
  '0.007*"apartments" + 0.006*"dividends" + 0.006*"term" + 0.006*"general" + 0.006*"shares" + 0.006*"related" + 0.006*"financial" + 0.006*"groups" + 0.006*"net" + 0.006*"period"'),
 (2,
  '0.001*"general" + 0.001*"euros" + 0.001*"related" + 0.001*"dividends" + 0.001*"term" + 0.001*"shares" + 0.001*"members" + 0.001*"quarter" + 0.001*"groups" + 0.001*"net"'),
 (3,
  '0.009*"march" + 0.006*"general" + 0.006*"period" + 0.005*"tallinn" + 0.005*"groups" + 0.005*"lithuania" + 0.005*"latvia" + 0.005*"financial" + 0.005*"shares" + 0.005*"term"')]

In [9]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [10]:
# Read in the cleaned data, before the CountVectorizer step
data_clean = pd.read_pickle('pickles/merko.pkl')
data_clean

Unnamed: 0,text
merko2020_q1_en_eur_con_00.txt,as merko ehitus consolidated interim report as...
merko2020_q2_en_eur_con_00.txt,as merko ehitus consolidated interim report as...
merko2020_q3_en_eur_con_00.txt,as merko ehitus consolidated interim report as...
merko2020_q4_en_eur_con_00.txt,as merko ehitus consolidated interim report as...


In [11]:

# Apply the nouns function to the transcripts to filter only on nouns
data_nouns = pd.DataFrame(data_clean.text.apply(nouns))
data_nouns

Unnamed: 0,text
merko2020_q1_en_eur_con_00.txt,merko ehitus report ehitus group months report...
merko2020_q2_en_eur_con_00.txt,merko ehitus report ehitus group months quarte...
merko2020_q3_en_eur_con_00.txt,merko ehitus report ehitus group months quarte...
merko2020_q4_en_eur_con_00.txt,merko ehitus report ehitus group months quarte...


In [14]:
# Create a new document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ['www','interim','report','statements','period','ab','abbr','yes','žvejų','žaneta','žilvista','živilė','šalys','šarūnas','žukauskas','žūb','įmonės','świnoujście','šiaulių']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns.text)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtmn.index = data_nouns.index
data_dtmn

Unnamed: 0,acceptance,accommodation,accordance,account,accounting,accounts,acquisition,act,action,actions,...,world,worth,worthiness,year,years,yield,zakusala,üksnurme,ādaži,šperbergs
merko2020_q1_en_eur_con_00.txt,1,1,5,3,2,6,3,1,3,3,...,1,1,0,17,17,2,1,1,0,2
merko2020_q2_en_eur_con_00.txt,1,0,5,3,2,7,3,1,3,2,...,3,2,1,33,11,2,1,1,0,1
merko2020_q3_en_eur_con_00.txt,1,0,6,3,5,7,3,1,3,2,...,1,1,0,19,15,2,1,0,2,1
merko2020_q4_en_eur_con_00.txt,1,0,6,3,5,7,3,2,2,1,...,0,2,0,30,17,3,1,0,4,1


In [15]:
# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [16]:
# Let's start with 2 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.007*"ehitus" + 0.007*"merko" + 0.006*"construction" + 0.006*"eur" + 0.005*"months" + 0.005*"board" + 0.005*"group" + 0.004*"management" + 0.004*"profit" + 0.004*"revenue"'),
 (1,
  '0.026*"construction" + 0.023*"merko" + 0.020*"group" + 0.020*"ehitus" + 0.020*"eur" + 0.018*"management" + 0.018*"months" + 0.017*"board" + 0.016*"profit" + 0.012*"revenue"')]

In [17]:
# Let's try topics = 3
ldan = models.LdaModel(corpus=corpusn, num_topics=3, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.002*"merko" + 0.002*"construction" + 0.002*"ehitus" + 0.002*"eur" + 0.002*"profit" + 0.002*"months" + 0.002*"management" + 0.002*"group" + 0.002*"revenue" + 0.002*"board"'),
 (1,
  '0.026*"construction" + 0.023*"merko" + 0.020*"group" + 0.020*"ehitus" + 0.020*"eur" + 0.018*"management" + 0.018*"months" + 0.017*"board" + 0.016*"profit" + 0.012*"revenue"'),
 (2,
  '0.004*"merko" + 0.003*"group" + 0.003*"management" + 0.003*"ehitus" + 0.003*"months" + 0.002*"construction" + 0.002*"profit" + 0.002*"board" + 0.002*"eur" + 0.002*"revenue"')]

In [18]:

# Let's try 4 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.026*"construction" + 0.024*"merko" + 0.020*"group" + 0.020*"ehitus" + 0.020*"eur" + 0.018*"management" + 0.018*"months" + 0.017*"board" + 0.016*"profit" + 0.012*"revenue"'),
 (1,
  '0.006*"construction" + 0.004*"eur" + 0.003*"merko" + 0.003*"management" + 0.003*"profit" + 0.003*"ehitus" + 0.003*"board" + 0.003*"months" + 0.003*"group" + 0.003*"revenue"'),
 (2,
  '0.005*"merko" + 0.005*"construction" + 0.005*"ehitus" + 0.004*"eur" + 0.004*"management" + 0.004*"board" + 0.003*"months" + 0.003*"group" + 0.003*"profit" + 0.003*"revenue"'),
 (3,
  '0.002*"merko" + 0.002*"construction" + 0.002*"ehitus" + 0.002*"management" + 0.002*"group" + 0.002*"eur" + 0.002*"months" + 0.002*"tax" + 0.002*"board" + 0.002*"profit"')]

In [19]:
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [20]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns_adj = pd.DataFrame(data_clean.text.apply(nouns_adj))
data_nouns_adj

Unnamed: 0,text
merko2020_q1_en_eur_con_00.txt,merko ehitus interim report merko ehitus group...
merko2020_q2_en_eur_con_00.txt,merko ehitus interim report merko ehitus group...
merko2020_q3_en_eur_con_00.txt,merko ehitus interim report merko ehitus group...
merko2020_q4_en_eur_con_00.txt,merko ehitus interim report merko ehitus group...


In [21]:

# Create a new document-term matrix using only nouns and adjectives, also remove common words with max_df
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.text)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna

Unnamed: 0,aaspere,able,accommodation,accumulated,activeness,added,adjustment,adoption,advantage,aggressive,...,weakness,willing,willingness,wind,withdrawals,world,worse,worthiness,üksnurme,ādaži
merko2020_q1_en_eur_con_00.txt,2,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,1,0
merko2020_q2_en_eur_con_00.txt,2,2,0,0,0,0,1,1,1,0,...,1,1,0,2,0,3,1,1,1,0
merko2020_q3_en_eur_con_00.txt,2,1,0,1,0,0,1,1,0,0,...,0,1,1,2,0,1,0,0,0,2
merko2020_q4_en_eur_con_00.txt,0,1,0,1,1,1,3,1,0,0,...,0,0,1,2,1,0,0,0,0,4


In [22]:
# Create the gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# Create the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [26]:
# Let's start with 2 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.038*"qtr" + 0.013*"reclassifications" + 0.013*"land" + 0.010*"methodology" + 0.008*"discount" + 0.008*"servitudes" + 0.007*"methods" + 0.007*"november" + 0.007*"sustainability" + 0.006*"coefficient"'),
 (1,
  '0.027*"half" + 0.013*"gri" + 0.011*"women" + 0.011*"men" + 0.010*"ungc" + 0.007*"land" + 0.007*"reclassifications" + 0.007*"sasb" + 0.007*"sdg" + 0.007*"claim"')]

In [27]:
# Let's try 3 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.041*"qtr" + 0.014*"reclassifications" + 0.014*"land" + 0.010*"methodology" + 0.008*"servitudes" + 0.008*"discount" + 0.008*"methods" + 0.008*"november" + 0.007*"sustainability" + 0.007*"coefficient"'),
 (1,
  '0.028*"half" + 0.014*"gri" + 0.012*"women" + 0.011*"men" + 0.010*"ungc" + 0.008*"sasb" + 0.008*"sdg" + 0.007*"land" + 0.007*"reclassifications" + 0.007*"claim"'),
 (2,
  '0.001*"half" + 0.001*"land" + 0.001*"qtr" + 0.001*"reclassifications" + 0.001*"gri" + 0.001*"ungc" + 0.001*"men" + 0.001*"women" + 0.001*"sustainability" + 0.001*"esg"')]

In [28]:
# Let's try 4 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.020*"explanatory" + 0.011*"acquisitions" + 0.011*"corrections" + 0.008*"translation" + 0.008*"delays" + 0.008*"treatment" + 0.006*"condensed" + 0.006*"errors" + 0.006*"decline" + 0.005*"males"'),
 (1,
  '0.044*"qtr" + 0.015*"reclassifications" + 0.014*"land" + 0.011*"methodology" + 0.009*"servitudes" + 0.009*"discount" + 0.008*"methods" + 0.008*"november" + 0.008*"sustainability" + 0.007*"subsequent"'),
 (2,
  '0.037*"half" + 0.014*"land" + 0.014*"reclassifications" + 0.013*"gri" + 0.011*"women" + 0.011*"men" + 0.010*"methodology" + 0.010*"ungc" + 0.010*"captions" + 0.009*"table"'),
 (3,
  '0.023*"half" + 0.018*"gri" + 0.015*"women" + 0.014*"men" + 0.013*"ungc" + 0.013*"condensed" + 0.010*"sdg" + 0.010*"sasb" + 0.008*"esg" + 0.008*"gj"')]

In [29]:
# Our final LDA model (for now)
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=80)
ldana.print_topics()

[(0,
  '0.001*"course" + 0.001*"partnership" + 0.001*"allocated" + 0.001*"smooth" + 0.001*"announced" + 0.001*"seat" + 0.001*"certificates" + 0.001*"half" + 0.001*"care" + 0.001*"geographical"'),
 (1,
  '0.020*"explanatory" + 0.011*"corrections" + 0.011*"acquisitions" + 0.008*"treatment" + 0.008*"delays" + 0.008*"translation" + 0.006*"errors" + 0.006*"condensed" + 0.006*"decline" + 0.005*"owners"'),
 (2,
  '0.044*"qtr" + 0.015*"reclassifications" + 0.014*"land" + 0.011*"methodology" + 0.009*"servitudes" + 0.009*"discount" + 0.008*"november" + 0.008*"methods" + 0.008*"sustainability" + 0.007*"calculation"'),
 (3,
  '0.034*"half" + 0.017*"gri" + 0.014*"women" + 0.013*"men" + 0.012*"ungc" + 0.009*"sdg" + 0.009*"sasb" + 0.009*"land" + 0.009*"reclassifications" + 0.008*"claim"')]

In [30]:
corpus_transformed = ldana[corpusna]
list(zip([a for [(a,b)] in corpus_transformed], data_dtmna.index))

[(1, 'ignitis2020_q1_en_eur_con_ias.txt'),
 (3, 'ignitis2020_q2_en_eur_con_ias.txt'),
 (3, 'ignitis2020_q2_en_eur_con_ias_00.txt'),
 (2, 'ignitis2020_q3_en_eur_con_ias.txt')]