# Tutorial for stopword filtering

Interactive notebook for evaluation of stopword lists using topic models.

In [1]:
## import packages

%load_ext autoreload
%autoreload 2

import os,sys
import numpy as np
import pandas as pd

import timeit
from memory_profiler import memory_usage

# display the figure in the notebook
# %matplotlib inline
# import matplotlib.pyplot as plt
# cmap = 'tab10'
# cm = plt.get_cmap(cmap)

## custom packages
# Make the project's local modules importable. 'src' is a relative path, so it is
# resolved against the current working directory of the kernel.
src_dir = os.path.join( 'src')
sys.path.append(src_dir)

from filter_words import run_stopword_statistics
from filter_words import make_stopwords_filter
from filter_words import remove_stopwords_from_list_texts

from real_corpora import tranfer_real_corpus_toID_and_shuffle
from ldavb import ldavb_inference_terminal, obtain_ldavb_cpuTime_memory
from evaluation import obtain_nmi_unsup, state_dwz_nmi

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress

Slow version of gensim.models.word2vec is being used

Slow version of gensim.models.doc2vec is being used


## 1) Load corpus

Get the 20 Newsgroups corpus. These are news articles from 20 different categories (newsgroups).

We get a list of documents, where each entry is a list of tokens

In [2]:
## location of the corpus file: <parent dir>/data/<corpus_name>_corpus.csv
corpus_name = '20NewsGroup'
filename = os.path.join(os.pardir, 'data', corpus_name + '_corpus.csv')
df = pd.read_csv(filename, index_col=0)

## tokenize every document on whitespace; split() without arguments already
## discards surrounding whitespace, so the tokens need no further stripping
list_texts = [doc.split() for doc in df['text']]
list_texts[0] ## this is the first doc

['new',

 'religion',

 'forming',

 'sign',

 'yawn',

 'the',

 'church',

 'kibology',

 'did',

 'first',

 'and',

 'better']

In [5]:
## get topic labels and convert them to integer ids
list_topics = list(df['label'])

## Sort the unique labels so that the label -> id mapping is the same on every
## run: the iteration order of a set of strings changes between kernel restarts
## (string hash randomization), which made the ids irreproducible.
list_topics_unique = sorted(set(list_topics))

## Dictionary lookup instead of list.index(), which rescans the list of unique
## labels once per document.
topic_to_id = {topic: topic_id for topic_id, topic in enumerate(list_topics_unique)}
list_topics_id = [topic_to_id[topic] for topic in list_topics]

list_topics[:10] ## the category labels of the first 10 documents
# len(list_topics_unique), len(list_topics_id), list_topics_id[:10]

['talk_religion_misc',

 'talk_religion_misc',

 'talk_religion_misc',

 'talk_religion_misc',

 'talk_religion_misc',

 'talk_religion_misc',

 'talk_religion_misc',

 'talk_religion_misc',

 'talk_religion_misc',

 'talk_religion_misc']

## 2) Get stopword statistics

We calculate different statistics for each word in order to construct different stopword-filters:

- F, relative frequency
- I, Information content
- tfidf, term-frequency-inverse-document-frequency
- manual, whether the word occurs in the manual stopword list (1), otherwise nan


- H, empirical conditional entropy
- H-tilde, expected conditional entropy from randomized null model
- N, frequency (number of counts)



In [7]:
# %%time
## path to a manual stopword list (this one is from mallet)
path_stopword_list =  os.path.join(os.pardir,'data','stopword_list_en')

## number of realizations for the random null model
N_s = 10

## get the statistics
## NOTE: this rebinds `df`. From here on it holds the per-word statistics and no
## longer the corpus dataframe loaded in step 1, so the cells of step 1 cannot be
## re-run after this one without reloading the corpus first.
df = run_stopword_statistics(list_texts,N_s=N_s,path_stopword_list=path_stopword_list)

## look at the entries: the five words with the highest relative frequency F
df.sort_values(by='F',ascending=False).head()

Unnamed: 0,F,I,tfidf,manual,H,H-tilde,H-tilde_std,N
the,0.062401,0.245335,1.007189,1.0,12.982312,13.227648,0.003651,239094
and,0.024848,0.332028,1.142009,1.0,12.800792,13.132819,0.007405,95205
that,0.016991,0.29224,1.679582,1.0,12.76476,13.057,0.006968,65103
for,0.011996,0.053331,1.088629,1.0,12.916255,12.969587,0.008276,45965
you,0.01162,0.456266,2.252701,1.0,12.497241,12.953507,0.008952,44521


## 3)  Construct a stopword filter

We construct different stopword filters based on different statistics.

For this we have to specify 3 different components:

- A) method; this specifies the statistic that we use to construct the stopword list. In detail, we define a statistic $S(w)$ and assign words to the stopword list in order of increasing $S(w)$, i.e. from low to high values (e.g. $S(w) = F(w)$ assigns low-frequency words to the stopword list first). Possible options are:

    - 'INFOR',  filter words with high values of Information-content I [S=-I]
    - 'BOTTOM', filter words with low values of frequency [S = F]
    - 'TOP', filter words with high values of frequency [S = 1/F]
    - 'TFIDF', filter words with low values of tfidf [S=tfidf]
    - 'TFIDF_r', filter words with high values of tfidf [S=-tfidf]
    - 'MANUAL', filter words from manual stopword list; supply path via path_stopword_list (S = 1 if word is in the list, else it is nan, i.e. the word cannot be considered for removal).
        
        
- B) cutoff_type [defines the way in which we choose the cutoff]

     - 'p', selects stopword list such that a fraction p of tokens gets removed (approximately)
     - 'n', selects stopword list such that a number n of types gets removed
     - 't', selects stopword list such that all words with S<=S_t get removed
    
    
 
- C) cutoff_val [defines the value on which to do the thresholding, see cutoff_type for details]



Below you can select different options and inspect the result.

The resulting dataframe ```df_filter``` contains the words that were assigned to the stopword list based on the selection criteria.

In [8]:
## method-options: uncomment exactly one line to choose the statistic S
# method = 'INFOR'
# method = 'BOTTOM'
# method = 'TOP'
# method = 'TFIDF'
# method = 'TFIDF_r'
method = 'MANUAL'



## cutoff-options: uncomment exactly one (cutoff_type, cutoff_val) pair

## 'p': remove (approximately) a fraction cutoff_val of all tokens
cutoff_type = 'p'
cutoff_val = 0.4

## 'n': remove a number cutoff_val of types
# cutoff_type = 'n'
# cutoff_val = 10

## 't': remove all words whose statistic satisfies S <= cutoff_val
# cutoff_type = 't'
# cutoff_val = 1

## build the filter from the per-word statistics held in `df`
df_filter = make_stopwords_filter(df,
                                  method = method,
                                  cutoff_type = cutoff_type, 
                                  cutoff_val = cutoff_val, )

In [9]:
## inspect the words that were assigned to the stopword list
df_filter

Unnamed: 0,F-cumsum,S
able,0.000443,1.0
about,0.004027,1.0
above,0.004478,1.0
according,0.004690,1.0
accordingly,0.004695,1.0
across,0.004820,1.0
actually,0.005356,1.0
after,0.006414,1.0
afterwards,0.006432,1.0
again,0.007023,1.0


## 4) Apply the stopword-filter to remove the words from the list of texts

We inspect one particular document for the effect of the stopword filter.

We report the remaining fraction of tokens in the filtered list of texts.

In [10]:
## words selected by the filter, and the corpus with those words removed
list_words_filter = df_filter.index.tolist()
list_texts_filter = remove_stopwords_from_list_texts(list_texts, list_words_filter)

## effect of the filter on the first document
print('Original text:', list_texts[0])
print('Filtered text:', list_texts_filter[0])

## total number of tokens before and after filtering
N = sum(map(len, list_texts))
N_filter = sum(map(len, list_texts_filter))
print('Remaining fraction of tokens',N_filter/N)

Original text: ['new', 'religion', 'forming', 'sign', 'yawn', 'the', 'church', 'kibology', 'did', 'first', 'and', 'better']

Filtered text: ['religion', 'forming', 'sign', 'yawn', 'church', 'kibology']

Remaining fraction of tokens 0.6004331396175813


In [11]:
# len(list_texts), len(list_texts_filter)

Note that:
In our work, if documents become empty after stopword removal, we remove these empty documents from the corpus and randomly assign a category to them in the document classification evaluation task.

## 5) Run topic modeling algorithm: ldavb & evaluation

After running the topic modeling algorithm, we retrieve the following metrics for its performance:

- Accuracy

- Reproducibility

- Time and memory

- Coherence

Note that the results for the LDAVB topic model algorithm shown here are reported in Supplementary Figures 8 A and B.

In [12]:
# input_k is the assumed number of topics for LDAVB
# (20 here, the same as the number of newsgroup categories in the corpus)
input_k = 20

In [13]:
# %%time
# Run the topic model
# Prepare the filtered corpus and the integer category ids for inference.
# NOTE(review): judging by the helper's name it converts tokens to ids and shuffles
# documents together with their labels -- confirm in real_corpora.py.
shuffle_texts_list, shuffle_topic_list = tranfer_real_corpus_toID_and_shuffle(list_texts_filter, list_topics_id)
# Fit LDAVB with input_k topics; the returned dictionary is inspected in the next cell.
# NOTE(review): flag_coherence=1 presumably requests the 'coherence' entry -- confirm in ldavb.py.
dict_output_topicModel = ldavb_inference_terminal(shuffle_texts_list, input_k, flag_coherence=1)

In [14]:
# output results of ldavb: list the entries available in the result dictionary
dict_output_topicModel.keys()

dict_keys(['p_td_infer', 'p_wt_infer', 'coherence', 'state_dwz_infer'])

### Metric 1: Accuracy

We measure accuracy by calculating the normalized mutual information (NMI) between each document's category label assigned from the inferred topic model (the most likely topic) and the metadata's category label.

In [16]:
## the inferred topic distribution for each document
p_td_infer = dict_output_topicModel['p_td_infer']

## we compare the topic distribution with the category labels
## (shuffle_topic_list are the labels returned alongside the shuffled corpus,
## so they are the ones that belong to the documents passed to the topic model)
unsupervised_classification_nmi = obtain_nmi_unsup(shuffle_topic_list, p_td_infer)
unsupervised_classification_nmi

0.3962771831426493


### Metric 2: reproducibility

We measure reproducibility by comparing the solutions of two runs of the same topic model. Thus, we re-run the inference and compare with the previous inference result.

In [14]:
# %%time 
# second, independent inference on the same input and with the same number of topics;
# its result is compared with the first run below
dict_output_topicModel_2 = ldavb_inference_terminal(shuffle_texts_list, input_k, flag_coherence=1)

In [15]:
## topic-labels for each word token in 1st run
state_dwz_infer = dict_output_topicModel['state_dwz_infer']

## topic-labels for each word token in 2nd run
state_dwz_infer_2 = dict_output_topicModel_2['state_dwz_infer']

In [16]:
## compare topic labels from 1st and 2nd run
## (input_k is passed twice: both runs were fitted with the same number of topics)
reproducibility_final = state_dwz_nmi(state_dwz_infer, state_dwz_infer_2, input_k, input_k)
reproducibility_final

0.1834962449776976

## Additional metrics

### get the computational time and memory used in the process

In [17]:
# %%time
# measure the cost of one LDAVB inference on the filtered corpus
# NOTE(review): units are not visible here -- presumably seconds for elapsed_time and
# MiB for increment_memory (memory_profiler convention); confirm in ldavb.py
elapsed_time, increment_memory = obtain_ldavb_cpuTime_memory(shuffle_texts_list, input_k)

In [18]:
# display the measured time and memory increment
elapsed_time, increment_memory

(33.04066830701777, 20.0)

### get the mean coherence over all topics

In [19]:
# coherence values from the first LDAVB run
# NOTE(review): presumably one value per topic, as the section title suggests -- confirm
coherence_array = dict_output_topicModel['coherence']
# average over all entries of the array
coherence_mean = coherence_array.mean()
coherence_mean

-1.9597395901826817

## end

In [20]:
# no-op cell marking the end of the notebook
1

1