# Exploring Topic and Word Relationships

Here we explore different ways that the the newstest 2015 dataset can be looked at.

In [2]:
import math

def read_file(file_path, max_amount=math.inf):
    """
    Reads a number of lines from file and return list of lines
    :param file_path:
    :param max_amount:
    :return: list[str]
    """
    line_list = []

    with open(file=file_path) as f:
        curr_idx = 0
        while curr_idx < max_amount:
            line_list.append(f.readline())
            curr_idx += 1

    return line_list

print("Loading file")
l_raw = read_file('./trg.shuf', 5000000)
print("Finished loading in file")

Loading file
Finished loading in file


In [28]:
import math
import pandas as pd
from collections import Counter
import operator
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))
import gensim
from gensim import corpora



def process_word(word, remove_stop=True, lemmatize_and_morph=True):
    """
    Returns process word
    :param word:
    :return: str word
    """
    # lower
    word = word.lower()
    
    # Remove symbols
    if len(word) < 3:
        return None
    
    # Get morph
    if lemmatize_and_morph is True:
        t_word = wn.morphy(word)
        if t_word is not None:
            word = t_word

        # Get lemma
        word = WordNetLemmatizer().lemmatize(word)
    
    # Remove stop
    if remove_stop is True and word in en_stop:
        return None
    return word


def preprocess_all_words(word_list):
    ret = []
    for line in word_list:
        temp = []
        for word in line:
            w = process_word(word, remove_stop=True, lemmatize_and_morph=False)
            if w is not None:
                temp.append(w)
        ret.append(temp)
    return ret


def process_bpe(line_list, seq="@@", rm_words=["\n"], lemmatize_and_morph=True):
    """
    Transforms line list form read_file to list[list[str=words]], and bpe words merged together
    :param line_list:
    :param seq:
    :param rm_words:
    :return: list[list[str=words]]
    """
    full_word_list = []
    for line in line_list:
        temp_words = line.split()
        full_words = []
        idx = 0
        while idx < len(temp_words):
            if temp_words[idx].endswith(seq):
                temp_str = ""
                while temp_words[idx].endswith(seq):
                    temp_str += temp_words[idx][:-(len(seq))]
                    idx += 1
                temp_str += temp_words[idx]
                if temp_str not in rm_words:
                    w = process_word(temp_str, lemmatize_and_morph=lemmatize_and_morph)
                    if w is not None:
                        full_words.append(w)
                idx += 1
            else:
                if temp_words[idx] not in rm_words:
                    w = process_word(temp_words[idx], lemmatize_and_morph=lemmatize_and_morph)
                    if w is not None:
                        full_words.append(w)
                idx += 1
        full_word_list.append(full_words)
    return full_word_list


def get_word_count(word_list):
    """
    Returns word count.
    :param word_list:
    :return: List[(str=word, int=count)]
    """
    word_count = {}
    for line in word_list:
        for word, val in Counter(line).most_common():
            if word in word_count:
                word_count[word] += val
            else:
                word_count[word] = val

    return sorted(word_count.items(), key=operator.itemgetter(1))


def get_topics_lda(word_list, dictionary, num_topics=10):
    corpus = [dictionary.doc2bow(text) for text in word_list]
    print("Finished corpus")
    
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
    topics = ldamodel.print_topics(num_words=4)
    print("Topics: ")
    for topic in topics:
        print(topic)

    return topics, ldamodel, corpus, dictionary

import random
l = random.sample(l_raw, 50000)

wl = process_bpe(line_list=l)
print("Vocab size train: " + str(len(get_word_count(word_list=wl))))

wl_full = process_bpe(line_list=l_raw, lemmatize_and_morph=False)
print("Vocab size full: " + str(len(get_word_count(word_list=wl_full))))

dictionary = corpora.Dictionary(wl_full)
print("Finished Dictionary")

t, ldamodel,_, dictionary = get_topics_lda(word_list=preprocess_all_words(wl), dictionary=dictionary)




[nltk_data] Downloading package wordnet to /home/nikita/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/nikita/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Vocab size train: 47663
Vocab size full: 1001125
Finished Dictionary
Finished corpus
Topics: 
(0, '0.004*"work" + 0.003*"union" + 0.003*"know" + 0.003*"government"')
(1, '0.009*"night" + 0.008*"2016" + 0.006*"sat" + 0.003*"currently"')
(2, '0.002*"follow" + 0.001*"hard" + 0.001*"traffic" + 0.001*"utc"')
(3, '0.002*"love" + 0.002*"operation" + 0.001*"longer" + 0.001*"rest"')
(4, '0.006*"http" + 0.003*"2013" + 0.001*"animal" + 0.001*"upload"')
(5, '0.006*"say" + 0.006*"time" + 0.006*"also" + 0.005*"one"')
(6, '0.002*"legal" + 0.002*"visit" + 0.002*"number" + 0.001*"either"')
(7, '0.003*"end" + 0.002*"2007" + 0.002*"vote" + 0.002*"action"')
(8, '0.002*"strong" + 0.002*"run" + 0.002*"lord" + 0.001*"island"')
(9, '0.005*"post" + 0.004*"2010" + 0.004*"2015" + 0.003*"2011"')





# Visualization
Here we visualize with pyLDAvis in an interactive mode the LDA analysis.

In [21]:
import pyLDAvis.gensim
from gensim import corpora
dictionary = corpora.Dictionary(wl)
corpus = [dictionary.doc2bow(text) for text in wl]
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

ValidationError: 
 * Not all rows (distributions) in topic_term_dists sum to 1.

# Vocab Splitting

Here we get the vocab for each topic.

In [29]:
def get_representatives_lda(word_list, lda_model, dictionary, num_topics=10):
    max_line = [[]] * num_topics
    max_class = [0] * num_topics 
    class_vocab = {i:{} for i in range(0, num_topics)}
    
    for line, idx in zip(word_list, range(len(word_list))):
        #print(line)
        new_doc_bow = dictionary.doc2bow(line)
        s = ldamodel.get_document_topics(new_doc_bow)
        s.sort(key=lambda x: x[1])
        t_max_class, t_max_perc = s[-1]
        
        for word in line:
            if word in class_vocab[t_max_class]:
                class_vocab[t_max_class][word] += 1
            else:
                class_vocab[t_max_class][word] = 1
        
        if max_class[t_max_class] < t_max_perc:
            max_class[t_max_class] = t_max_perc
            max_line[t_max_class] = line
        
        if idx % 1000000 == 0:
            print("Representatives: " + str(idx))
        
    print("Finished representatives")
    return max_line, max_class, class_vocab

# max_l, max_c, class_vocab = get_representatives_lda(wl, ldamodel, dictionary)
"""
for idx in range(len(max_l)):
    print("\n")
    print("\n")
    print(max_c[idx])
    print(max_l[idx])
"""  

# get overlap
num_topics = 10

import numpy as np


def get_overlap_matrix(class_vocab, num_topics):
    over_lap_matrix = np.ones((num_topics, num_topics))

    for c in range(0, num_topics):
        for c2 in range(0, num_topics):
            # Skip self
            if c2 == c:
                continue
            # overlap
            words_in_c2_and_c1 = 0
            total_words_c2 = 0
            for word in class_vocab[c2]:
                total_words_c2 += 1  # class_vocab[c2][word]
                if word in class_vocab[c]:
                    words_in_c2_and_c1 += 1
            c_factor = len(class_vocab[c])/(len(class_vocab[c2]) + len(class_vocab[c]))
            c2_factor = len(class_vocab[c2])/(len(class_vocab[c2]) + len(class_vocab[c]))
            over_lap_matrix[c, c2] = (words_in_c2_and_c1/len(class_vocab[c2])) * c2_factor + (words_in_c2_and_c1/len(class_vocab[c])) * c_factor
    
    return over_lap_matrix


over_lap_matrix = get_overlap_matrix(get_representatives_lda(wl_full, ldamodel, dictionary, num_topics=num_topics)[2], num_topics)

print("Overlap Matrix of Vocab")
print(pd.DataFrame(over_lap_matrix))
print("\n")

print("Average for each topic")
# Remove diagonal as it skews data
print(pd.DataFrame(np.average(over_lap_matrix[~np.eye(over_lap_matrix.shape[0],dtype=bool)].reshape(over_lap_matrix.shape[0],-1), axis=1)))
print("\n")

print("Average overall")
print(np.average(over_lap_matrix[~np.eye(over_lap_matrix.shape[0],dtype=bool)].reshape(over_lap_matrix.shape[0],-1)))

print("\n")
print("Max")
print(np.max(over_lap_matrix[~np.eye(over_lap_matrix.shape[0],dtype=bool)].reshape(over_lap_matrix.shape[0],-1)))
                

Representatives: 0
Representatives: 1000000
Representatives: 2000000
Representatives: 3000000
Representatives: 4000000
Finished representatives
Overlap Matrix of Vocab
          0         1         2         3         4         5         6  \
0  1.000000  0.262265  0.301333  0.314347  0.249747  0.376862  0.302214   
1  0.262265  1.000000  0.329125  0.356942  0.260619  0.164475  0.332742   
2  0.301333  0.329125  1.000000  0.370811  0.278851  0.194584  0.353065   
3  0.314347  0.356942  0.370811  1.000000  0.287529  0.201706  0.357996   
4  0.249747  0.260619  0.278851  0.287529  1.000000  0.176875  0.281677   
5  0.376862  0.164475  0.194584  0.201706  0.176875  1.000000  0.195910   
6  0.302214  0.332742  0.353065  0.357996  0.281677  0.195910  1.000000   
7  0.431245  0.280087  0.313813  0.337540  0.254447  0.348640  0.310561   
8  0.292112  0.354993  0.353636  0.386545  0.272880  0.184643  0.347523   
9  0.300844  0.293045  0.292126  0.311669  0.249143  0.212005  0.301896   

      

# Results



## LDA
### 10 Topics, 50000 clustering, 5 million testing:

With stop words, overall average roughly 36% (max 44%, small deviation)

Without stop words, overall average roughly 29% (max 43%).

### TODO: 5, 20 Topics