## TF-IDF with clustering reduction

Based on the paper <i>Fuzzy Bag-of-Words Model for Document Representation</i>. The idea is to use clustering to condense individual words into groups of semantically similar words before generating TF-IDF features. Among other things, this increases the co-occurrence information available for rare terms, which is important when the texts are very short. Note: "topic" is used throughout to mean a group of words with similar meanings.

In [1]:
from collections import Counter,defaultdict
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import ast
import json
import spacy
from sklearn.cluster import AgglomerativeClustering,KMeans # for clustering word embeddings
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

### Data cleaning

In [2]:
# read the pre-processed word embeddings from disk
with open("../data/word_vectors.json") as word_vector_file:
    word_embeddings_dict = json.load(word_vector_file)

# replace every stored 'vec' entry by its np.array equivalent (done in place, keys are untouched)
for word,entry in word_embeddings_dict.items():
    entry['vec'] = np.array(entry['vec'])

In [3]:
cdf = pd.read_csv("../data/cleaned_data.csv")
# the token lists are stored in the csv as strings; literal_eval parses each one back into a Python object
cdf['problem_tokens'] = cdf['problem_tokens'].apply(ast.literal_eval)
problem_tokens = cdf['problem_tokens'].tolist()
corpus = problem_tokens # alias: the same list object is used as the corpus below

In [4]:
# flatten the per-problem token lists into one long list of tokens
all_tokens = []
for problem in problem_tokens:
    all_tokens.extend(problem)

# number of occurrences of every distinct token over the whole corpus
token_counts = Counter(all_tokens)

# a word is "common" when it occurs more than once; every other word is "uncommon"
common_words = []
uncommon_words = []
for word in token_counts:
    (common_words if token_counts[word]>1 else uncommon_words).append(word)

len(common_words),len(uncommon_words)

(7572, 5021)

In [5]:
# mean and median of the per-token occurrence counts
# (a median this far below the mean shows that most tokens are rare, i.e. the texts are short)
sorted_counts = sorted(token_counts.items(),key=lambda tup: tup[1]) # (token, count) pairs, ascending by count
counts = [occurrences for _,occurrences in sorted_counts]
unigram_avg = round(float(np.average(counts)),1)
unigram_median = round(float(np.median(counts)),1)

print((unigram_avg,unigram_median))

(33.0, 2.0)


In [6]:
# look up the embedding of every word (common words first, then uncommon ones)
# and stack them so that row i holds the vector of all_words[i]
all_words = common_words+uncommon_words
all_word_vectors = np.stack([word_embeddings_dict[word]['vec'] for word in all_words])

all_word_vectors.shape,len(all_words)

((12593, 300), 12593)

### Standard condensed representations using only common words

Hierarchical (agglomerative) clustering is used because it lets us set a distance threshold instead of fixing the number of clusters in advance.

In [7]:
def get_topic_assignments_per_word(clusterer,words,word_embeddings):
    """ clusters the word embeddings and maps the resulting labels back onto the words
        -clusterer: object exposing fit_predict(word_embeddings), which yields one label per embedding
        -words: the words, in the same order as the rows of word_embeddings
        -returns (word -> cluster label, cluster label -> list of the words carrying that label)
    """
    predicted_labels = clusterer.fit_predict(word_embeddings)

    word_cluster_assignment = {} # word -> label of the cluster it was put in
    cluster_words_assignment = defaultdict(list) # label -> member words, in input order

    # the word at a given position is paired with the label at that same position
    for position,word in enumerate(words):
        label = predicted_labels[position]
        cluster_words_assignment[label].append(word)
        word_cluster_assignment[word] = label

    return word_cluster_assignment,cluster_words_assignment

In [8]:
def get_summary_stats_for_topic_clusters(token_counts,cluster_words_assignment):
    """ returns (mean, median) of the occurrence counts of the topics, each rounded to one decimal
        -token_counts: mapping word -> number of occurrences
        -cluster_words_assignment: mapping topic -> list of the words in that topic
        -the count of a topic is the sum of the counts of the words it contains
    """
    occurrences_per_topic = [
        sum(token_counts[word] for word in words_in_topic)
        for words_in_topic in cluster_words_assignment.values()
    ]

    mean = round(float(np.average(occurrences_per_topic)),1)
    median = round(float(np.median(occurrences_per_topic)),1)
    return mean,median

In [12]:
# embeddings of the common words only; row i holds the vector of common_words[i]
common_word_vectors = np.stack([word_embeddings_dict[word]['vec'] for word in common_words])
print(common_word_vectors.shape,len(common_words))

((7572, 300), 7572)

In [17]:
def get_tfidf_features_for_reduced_topics_common(old_corpus,word_cluster_assignment,common_words):
    """ builds dense tf-idf features over topic tokens instead of over the raw words
        -old_corpus: iterable of documents, each document being a sequence of word tokens
        -every word that is not in common_words is dropped from its document
        -every remaining word is replaced by "topic<label>", label being word_cluster_assignment[word]
        -returns a 2-D np.array holding the tf-idf weights, one row per document
    """
    kept_vocabulary = set(common_words) # set membership keeps the per-token filter cheap

    # drop the uncommon words and swap the rest for their topic token, document by document
    topic_corpus = [
        ["topic"+str(word_cluster_assignment[word]) for word in document if word in kept_vocabulary]
        for document in old_corpus
    ]

    # the documents are already token lists, so the vectorizer must neither preprocess nor tokenize them
    identity = lambda x:x
    vectorizer = TfidfVectorizer(ngram_range=(1,1),max_features=None,lowercase=False,preprocessor=identity,tokenizer=identity)
    sparse_features = vectorizer.fit_transform(topic_corpus)
    return np.array(sparse_features.todense())

In [18]:
# no fixed number of clusters (n_clusters=None): the number of topics follows from distance_threshold=6.0
clusterer = AgglomerativeClustering(n_clusters=None,compute_full_tree=True,distance_threshold=6.0)
# cluster the common words only, then build tf-idf features over the resulting topics
word_cluster_assignment,cluster_words_assignment = get_topic_assignments_per_word(clusterer,common_words,common_word_vectors)
tfidf_vectors = get_tfidf_features_for_reduced_topics_common(corpus,word_cluster_assignment,common_words)
# shape of the resulting feature matrix
print(tfidf_vectors.shape)

(67152, 4440)


In [20]:
# same pipeline as for distance_threshold=6.0, rerun with distance_threshold=7.0
clusterer = AgglomerativeClustering(n_clusters=None,compute_full_tree=True,distance_threshold=7.0)
# cluster the common words only, then build tf-idf features over the resulting topics
word_cluster_assignment,cluster_words_assignment = get_topic_assignments_per_word(clusterer,common_words,common_word_vectors)
tfidf_vectors = get_tfidf_features_for_reduced_topics_common(corpus,word_cluster_assignment,common_words)
# shape of the resulting feature matrix
print(tfidf_vectors.shape)

(67152, 3048)


### Variant of the standard approach in which only uncommon words are grouped

In this case, uncommon words are defined as those with at most 2 occurrences, 2 being the median token count.

In [13]:
# split the vocabulary around the median count of 2:
# a word is common only when it occurs more than twice, otherwise it is uncommon
median_common_words = [] # words with > 2 occurrences
median_uncommon_words = [] # words with <= 2 occurrences
for word in token_counts:
    (median_common_words if token_counts[word]>2 else median_uncommon_words).append(word)

len(median_common_words),len(median_uncommon_words)

(5952, 6641)

In [16]:
# embeddings of the uncommon words only; row i holds the vector of median_uncommon_words[i]
median_uncommon_word_vectors = np.stack([word_embeddings_dict[word]['vec'] for word in median_uncommon_words])

median_uncommon_word_vectors.shape,len(median_uncommon_words)

((6641, 300), 6641)

In [22]:
def get_tfidf_features_for_reduced_topics_cluster_uncommon_words(corpus,word_cluster_assignment,cluster_words_assignment,median_common_words):
    """ builds tf-idf features in which only the uncommon words are condensed into topics
        -corpus: documents given as sequences of word tokens (it is traversed twice)
        -words in median_common_words keep their own tf-idf column
        -every other word is replaced by "topic<label>", label being word_cluster_assignment[word]
        -cluster_words_assignment is accepted but not used by this function
        -returns the two dense tf-idf matrices stacked side by side (common-word columns first)
    """
    frequent_vocabulary = set(median_common_words)

    def dense_tfidf(documents):
        # documents are already token lists, so preprocessing and tokenizing are both the identity
        identity = lambda x:x
        vectorizer = TfidfVectorizer(ngram_range=(1,1),max_features=None,lowercase=False,preprocessor=identity,tokenizer=identity)
        return np.array(vectorizer.fit_transform(documents).todense())

    # feature set 1: the common words, kept as they are
    common_only_corpus = [
        [token for token in tokens if token in frequent_vocabulary]
        for tokens in corpus
    ]
    common_features = dense_tfidf(common_only_corpus)

    # feature set 2: the uncommon words, each replaced by the topic it was clustered into
    topic_only_corpus = [
        ["topic"+str(word_cluster_assignment[token]) for token in tokens if token not in frequent_vocabulary]
        for tokens in corpus
    ]
    topic_features = dense_tfidf(topic_only_corpus)

    return np.hstack([common_features,topic_features])

In [23]:
# no fixed number of clusters (n_clusters=None): the number of topics follows from distance_threshold=6.0
clusterer = AgglomerativeClustering(n_clusters=None,compute_full_tree=True,distance_threshold=6.0)
# cluster only the uncommon words (<= 2 occurrences); common words keep their own tf-idf columns
word_cluster_assignment,cluster_words_assignment = get_topic_assignments_per_word(clusterer,median_uncommon_words,median_uncommon_word_vectors)
tfidf_vectors = get_tfidf_features_for_reduced_topics_cluster_uncommon_words(corpus,word_cluster_assignment,cluster_words_assignment,median_common_words)
# shape of the stacked feature matrix
print(tfidf_vectors.shape)

(67152, 11041)


In [26]:
# same pipeline as for distance_threshold=6.0, rerun with distance_threshold=7.0
clusterer = AgglomerativeClustering(n_clusters=None,compute_full_tree=True,distance_threshold=7.0)
# cluster only the uncommon words (<= 2 occurrences); common words keep their own tf-idf columns
word_cluster_assignment,cluster_words_assignment = get_topic_assignments_per_word(clusterer,median_uncommon_words,median_uncommon_word_vectors)
tfidf_vectors = get_tfidf_features_for_reduced_topics_cluster_uncommon_words(corpus,word_cluster_assignment,cluster_words_assignment,median_common_words)
# shape of the stacked feature matrix
print(tfidf_vectors.shape)

(67152, 9709)
