## Iterative feature expansion

Based on the paper <i>Language Model Information Retrieval with Document Expansion</i>. The idea is to reduce feature sparsity by incorporating information from each example's nearest neighbors. As the paper puts it, "we are looking for a new enlarged document d\` for each document d in a text collection, such that the new document d\` can be used to estimate the hidden generative model of d more accurately". The document features are TF-IDF vectors. To measure the similarity between documents/examples, we use the cosine similarity of their average word embeddings.

In [1]:
from collections import Counter,defaultdict
import numpy as np
import pandas as pd
import json
import heapq

import warnings
warnings.filterwarnings("ignore")

### Cleaning the data

In [2]:
# Load the precomputed average word embeddings from disk. They are indexed by
# example: avg_emb[i] is the "key" used to compare example i to other examples.
avg_emb = np.load("../data/features/avg_embedding.npy")

In [3]:
# Cache the L2 norm of every example's average embedding, keyed by example
# index, so later cells can look a norm up instead of recomputing it.
vector_norms = {idx: np.linalg.norm(emb) for idx, emb in enumerate(avg_emb)}

In [None]:
# For every example, find the 100 other examples with the highest cosine
# similarity between average embeddings.
# closest_neighbors[i] is a list of (cos_sim, other_index) tuples, sorted from
# most to least similar.
closest_neighbors = {}
num_examples = len(avg_emb)
for i in range(num_examples):
    # Bounded min-heap of (cos_sim, other_index): its root is always the
    # weakest similarity currently kept, i.e. the next one to be evicted.
    top_heap = []
    for j in range(num_examples):
        if j == i: # an example is never its own neighbor
            continue
        similarity = float(np.dot(avg_emb[i],avg_emb[j])/(vector_norms[i]*vector_norms[j]))
        candidate = (similarity,j)
        if len(top_heap) < 100:
            heapq.heappush(top_heap,candidate)
        elif similarity > top_heap[0][0]:
            # strictly better than the weakest kept entry: swap it in
            heapq.heapreplace(top_heap,candidate)

    # the heap holds at most 100 entries, so a descending sort yields the ranking
    closest_neighbors[i] = sorted(top_heap,reverse=True)

In [5]:
# Persist the neighbor lists so the expensive pairwise computation does not
# have to be repeated. JSON object keys are strings, so the integer example
# indices come back as strings when this file is loaded again.
with open("closest_100_neighbors.json","w+") as neighbors_out:
    json.dump(closest_neighbors,neighbors_out)

In [4]:
# Reload the cached neighbor lists. The context manager closes the file handle
# as soon as parsing is done (it was previously left open).
# After the JSON round-trip the keys of closest_neighbors are strings ("0", "1", ...)
# and every neighbor entry is a two-element list [cos_sim, other_index].
with open("closest_100_neighbors.json") as closest_neighbors_file:
    closest_neighbors = json.load(closest_neighbors_file)

### Feature expansion implementation

In [5]:
def expand_features(feature_fname,closest_neighbors,k=10,lam=0.5):
    """ Expands every example's features with a similarity-weighted mix of its neighbors' features.

    Loads the feature matrix "../data/features/<feature_fname>.npy" and, for each row i, computes

        new_i = lam * old_i + (1 - lam) * sum_j (cos_sim_ij / sum_j cos_sim_ij) * old_j

    where j runs over the first k entries of the neighbor list of i.

    args:
        feature_fname: name of the feature file (without the ".npy" extension) inside "../data/features/"
        closest_neighbors: mapping from example index to a list of (cos_sim, other_index) pairs.
            Only the first k entries are used, so the list is expected to be ordered most similar first.
            Keys may be strings (as after a JSON round-trip) or integers (the in-memory dict).
        k: the number of "similar" documents to consider (neighborhood size)
        lam: the amount of attention which should be paid to original document;
            the neighborhood mix receives the remaining (1 - lam)
    returns:
        (new_features, median_extra_tokens, mean_extra_tokens): the stacked expanded features,
        followed by the median and mean (rounded to 2 decimals) of the per-example increase in the
        number of non-zero features.
    raises:
        KeyError: if an example index is missing from closest_neighbors (as str and as int)
    """
    old_features = np.load("../data/features/"+feature_fname+".npy") # tf-idf
    new_features = []
    differences_in_token_count = []
    for i, old_feat in enumerate(old_features):
        # JSON turns integer keys into strings; accept both forms of the mapping
        key = str(i) if str(i) in closest_neighbors else i
        neighbors = closest_neighbors[key][0:k] # k closest neighbors

        # normalise the similarities so that the neighbor weights sum to 1
        cos_sim_total = sum(cos_sim for cos_sim, _ in neighbors)
        if cos_sim_total != 0:
            weights = [cos_sim/cos_sim_total for cos_sim, _ in neighbors]
        else:
            # similarities that sum to zero cannot be normalised: weight the
            # neighbors uniformly instead of dividing by zero (no-op when k == 0)
            weights = [1.0/len(neighbors) for _ in neighbors]

        neighborhood = np.zeros(old_feat.shape)
        for weight, (_, data_index) in zip(weights, neighbors): # weighted sum of old features
            neighborhood += weight*old_features[data_index]
        new_feat = (lam*old_feat)+((1-lam)*neighborhood)
        new_features.append(new_feat)

        # how many additional non-zero features did the expansion introduce?
        diff_token_count = np.count_nonzero(new_feat)-np.count_nonzero(old_feat)
        differences_in_token_count.append(diff_token_count)

    new_features = np.stack(new_features)
    median_extra_tokens = round(np.median(differences_in_token_count),2)
    mean_extra_tokens = round(np.average(differences_in_token_count),2)
    return new_features,median_extra_tokens,mean_extra_tokens

In [6]:
# Grid of experiments: every tf-idf feature set is expanded for each
# combination of neighborhood size (k) and original-document weight (lam).
# Each expanded matrix is saved next to its source file, and the
# (median, mean) increase in non-zero features is printed per run.
features = ["tfidf_7572","tfidf_stacked_8_7572_10838","tfidf_topics_7870"] # different tf-idf features
k_s = [10,50]
lam_s = [0.75,0.5,0.25]

# same iteration order as nesting the loops: k outermost, feature set innermost
grid = ((k_val,lam_val,name) for k_val in k_s for lam_val in lam_s for name in features)
for k,lam,feature_name in grid:
    expanded = expand_features(feature_name,closest_neighbors,k=k,lam=lam)
    new_feature,median_extra_tokens,mean_extra_tokens = expanded
    out_path = "../data/features/"+feature_name+"_k={}_lam={}".format(k,lam)+".npy"
    np.save(out_path,new_feature)
    print("k={}; lam={}; {}".format(k,lam,feature_name))
    print((median_extra_tokens,mean_extra_tokens))
    print("-------------------------------------")

k=10; lam=0.75; tfidf_7572
(27.0, 40.77)
-------------------------------------
k=10; lam=0.75; tfidf_stacked_8_7572_10838
(51.0, 74.35)
-------------------------------------
k=10; lam=0.75; tfidf_topics_7870
(25.0, 37.14)
-------------------------------------
k=10; lam=0.5; tfidf_7572
(27.0, 40.77)
-------------------------------------
k=10; lam=0.5; tfidf_stacked_8_7572_10838
(51.0, 74.35)
-------------------------------------
k=10; lam=0.5; tfidf_topics_7870
(25.0, 37.14)
-------------------------------------
k=10; lam=0.25; tfidf_7572
(27.0, 40.77)
-------------------------------------
k=10; lam=0.25; tfidf_stacked_8_7572_10838
(51.0, 74.35)
-------------------------------------
k=10; lam=0.25; tfidf_topics_7870
(25.0, 37.14)
-------------------------------------
k=50; lam=0.75; tfidf_7572
(140.0, 166.02)
-------------------------------------
k=50; lam=0.75; tfidf_stacked_8_7572_10838
(251.0, 291.57)
-------------------------------------
k=50; lam=0.75; tfidf_topics_7870
(125.0, 144