In [None]:
from utilities import url_to_corpus

# Corpus locations, relative to the notebook's working directory.
cranfield_docs_url = "cranfield/cran_docs.json"
cranfield_queries_url = "cranfield/cran_queries.json"

# Load the 'body' field of the documents and the 'query' field of the queries.
# NOTE(review): presumably `docs`/`queries` are lists of token lists and the
# second return value is the set of distinct tokens — confirm in
# utilities.url_to_corpus.
docs,types_docs = url_to_corpus(cranfield_docs_url,'body')
queries,types_queries = url_to_corpus(cranfield_queries_url,'query')

# Combined document + query vocabulary, as a list.
types = list(types_docs.union(types_queries))

In [None]:
import gensim

# Pre-trained word vectors in word2vec format: fastText as text, GoogleNews as binary.
model_fasttext = gensim.models.KeyedVectors.load_word2vec_format("fasttext.vec", binary=False)
# model_glove = gensim.models.KeyedVectors.load_word2vec_format('glove.6B.300d.txt', binary=False)
# Forward slash: a backslash here is an invalid escape sequence ("\G") and only
# resolves as a directory separator on Windows.
model_word2vec = gensim.models.KeyedVectors.load_word2vec_format('pretrained_vectors/GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
from utilities import docs_to_embeddings

# Embed documents and queries with each pre-trained model.
# NOTE(review): how docs_to_embeddings pools word vectors into one vector per
# text is not visible here — see utilities.docs_to_embeddings.
doc_embeddings_fasttext = docs_to_embeddings(docs, model_fasttext)
doc_embeddings_word2vec = docs_to_embeddings(docs, model_word2vec)
query_embeddings_fasttext = docs_to_embeddings(queries, model_fasttext)
query_embeddings_word2vec = docs_to_embeddings(queries, model_word2vec)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
import numpy as np
from math import log

def compute_pmi(cooc_matrix):
    """Return the positive pointwise mutual information of a count matrix.

    Entry ``(i, j)`` is ``max(0, log(c_ij * N / ((r_i + 0.1) * (k_j + 0.1))))``
    where ``c_ij`` is the count, ``N`` the sum of all counts, ``r_i`` the i-th
    row total and ``k_j`` the j-th column total. The ``0.1`` keeps the
    denominator non-zero for all-zero rows/columns.

    Parameters
    ----------
    cooc_matrix : 2-D array-like of counts.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape; zero counts and negative PMI map to 0.
    """
    counts = np.asarray(cooc_matrix, dtype=float)
    total = counts.sum()
    row_totals = counts.sum(axis=1, keepdims=True) + 0.1
    col_totals = counts.sum(axis=0, keepdims=True) + 0.1

    # log(0) = -inf is expected for zero counts and is clipped below, so the
    # divide-by-zero warning is silenced rather than raised once per cell.
    with np.errstate(divide='ignore', invalid='ignore'):
        pmi = np.log(counts * total / (row_totals * col_totals))

    # Same rule as max(0, pmi): anything that is not strictly positive becomes 0.
    return np.where(pmi > 0, pmi, 0.0)

tfidf = TfidfVectorizer()
# One whitespace-joined string per document, as the vectorizers expect raw text.
documents = [' '.join(doc) for doc in docs]
vectorizer = CountVectorizer()

# Fit once and reuse: term-term co-occurrence = (doc-term)^T . (doc-term).
count_matrix = vectorizer.fit_transform(documents)
cooc_matrix = count_matrix.T.dot(count_matrix).toarray()
pmi_matrix = compute_pmi(cooc_matrix)
tfidf_matrix = tfidf.fit_transform(documents)

# Reduce each term's PMI row to 300 dimensions. The randomized solver is seeded
# so that re-running the notebook reproduces the same embeddings.
svd = TruncatedSVD(n_components=300, random_state=42)
embedding_matrix = svd.fit_transform(pmi_matrix)
# L2-normalised copy of the term embeddings (one unit-length row per term).
embeddings_svd = normalize(embedding_matrix, norm='l2', axis=1)

In [None]:

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(documents)

# Document embedding = TF-IDF weighted average of its term embeddings.
# Column j of tfidf_matrix is paired with row j of embedding_matrix, so the
# whole corpus is one sparse-dense product instead of a Python loop over every
# (document, term) pair. This also drops the dependency on
# CountVectorizer.get_feature_names(), which newer scikit-learn no longer has.
weighted_sums = np.asarray(tfidf_matrix.dot(embedding_matrix))
tfidf_sums = np.asarray(tfidf_matrix.sum(axis=1)).ravel()

# Documents with zero total weight keep their all-zero vector (divide by 1).
safe_sums = np.where(tfidf_sums != 0, tfidf_sums, 1.0)
doc_embeddings_svd = weighted_sums / safe_sums[:, None]

# Kept for compatibility: the same rows as a list of 1-D arrays.
doc_embeddings = list(doc_embeddings_svd)

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Each saved embedding table is read from disk and standardised on its own
# (zero mean / unit variance per column) with a freshly created scaler.
ss = StandardScaler()
doc_embeddings_fasttext = ss.fit_transform(
    np.genfromtxt('cranfield_embeddings/doc_embeddings_fasttext.csv', dtype=float)
)

ss = StandardScaler()
query_embeddings_fasttext = ss.fit_transform(
    np.genfromtxt('cranfield_embeddings/query_embeddings_fasttext.csv', dtype=float)
)

ss = StandardScaler()
doc_embeddings_svd = ss.fit_transform(
    np.genfromtxt('cranfield_embeddings/doc_embeddings_svd.csv', dtype=float)
)

ss = StandardScaler()
query_embeddings_svd = ss.fit_transform(
    np.genfromtxt('cranfield_embeddings/query_embeddings_svd.csv', dtype=float)
)

ss = StandardScaler()
doc_embeddings_word2vec = ss.fit_transform(
    np.genfromtxt('cranfield_embeddings/doc_embeddings_word2vec.csv', dtype=float)
)

ss = StandardScaler()
query_embeddings_word2vec = ss.fit_transform(
    np.genfromtxt('cranfield_embeddings/query_embeddings_word2vec.csv', dtype=float)
)


In [None]:
# Elbow @ 20 for KMeans

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt

# One objective curve per (embedding, model) pair. All six lists must exist
# before the loop below appends to them.
kmeans_fasttext = []
gmm_fasttext = []
kmeans_word2vec = []
gmm_word2vec = []
kmeans_svd = []
gmm_svd = []

k_values = range(2, 50)

# (document embeddings, K-Means curve, GMM curve)
elbow_targets = [
    (doc_embeddings_fasttext, kmeans_fasttext, gmm_fasttext),
    (doc_embeddings_svd, kmeans_svd, gmm_svd),
    (doc_embeddings_word2vec, kmeans_word2vec, gmm_word2vec),
]

for k in k_values:

    for embeddings, kmeans_curve, gmm_curve in elbow_targets:

        kmeans = KMeans(n_clusters=k, random_state=42).fit(embeddings)
        kmeans_curve.append(kmeans.inertia_)

        gmm = GaussianMixture(n_components=k, random_state=42).fit(embeddings)
        cluster_assignments = gmm.predict(embeddings)

        # GMM has no inertia_, so use the K-Means-style distortion: squared
        # distance of every point to the mean of the points hard-assigned to
        # the same component. Only occupied components are averaged, which
        # avoids the NaN mean of an empty component.
        distortion = 0.0
        for label in np.unique(cluster_assignments):
            members = embeddings[cluster_assignments == label]
            distortion += np.sum(np.square(members - members.mean(axis=0)))
        gmm_curve.append(distortion)

plt.figure(figsize=(10,10))

for curve, curve_label in [
    (kmeans_fasttext, "K_Means_fasttext"),
    (gmm_fasttext, "GMM_fasttext"),
    (kmeans_svd, "K_Means_svd"),
    (gmm_svd, "GMM_svd"),
    (kmeans_word2vec, "K_Means_word2vec"),
    (gmm_word2vec, "GMM_word2vec"),
]:
    plt.plot(k_values, curve, label = curve_label)

plt.title('Objective function vs Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Objective function')
plt.legend()
plt.show()

In [None]:
# For each embedding type: fit 10-cluster K-Means on the documents and build a
# dict that maps every centroid (as a tuple) to the indices of its documents.

# --- fastText ---
kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(doc_embeddings_fasttext)
doc_clusters = kmeans.predict(doc_embeddings_fasttext)
centroids = kmeans.cluster_centers_
doc_indices = dict(
    (tuple(centroid), np.where(doc_clusters == label)[0].tolist())
    for label, centroid in enumerate(centroids)
)
centroid_mapping_fasttext = doc_indices

# --- word2vec ---
kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(doc_embeddings_word2vec)
doc_clusters = kmeans.predict(doc_embeddings_word2vec)
centroids = kmeans.cluster_centers_
doc_indices = dict(
    (tuple(centroid), np.where(doc_clusters == label)[0].tolist())
    for label, centroid in enumerate(centroids)
)
centroid_mapping_word2vec = doc_indices

# --- SVD ---
kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(doc_embeddings_svd)
doc_clusters = kmeans.predict(doc_embeddings_svd)
centroids = kmeans.cluster_centers_
doc_indices = dict(
    (tuple(centroid), np.where(doc_clusters == label)[0].tolist())
    for label, centroid in enumerate(centroids)
)
centroid_mapping_svd = doc_indices

In [None]:
from numpy import dot
from numpy.linalg import norm

def cos_sim(a,b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Returns ``0.0`` when either vector has zero norm instead of dividing by
    zero; the resulting NaN would otherwise sort to the top of any descending
    ranking built with ``np.argsort(...)[::-1]``.
    """
    denominator = norm(a) * norm(b)

    if denominator == 0:
        return 0.0

    return dot(a, b) / denominator

In [None]:
def gen_result(query_embeddings, doc_embeddings, centroid_mapping):
    """Rank, for every query, the documents of its most similar cluster.

    Parameters
    ----------
    query_embeddings : iterable of query vectors.
    doc_embeddings : indexable collection of document vectors; it is indexed
        with the document indices stored in ``centroid_mapping``.
    centroid_mapping : dict mapping a centroid (tuple of floats) to the list
        of document indices in that cluster.

    Returns
    -------
    list of lists
        One list per query: the document indices of the chosen cluster in
        order of decreasing cosine similarity to the query.
    """
    # Key order is fixed for the whole call, so build the list once.
    centroids = list(centroid_mapping.keys())
    results = []

    for query_embedding in query_embeddings:

        centroid_similarities = [cos_sim(query_embedding, centroid) for centroid in centroids]
        # cos_sim is a similarity, so the nearest centroid is the maximum
        # (argmin would select the least similar cluster).
        nearest_centroid = int(np.argmax(centroid_similarities))
        nearest_docs = centroid_mapping[centroids[nearest_centroid]]

        similarities = [cos_sim(doc_embeddings[doc_index], query_embedding) for doc_index in nearest_docs]
        ranking = np.argsort(similarities)[::-1]
        results.append([nearest_docs[position] for position in ranking])

    return results

# Per-query document rankings for each embedding type, each using the centroid
# mapping built from the matching document embeddings.
svd_results = gen_result(query_embeddings_svd, doc_embeddings_svd, centroid_mapping_svd)
w2v_results = gen_result(query_embeddings_word2vec, doc_embeddings_word2vec, centroid_mapping_word2vec)
ft_results = gen_result(query_embeddings_fasttext, doc_embeddings_fasttext, centroid_mapping_fasttext)

In [None]:
def types_to_idx(types):
    """Map every distinct type to a dense integer index in first-seen order.

    A type that occurs more than once keeps the index of its first
    occurrence. Re-assigning ``len(seq_idx)`` on a repeat would give it an
    index that the next new type receives as well.
    """
    seq_idx = {}

    for t in types:
        seq_idx.setdefault(t, len(seq_idx))

    return seq_idx

# Vocabulary index, followed by the four special markers which take the next
# free indices in this order.
seq_idx = types_to_idx(types)
for _marker in ('/start', '/end', '/unknown', '/pad'):
    seq_idx[_marker] = len(seq_idx)

def doc_to_seq(docs, index_map=None):
    """Encode token lists as index sequences wrapped in '/start' ... '/end'.

    The input is left untouched: a new list of new lists is returned, so
    ``docs`` still holds the original tokens afterwards and calling this twice
    does not wrap the sequences twice.

    Parameters
    ----------
    docs : iterable of token sequences.
    index_map : dict, optional
        Token -> index mapping containing at least '/start', '/end' and
        '/unknown'. Defaults to the notebook-level ``seq_idx``.

    Returns
    -------
    list of lists of int
        One index sequence per document; tokens missing from the mapping are
        encoded as '/unknown'.
    """
    if index_map is None:
        index_map = seq_idx

    unknown = index_map['/unknown']

    return [
        [index_map.get(word, unknown) for word in ['/start'] + list(doc) + ['/end']]
        for doc in docs
    ]

# Index-encoded documents and queries.
doc_seq = doc_to_seq(docs)
query_seq = doc_to_seq(queries)

In [None]:
# Number of entries in the index map (vocabulary plus any special markers).
print(len(seq_idx))

In [None]:
# Hyper-parameters of the recurrent sequence encoder defined below.
# NOTE(review): INPUT_SIZE is passed to nn.Embedding as the number of rows, so
# it must be larger than every index in the encoded sequences; presumably 9192
# is the len(seq_idx) printed by the previous cell — confirm.
INPUT_SIZE = 9192
NUM_LAYERS = 3
HIDDEN_SIZE = 128
EMBEDDING_SIZE = 128
# One of "RNN", "LSTM" or "GRU".
CELL_TYPE = "LSTM"

import torch.nn as nn
import torch
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

import torch.nn as nn

class RNNEmbedding(nn.Module):
    """Encode an index sequence as the final hidden state of a recurrent net.

    Tokens are looked up in an ``nn.Embedding`` table and fed through a
    multi-layer RNN/LSTM/GRU built with ``batch_first=True``; the embedding of
    the sequence is the last layer's final hidden state.

    Parameters
    ----------
    input_size : number of rows of the embedding table (vocabulary size).
    hidden_size : size of the recurrent hidden state, and of the output.
    num_layers : number of stacked recurrent layers.
    embedding_size : size of each token embedding.
    cell_type : "RNN", "LSTM" or "GRU".

    Raises
    ------
    ValueError
        If ``cell_type`` is not one of the supported names. Without this check
        ``self.rnn`` would be missing and the failure would only surface as an
        AttributeError on the first forward pass.
    """

    # Supported recurrent cells, keyed by the user-facing name.
    _CELL_CLASSES = {"RNN": nn.RNN, "LSTM": nn.LSTM, "GRU": nn.GRU}

    def __init__(self, input_size, hidden_size, num_layers, embedding_size, cell_type):
        super(RNNEmbedding, self).__init__()

        if cell_type not in self._CELL_CLASSES:
            raise ValueError(
                "cell_type must be one of %s, got %r" % (sorted(self._CELL_CLASSES), cell_type)
            )

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.cell_type = cell_type
        self.rnn = self._CELL_CLASSES[cell_type](embedding_size, hidden_size, num_layers, batch_first=True)

    def forward(self, doc_sequences):
        """Return the last layer's final hidden state for ``doc_sequences``.

        ``doc_sequences`` may be a (nested) list of indices, an array or a
        tensor; it is converted to an integer tensor without copying when it
        already is one.
        """
        doc_sequences = torch.as_tensor(doc_sequences, dtype=torch.long)
        embedded = self.embedding(doc_sequences)

        # LSTM returns (hidden, cell) as its state; RNN and GRU return hidden only.
        if self.cell_type == "LSTM":
            _, (hidden, _) = self.rnn(embedded)
        else:
            _, hidden = self.rnn(embedded)

        # hidden is stacked per layer; take the top layer.
        return hidden[-1]

    
# Build the encoder and embed every sequence one at a time.
# NOTE(review): no training step runs between construction and use, so these
# embeddings come from the freshly initialised weights.
embedding = RNNEmbedding(INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, EMBEDDING_SIZE, CELL_TYPE)
doc_seq_embeddings = np.array([embedding(seq).detach().numpy() for seq in doc_seq])
query_seq_embeddings = np.array([embedding(seq).detach().numpy() for seq in query_seq])

In [None]:
# 2-D t-SNE view of the sequence embeddings; documents and queries are
# projected by two independent fits and drawn on the same axes.
for _points, _series in ((doc_seq_embeddings, "DOCS"), (query_seq_embeddings, "QUERIES")):
    tsne = TSNE(n_components=2, random_state=42)
    doc_embeddings_2d = tsne.fit_transform(_points)
    plt.scatter(doc_embeddings_2d[:,0], doc_embeddings_2d[:,1], label=_series)

plt.legend()

In [None]:
# 2-D PCA view of the sequence embeddings; documents and queries are projected
# by two independent fits and drawn on the same axes.
for _points, _series in ((doc_seq_embeddings, "docs"), (query_seq_embeddings, "queries")):
    pca = PCA(n_components=2)
    doc_seq_embeddings_np_2d = pca.fit_transform(_points)
    plt.scatter(doc_seq_embeddings_np_2d[:, 0], doc_seq_embeddings_np_2d[:, 1], label=_series)

plt.legend()

In [None]:
import json

# Relevance judgements. Forward slash: the backslash form is an invalid escape
# sequence ("\c") and only works as a path separator on Windows.
with open('cranfield/cran_qrels.json', 'r') as f:
    data = json.load(f)

# X: [query number, document id] pairs; y: the judged 'position' of each pair.
X = [[int(d['query_num']), int(d['id'])] for d in data]
y = [int(d['position']) for d in data]

In [None]:
def url_to_id(url,mode):
    """Read a JSON list of records and return field ``mode`` of each as an int.

    Parameters
    ----------
    url : path of a JSON file whose top level is a list of dicts.
    mode : key to extract from every record.

    Returns
    -------
    numpy.ndarray
        1-D object array with ``int(record[mode])`` for every record, in file
        order.
    """
    with open(url, 'r') as handle:
        records = json.load(handle)

    identifiers = [int(record[mode]) for record in records]

    return np.array(identifiers, dtype='object')

# Document ids and query numbers, in file order. Forward slashes keep the
# paths portable and match the corpus paths used at the top of the notebook.
doc_id = url_to_id("cranfield/cran_docs.json","id")
query_id = url_to_id("cranfield/cran_queries.json","query number")

In [None]:
# Longest token sequence over documents and queries together (0 if both are empty).
max_seq_length = max([0, *(len(seq) for seq in docs), *(len(seq) for seq in queries)])

print(max_seq_length)

def types_to_idx(types):
    """Map every distinct type to a dense integer index in first-seen order.

    A type that occurs more than once keeps the index of its first
    occurrence. Re-assigning ``len(seq_idx)`` on a repeat would give it an
    index that the next new type receives as well.
    """
    seq_idx = {}

    for t in types:
        seq_idx.setdefault(t, len(seq_idx))

    return seq_idx

# Vocabulary index, followed by the four special markers which take the next
# free indices in this order.
seq_idx = types_to_idx(types)
for _marker in ('/start', '/end', '/unknown', '/pad'):
    seq_idx[_marker] = len(seq_idx)

print(seq_idx)

def doc_to_seq(docs, seq_idx, max_seq_length, mode):
    """Encode token sequences as index sequences, optionally padded.

    Parameters
    ----------
    docs : iterable of token sequences; not modified.
    seq_idx : dict mapping token -> index. For ``mode == 'pad'`` it must
        contain '/start', '/end' and '/pad'.
    max_seq_length : minimum length of a padded sequence, counting the
        '/start' and '/end' markers. Longer sequences are not truncated.
    mode : ``'pad'`` wraps each sequence in '/start' ... '/end' and fills it
        with '/pad' just before '/end'; any other value encodes the tokens
        as they are.

    Returns
    -------
    list of lists of int

    Raises
    ------
    KeyError
        If a token is missing from ``seq_idx`` and ``seq_idx`` has no
        '/unknown' entry to fall back on.
    """
    def lookup(word):
        try:
            return seq_idx[word]
        except KeyError:
            # Out-of-vocabulary tokens use the unknown marker when there is one.
            if '/unknown' in seq_idx:
                return seq_idx['/unknown']
            raise

    seqs = []

    for doc in docs:

        seq = list(doc)

        if mode == 'pad':
            # The two markers count towards the target length.
            padding = ['/pad'] * max(0, max_seq_length - len(seq) - 2)
            seq = ['/start'] + seq + padding + ['/end']

        seqs.append([lookup(word) for word in seq])

    return seqs

# '/start' and '/end' add two positions to every sequence, so the common
# length has to be the longest raw sequence plus two; padding only to
# max_seq_length leaves the longest sequences over-long and the array ragged.
padded_length = max_seq_length + 2

doc_seq = np.array(doc_to_seq(docs, seq_idx, padded_length, 'pad'))
# Queries are encoded the same way; the shape print and save below need this array.
query_seq = np.array(doc_to_seq(queries, seq_idx, padded_length, 'pad'))

print(doc_seq.shape)
print(query_seq.shape)

np.savetxt('cranfield_sequences/doc_seq.csv', doc_seq, fmt='%s')
np.savetxt('cranfield_sequences/q_seq.csv', query_seq, fmt='%s')

In [None]:
# Show the first encoded query as a quick sanity check.
query_seq[0]

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Documents first, then queries, so the padded matrix can be split by len(docs).
tokens = list(docs) + list(queries)
# Wrap every text in start/end markers. Token lists are joined with spaces
# first: formatting the list itself would embed its repr ("['a', 'b']") and
# leave quote characters attached to every word in the tokenizer vocabulary.
tokens = [
    "<start> {} <end>".format(seq if isinstance(seq, str) else ' '.join(map(str, seq)))
    for seq in tokens
]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokens)
sequences = tokenizer.texts_to_sequences(tokens)
# Every sequence is padded or truncated at the end to this length.
max_length = 380
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

np.savetxt('cranfield_sequences/doc_seq.csv', padded_sequences[:len(docs)], fmt='%s')
np.savetxt('cranfield_sequences/q_seq.csv', padded_sequences[len(docs):], fmt='%s')

In [None]:
# Row i holds the word2vec vector of the tokenizer word with index i. Row 0
# and words missing from word2vec stay all-zero. The matrix keeps at least
# 9000 rows and grows when the tokenizer vocabulary would not fit.
num_rows = max(9000, len(tokenizer.index_word) + 1)
pretrained_embeddings = np.zeros((num_rows, 300))

for word_index, word in tokenizer.index_word.items():

    # Strip quote characters left around the token by the tokenisation step.
    word = str(word).strip("'")

    if word in model_word2vec:
        pretrained_embeddings[word_index] = model_word2vec[word]

print(pretrained_embeddings.shape)



In [None]:
# Persist the embedding matrix as whitespace-separated text, one row per index.
np.savetxt('pretrained_embeddings.csv', pretrained_embeddings, fmt="%s")

In [None]:
from utilities import train

# NOTE(review): `train` comes from utilities; what it fits or reports is not
# visible in this notebook.
train(doc_embeddings_fasttext, query_embeddings_fasttext)

In [None]:
# Same call as for fastText, on the word2vec document/query embeddings.
train(doc_embeddings_word2vec, query_embeddings_word2vec)

In [None]:
# Same call as for fastText, on the SVD document/query embeddings.
train(doc_embeddings_svd, query_embeddings_svd)

In [None]:
from utilities import clustering

def clustering(X, random_state, r):
    """Fit a Gaussian mixture for each cluster count in ``r`` and plot quality curves.

    For every ``k`` in ``r`` a ``GaussianMixture`` is fitted on ``X`` and the
    hard assignments are scored with the silhouette, Davies-Bouldin and
    Calinski-Harabasz indices. Six panels are drawn: the three scores, their
    min-max normalised average (Davies-Bouldin inverted, since lower is
    better for it), and the smallest and largest cluster size.

    Parameters
    ----------
    X : 2-D array of samples to cluster.
    random_state : seed passed to every ``GaussianMixture``.
    r : iterable of cluster counts to evaluate.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    # Imported locally: no cell of this notebook brings these names into scope.
    from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
    from sklearn.mixture import GaussianMixture

    try:
        from tqdm.auto import tqdm
    except ImportError:
        # The progress bar is optional.
        def tqdm(iterable):
            return iterable

    def _min_max(values):
        """Scale values to [0, 1]; all zeros when they are all equal."""
        values = np.asarray(values, dtype=float)
        span = values.max() - values.min()
        if span == 0:
            return np.zeros_like(values)
        return (values - values.min()) / span

    n_clusters_range = list(r)
    silhouette_scores = []
    db_scores = []
    ch_scores = []
    min_docs = []
    max_docs = []

    for n_clusters in tqdm(n_clusters_range):

        gmm = GaussianMixture(n_components=n_clusters, random_state=random_state, max_iter = 500)
        gmm.fit(X)
        # Predict once and reuse the labels for every score.
        labels = gmm.predict(X)

        silhouette_scores.append(silhouette_score(X, labels))
        db_scores.append(davies_bouldin_score(X, labels))
        ch_scores.append(calinski_harabasz_score(X, labels))

        # Documents per component, counting components that received none.
        cluster_sizes = np.bincount(labels, minlength=n_clusters)
        min_docs.append(int(cluster_sizes.min()))
        max_docs.append(int(cluster_sizes.max()))

    silhouette_scores_norm = _min_max(silhouette_scores)
    db_scores_norm = _min_max(db_scores)
    ch_scores_norm = _min_max(ch_scores)
    avg_scores = (silhouette_scores_norm + ch_scores_norm + (1 - db_scores_norm)) / 3

    fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(15, 15))

    panels = [
        (ax[0, 0], silhouette_scores, 'Silhouette score'),
        (ax[0, 1], db_scores, 'Davies-Bouldin index'),
        (ax[1, 0], ch_scores, 'Calinski-Harabasz index'),
        (ax[1, 1], avg_scores, 'Average Score'),
        (ax[2, 0], min_docs, 'Minimum docs per cluster'),
        (ax[2, 1], max_docs, 'Maximum docs per cluster'),
    ]

    for axis, values, ylabel in panels:
        axis.plot(n_clusters_range, values, 'bo-')
        axis.set_xlabel('Number of clusters')
        axis.set_ylabel(ylabel)

    plt.tight_layout()
    plt.show()

In [None]:
# Sweep 2..99 clusters on the fastText document embeddings (seed 42).
clustering(doc_embeddings_fasttext, 42, range(2,100))

In [None]:
# Sweep 2..19 clusters on the word2vec document embeddings (seed 42).
clustering(doc_embeddings_word2vec, 42, range(2,20))

In [None]:
# Sweep 20..40 clusters on the SVD document embeddings (seed 42).
clustering(doc_embeddings_svd, 42, range(20,41))

In [None]:
from utilities import pickel_dictionary

# Forward slashes: the backslash form is an invalid escape sequence ("\c") and
# only resolves as a directory separator on Windows.
# NOTE(review): presumably the last two arguments are the cluster count and the
# random seed — confirm in utilities.pickel_dictionary.
pickel_dictionary(doc_embeddings_svd , 'pickel_dictionaries/centroid_mapping_svd.pkl', 30, 42)
pickel_dictionary(doc_embeddings_word2vec , 'pickel_dictionaries/centroid_mapping_word2vec.pkl', 15, 42)

In [None]:
from utilities import cluster_2d
# NOTE(review): presumably a 2-D cluster plot with 30 clusters and seed 42 —
# confirm the argument meaning in utilities.cluster_2d.
cluster_2d(doc_embeddings_svd, 30, 42)

In [None]:
# Same call on the word2vec document embeddings, with 15 in place of 30.
cluster_2d(doc_embeddings_word2vec, 15, 42)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


# Documents followed by queries, each as one whitespace-joined string, so a
# single TF-IDF model (shared vocabulary and IDF) covers both.
all_texts = [' '.join(tokens) for tokens in docs]
all_texts.extend(' '.join(tokens) for tokens in queries)

vectorizer = TfidfVectorizer()
tfidf_vectors = vectorizer.fit_transform(all_texts)

# The first len(docs) rows are documents; the remaining rows are queries.
doc_vectors = tfidf_vectors[:len(docs), :]
query_vectors = tfidf_vectors[len(docs):, :]

# Saved densely, one row per text.
for _path, _rows in (
    ('cranfield_embeddings/doc_embeddings_vs.csv', doc_vectors),
    ('cranfield_embeddings/query_embeddings_vs.csv', query_vectors),
):
    np.savetxt(_path, _rows.toarray(), fmt="%s")

In [None]:
import numpy as np
from utilities import clustering
from sklearn.preprocessing import StandardScaler

# Reload the saved document embeddings and standardise each table on its own.
# Paths use forward slashes, like the earlier cell that loads the same files;
# the backslash form is an invalid escape sequence ("\d") and Windows-only.
ss = StandardScaler()
doc_embeddings_svd = ss.fit_transform(np.genfromtxt('cranfield_embeddings/doc_embeddings_svd.csv', dtype = float))

ss = StandardScaler()
doc_embeddings_word2vec = ss.fit_transform(np.genfromtxt('cranfield_embeddings/doc_embeddings_word2vec.csv', dtype = float))

ss = StandardScaler()
doc_embeddings_seq = ss.fit_transform(np.genfromtxt('cranfield_sequences/doc_seq_embedding.csv', dtype = float))

ss = StandardScaler()
doc_embeddings_seq_2 = ss.fit_transform(np.genfromtxt('cranfield_sequences/doc_seq_embedding_2.csv', dtype = float))

In [None]:
# NOTE(review): the third argument is the tuple (2,10), whereas the earlier
# calls in this notebook pass a range(...). If it is iterated the same way,
# only k=2 and k=10 are evaluated; range(2, 10) may have been intended.
# `clustering` was re-imported from utilities in the previous cell, so confirm
# what that version expects.
clustering(doc_embeddings_svd, 42, (2,10))