In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import re

def clean_text(text):
    """Lower-case `text` and reduce it to word characters separated by single spaces.

    Every run of non-word characters (anything outside letters, digits and
    underscore) becomes one space, whitespace runs are collapsed, and leading
    and trailing spaces are removed.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string.
    """
    lowered = text.lower()
    # Punctuation and other non-word runs turn into a single blank.
    without_symbols = re.sub(r"\W+", " ", lowered)
    # Collapse any remaining whitespace runs before trimming the ends.
    single_spaced = re.sub(r"\s+", " ", without_symbols)
    return single_spaced.strip()

def preprocess_text(text):
    """Tokenise, filter and lemmatise `text`, returning the kept tokens joined by spaces.

    A token is kept only if it is purely alphabetic, is not an English stop
    word, and has at least one WordNet synset. Surviving tokens are lemmatised
    with `WordNetLemmatizer` using its default arguments.

    Args:
        text: The string to process. Falsy values (``""``, ``None``) yield ``""``.

    Returns:
        A single space-separated string of lemmatised tokens.
    """
    if not text:
        return ""

    # Build the stop-word set once per call. Evaluating
    # stopwords.words("english") inside the comprehension rebuilt the list for
    # every token and then scanned it linearly.
    english_stopwords = set(stopwords.words("english"))

    # Checks run cheapest-first; the WordNet lookup only happens for tokens
    # that survive the alphabetic and stop-word filters.
    tokens = [
        word
        for word in word_tokenize(text.lower())
        if word.isalpha()
        and word not in english_stopwords
        and len(wordnet.synsets(word)) != 0
    ]

    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(word) for word in tokens)

In [2]:
import json

# Load the Cranfield document records. The `with` block closes the file when it
# exits, so no explicit close() call is needed inside it.
with open("cranfield/cran_docs.json", "r") as f:
    cranfield = json.load(f)

# Map each record's 'id' to its cleaned and preprocessed 'body' text.
docs = {data['id']:preprocess_text(clean_text(data['body'])) for data in cranfield}

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Document-frequency cut-off: 5% of the corpus size, truncated to an integer
# and passed to TfidfVectorizer as an absolute count.
min_document_count = int(0.05*len(docs))
vectorizer = TfidfVectorizer(min_df=min_document_count)

# Fit on the preprocessed bodies (in dict order) and keep the fitted vocabulary.
document_texts = list(docs.values())
vectorized_docs = vectorizer.fit_transform(document_texts)
feature_names = vectorizer.get_feature_names_out()

In [29]:
import pandas as pd

# Densify the TF-IDF matrix and label its columns with the vocabulary terms;
# rows keep the default 0-based positional index.
tfidf_dense = vectorized_docs.toarray()
relevance_matrix = pd.DataFrame(tfidf_dense, columns=feature_names)

In [30]:
# Show the document-by-term TF-IDF table (pandas renders a truncated preview).
relevance_matrix

Unnamed: 0,account,accuracy,aerodynamic,agreement,air,aircraft,along,also,analysis,angle,...,velocity,viscous,wall,wave,well,wind,wing,within,work,zero
0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.106986,...,0.091967,0.000000,0.0,0.000000,0.115602,0.0,0.314344,0.0,0.0,0.000000
1,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.173953,0.0,0.146606,0.000000,0.0,0.000000,0.0,0.0,0.000000
2,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000
3,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.101934,0.000000,0.000000,...,0.100769,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000
4,0.0,0.0,0.169776,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,0.0,0.0,0.000000,0.153576,0.0,0.0,0.167553,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000
1396,0.0,0.0,0.000000,0.156934,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.171591
1397,0.0,0.0,0.000000,0.000000,0.0,0.0,0.069438,0.052567,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000
1398,0.0,0.0,0.000000,0.000000,0.0,0.0,0.174196,0.000000,0.134843,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000


In [31]:
def wordnet_similarity(word1, word2):
    """Return the highest WordNet path similarity between any senses of two words.

    Every synset of `word1` is compared with every synset of `word2` via
    `path_similarity`; pairs for which that call returns ``None`` are ignored.

    Args:
        word1: First word to look up in WordNet.
        word2: Second word to look up in WordNet.

    Returns:
        The maximum similarity found, or ``None`` when either word has no
        synsets or no pair produced a score.
    """
    senses_first = wordnet.synsets(word1)
    senses_second = wordnet.synsets(word2)

    best_score = None
    for sense_a in senses_first:
        for sense_b in senses_second:
            score = sense_a.path_similarity(sense_b)
            if score is None:
                continue
            # Strict comparison keeps the first of several equal maxima.
            if best_score is None or score > best_score:
                best_score = score

    return best_score

In [32]:
# Build the upper triangle (diagonal included) of the term-term similarity
# table as a list of rows. Cells left of the diagonal are filled with 0.
similarity_matrix = []

for row_position, w1 in enumerate(feature_names):
    print(w1)  # progress trace: one line per vocabulary term
    upper_triangle = [wordnet_similarity(w1, w2) for w2 in feature_names[row_position:]]
    similarity_matrix.append([0]*row_position + upper_triangle)

account
accuracy
aerodynamic
agreement
air
aircraft
along
also
analysis
angle
applicable
application
applied
approach
approximate
approximately
approximation
arbitrary
aspect
assumed
assumption
attack
author
available
axial
based
blunt
body
boundary
buckling
calculated
calculation
case
certain
change
characteristic
circular
coefficient
compared
comparison
compressible
condition
cone
configuration
consideration
considered
constant
corresponding
critical
curve
cylinder
cylindrical
data
deflection
density
derived
described
design
detail
determine
determined
developed
development
diameter
difference
different
differential
dimensional
direction
discussed
displacement
distance
distribution
drag
due
dynamic
edge
effect
either
elastic
energy
equation
equilibrium
exact
example
expansion
experiment
experimental
expression
external
factor
field
finite
first
flat
flight
flow
fluid
force
form
formula
found
free
friction
function
gas
general
give
given
good
gradient
heat
heating
high
higher
however


In [33]:
# Wrap the nested list in a DataFrame labelled by vocabulary term on both axes.
similarity_matrix = pd.DataFrame(
    data=similarity_matrix,
    index=feature_names,
    columns=feature_names,
)

In [34]:
# Show the term-by-term similarity table; cells below the diagonal are the 0 placeholders.
similarity_matrix

Unnamed: 0,account,accuracy,aerodynamic,agreement,air,aircraft,along,also,analysis,angle,...,velocity,viscous,wall,wave,well,wind,wing,within,work,zero
account,1.0,0.166667,0.333333,0.333333,0.333333,0.083333,0.333333,0.333333,0.166667,0.250000,...,0.125000,0.333333,0.166667,0.250000,0.333333,0.333333,0.250000,0.333333,0.333333,0.333333
accuracy,0.0,1.000000,0.142857,0.200000,0.333333,0.071429,0.142857,0.142857,0.142857,0.142857,...,0.125000,0.142857,0.142857,0.142857,0.142857,0.125000,0.142857,0.142857,0.142857,0.142857
aerodynamic,0.0,0.000000,1.000000,0.125000,0.333333,0.083333,0.333333,0.333333,0.142857,0.250000,...,0.125000,0.333333,0.166667,0.250000,0.333333,0.250000,0.250000,0.333333,0.333333,0.333333
agreement,0.0,0.000000,0.000000,1.000000,0.200000,0.066667,0.125000,0.125000,0.200000,0.111111,...,0.142857,0.125000,0.142857,0.142857,0.125000,0.200000,0.125000,0.125000,0.200000,0.125000
air,0.0,0.000000,0.000000,0.000000,1.000000,0.090909,0.333333,0.333333,0.166667,0.250000,...,0.125000,0.333333,0.200000,0.250000,0.333333,0.500000,0.250000,0.333333,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wind,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.333333,0.250000,0.333333,0.250000
wing,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.250000,0.250000,0.250000
within,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.333333,0.333333
work,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.333333


In [35]:
# Label the rows 1..N instead of 0..N-1 (presumably to line up with 1-based
# document ids — confirm against cran_docs.json). Assigning an absolute range,
# rather than adding 1 to whatever the index currently holds, keeps this cell
# idempotent: running it again no longer shifts the labels a second time.
relevance_matrix.index = range(1, len(relevance_matrix) + 1)

In [38]:
# Write the similarity table to disk; a later cell reads this file back.
similarity_matrix.to_csv("wordnet_similarities.csv")

In [109]:
# Reload the saved similarities. The first CSV column is read under the name
# "Unnamed: 0" and is moved into the index, so rows are labelled by term again.
similarity_matrix = pd.read_csv("wordnet_similarities.csv").set_index("Unnamed: 0")

In [110]:
import numpy as np

# Mirror the upper-triangular scores across the diagonal. The stored lower
# triangle is 0, so adding the transpose copies each score into its mirrored
# cell (and doubles the diagonal, which is reset below).
similarity_matrix_symmetric = similarity_matrix.T + similarity_matrix

# Reset the diagonal to 1 on an explicit, writable copy of the values.
# Filling `DataFrame.values` in place only works while that attribute happens
# to be a writable view of the frame's data; pandas can instead hand back a
# copy or (with Copy-on-Write) a read-only array.
symmetric_values = similarity_matrix_symmetric.to_numpy(copy=True)
np.fill_diagonal(symmetric_values, 1)
similarity_matrix_symmetric = pd.DataFrame(
    symmetric_values,
    index=similarity_matrix_symmetric.index,
    columns=similarity_matrix_symmetric.columns,
)
similarity_matrix_symmetric

Unnamed: 0,account,accuracy,aerodynamic,agreement,air,aircraft,along,also,analysis,angle,...,velocity,viscous,wall,wave,well,wind,wing,within,work,zero
account,1.000000,0.166667,0.333333,0.333333,0.333333,0.083333,0.333333,0.333333,0.166667,0.250000,...,0.125000,0.333333,0.166667,0.250000,0.333333,0.333333,0.250000,0.333333,0.333333,0.333333
accuracy,0.166667,1.000000,0.142857,0.200000,0.333333,0.071429,0.142857,0.142857,0.142857,0.142857,...,0.125000,0.142857,0.142857,0.142857,0.142857,0.125000,0.142857,0.142857,0.142857,0.142857
aerodynamic,0.333333,0.142857,1.000000,0.125000,0.333333,0.083333,0.333333,0.333333,0.142857,0.250000,...,0.125000,0.333333,0.166667,0.250000,0.333333,0.250000,0.250000,0.333333,0.333333,0.333333
agreement,0.333333,0.200000,0.125000,1.000000,0.200000,0.066667,0.125000,0.125000,0.200000,0.111111,...,0.142857,0.125000,0.142857,0.142857,0.125000,0.200000,0.125000,0.125000,0.200000,0.125000
air,0.333333,0.333333,0.333333,0.200000,1.000000,0.090909,0.333333,0.333333,0.166667,0.250000,...,0.125000,0.333333,0.200000,0.250000,0.333333,0.500000,0.250000,0.333333,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wind,0.333333,0.125000,0.250000,0.200000,0.500000,0.125000,0.250000,0.250000,0.166667,0.333333,...,0.111111,0.250000,0.142857,0.333333,0.250000,1.000000,0.333333,0.250000,0.333333,0.250000
wing,0.250000,0.142857,0.250000,0.125000,0.250000,0.125000,0.250000,0.250000,0.142857,0.333333,...,0.125000,0.250000,0.250000,0.200000,0.250000,0.333333,1.000000,0.250000,0.250000,0.250000
within,0.333333,0.142857,0.333333,0.125000,0.333333,0.083333,0.333333,0.333333,0.142857,0.250000,...,0.125000,0.333333,0.166667,0.250000,0.333333,0.250000,0.250000,1.000000,0.333333,0.333333
work,0.333333,0.142857,0.333333,0.200000,0.333333,0.111111,0.333333,0.333333,0.333333,0.250000,...,0.125000,0.333333,0.166667,0.333333,0.333333,0.333333,0.250000,0.333333,1.000000,0.333333


In [105]:
# from collections import defaultdict

# def spreading_activation_ir(query):
    
#     query = preprocess_text(clean_text(query))
#     initial_activations = list(vectorizer.transform([query]).toarray().reshape(1,-1)[0])
#     initial_activations = dict(zip(feature_names, initial_activations))
#     non_zero_activations = [word for word, activation in initial_activations.items() if activation != 0]
    
#     final_activations = defaultdict()
#     for w1 in initial_activations.keys():
#         activation = initial_activations[w1]
#         for w2 in non_zero_activations:
#             if w1 < w2:
#                 similarity_score = similarity_matrix.loc[w1][w2]
#             else:
#                 similarity_score = similarity_matrix.loc[w2][w1]
#             activation += initial_activations[w2] * similarity_score
#         final_activations[w1] = activation
    
#     document_activations = {doc_id:0 for doc_id in relevance_matrix.index}
#     for document in document_activations.keys():
#         activation = document_activations[document]
#         for word in final_activations.keys():
#             activation += final_activations[word] * relevance_matrix.loc[document][word]
#         document_activations[document] = activation
    
#     sorted_doc_ids = [doc_id for doc_id, _ in sorted(document_activations.items(), key=lambda item: item[1], reverse=True)]
        
#     return sorted_doc_ids

In [111]:
def spreading_activation_ir(query):
    """Rank documents for `query` with one step of spreading activation.

    Steps:
      1. The query is passed through `clean_text` and `preprocess_text`, then
         `vectorizer.transform`, giving an initial activation per vocabulary term.
      2. Each vocabulary term additionally receives
         ``sum(similarity(term, q) * activation(q))`` over the query terms ``q``
         whose initial activation is non-zero. The diagonal of
         `similarity_matrix_symmetric` is 1, so a query term also receives its
         own activation once more in this step.
      3. Each document's score is the dot product of its row in
         `relevance_matrix` with the summed activations.

    Missing similarity scores (NaN) are treated as 0. Without this, a single
    NaN would propagate through both matrix products and turn every document
    score into NaN, making the ranking meaningless.

    Args:
        query: Raw query string.

    Returns:
        The index labels of `relevance_matrix`, ordered by descending score.
    """
    query = preprocess_text(clean_text(query))

    # Initial activation of every vocabulary term.
    initial_vec = vectorizer.transform([query]).toarray().reshape(-1)
    initial_series = pd.Series(initial_vec, index=feature_names)

    # Only terms present in the query can spread activation.
    non_zero_words = initial_series[initial_series != 0].index.tolist()

    # Rows: every vocabulary term; columns: the active query terms.
    # fillna(0) here is the fix: the similarity table is the input that can
    # contain NaN (wordnet_similarity may return None).
    similarity_submatrix = (
        similarity_matrix_symmetric.loc[feature_names, non_zero_words].fillna(0)
    )

    spreading_part = similarity_submatrix @ initial_series[non_zero_words]

    final_series = initial_series + spreading_part

    document_activations_series = (relevance_matrix.fillna(0)) @ final_series

    sorted_doc_ids = document_activations_series.sort_values(ascending=False).index.tolist()

    return sorted_doc_ids

In [113]:
# Load the queries and the relevance judgements. Each file is opened in a
# `with` block so its handle is closed deterministically; passing a bare
# open(...) to json.load left the handles open until garbage collection.
with open("cranfield/cran_queries.json", 'r') as queries_file:
    queries_json = json.load(queries_file)
with open("cranfield/cran_qrels.json", 'r') as qrels_file:
    qrels = json.load(qrels_file)

In [114]:
# Split the query records into two parallel lists, in file order.
query_ids = [item["query number"] for item in queries_json]
queries = [item["query"] for item in queries_json]

In [115]:
from evaluation import Evaluation

# Metric helper from the project-local `evaluation` module; the cells below call its
# meanPrecision / meanRecall / meanFscore / meanAveragePrecision / meanNDCG methods.
evaluator = Evaluation()

In [116]:
# Rank the documents once per query; the same rankings are scored at every cut-off.
doc_IDs_ordered = list(map(spreading_activation_ir, queries))

# One list per metric, holding the value for k = 1..10 in order.
precisions = []
recalls = []
fscores = []
MAPs = []
nDCGs = []

for k in range(1, 11):
    # Every metric method takes the same four positional arguments.
    metric_args = (doc_IDs_ordered, query_ids, qrels, k)

    precision = evaluator.meanPrecision(*metric_args)
    precisions.append(precision)
    recall = evaluator.meanRecall(*metric_args)
    recalls.append(recall)
    fscore = evaluator.meanFscore(*metric_args)
    fscores.append(fscore)
    print(f"Precision, Recall and F-score @ {k} : {precision}, {recall}, {fscore}")

    MAP = evaluator.meanAveragePrecision(*metric_args)
    MAPs.append(MAP)
    nDCG = evaluator.meanNDCG(*metric_args)
    nDCGs.append(nDCG)
    print(f"MAP, nDCG @ {k} : {MAP}, {nDCG}")

Precision, Recall and F-score @ 1 : 0.044444444444444446, 0.003582781416114749, 0.006556416801753962
MAP, nDCG @ 1 : 0.003582781416114749, 0.044444444444444446
Precision, Recall and F-score @ 2 : 0.044444444444444446, 0.00735296933802681, 0.01230224844292257
MAP, nDCG @ 2 : 0.005523430932626335, 0.06968163458730275
Precision, Recall and F-score @ 3 : 0.04, 0.010806552791610262, 0.01650860966538841
MAP, nDCG @ 3 : 0.006674625417154153, 0.08523719014285831
Precision, Recall and F-score @ 4 : 0.04, 0.016855935507659642, 0.02218671247550647
MAP, nDCG @ 4 : 0.008437412013274082, 0.09594444213428843
Precision, Recall and F-score @ 5 : 0.04, 0.021238795204244834, 0.02571471990151041
MAP, nDCG @ 5 : 0.009783065488339323, 0.10387191246427348
Precision, Recall and F-score @ 6 : 0.037037037037037035, 0.023812147777597405, 0.02681812972502033
MAP, nDCG @ 6 : 0.010381590420197588, 0.10584647235643439
Precision, Recall and F-score @ 7 : 0.03619047619047619, 0.026806664372114, 0.028546537223627676
MA