In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import re

def clean_text(text):
    """Lowercase ``text`` and reduce it to word characters separated by single spaces.

    Every run of non-word characters (anything the regex class ``\\W`` matches,
    which includes punctuation and whitespace) becomes one space; leading and
    trailing spaces are removed. Underscores and digits count as word
    characters and are kept.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized, lowercased string.
    """
    lowered = text.lower()
    # Turn each run of non-word characters into a single space.
    without_punctuation = re.sub(r"\W+", " ", lowered)
    # Collapse any remaining whitespace runs, then trim both ends.
    collapsed = re.sub(r"\s+", " ", without_punctuation)
    return collapsed.strip()

def preprocess_text(text):
    """Tokenize, filter and lemmatize ``text`` into a space-joined string.

    Processing steps, in order:
      1. lowercase and tokenize with ``word_tokenize``;
      2. keep only purely alphabetic tokens;
      3. drop English stopwords;
      4. drop tokens for which WordNet returns no synsets;
      5. lemmatize the surviving tokens with ``WordNetLemmatizer``.

    Args:
        text: The string to process. Falsy values (``""``, ``None``) are accepted.

    Returns:
        The processed tokens joined by single spaces, or ``""`` for falsy input.
    """
    if not text:
        return ""

    # Fetch the stopword list once and store it as a set. Previously
    # stopwords.words("english") was evaluated inside the comprehension
    # condition, i.e. re-fetched and linearly scanned for every token.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    # Single filtering pass; the checks run in the same order as before, so
    # the WordNet lookup is only reached by alphabetic non-stopword tokens.
    kept_tokens = [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha()
        and token not in stop_words
        and len(wordnet.synsets(token)) != 0
    ]

    return " ".join(lemmatizer.lemmatize(token) for token in kept_tokens)

In [2]:
import json

# Load the Cranfield documents. The `with` block closes the file on exit, so
# no explicit close() call is needed. The encoding is pinned to UTF-8 (the
# JSON interchange encoding) so the read does not depend on the platform
# default locale.
with open("cranfield/cran_docs.json", "r", encoding="utf-8") as f:
    cranfield = json.load(f)

# Map each document id to its cleaned, filtered and lemmatized body text.
docs = {
    entry["id"]: preprocess_text(clean_text(entry["body"]))
    for entry in cranfield
}

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

# A term must occur in at least this fraction of the documents to enter the
# TF-IDF vocabulary.
MIN_DOC_FRACTION = 0.05

# Convert the fraction to an absolute document count (floored, as before) but
# never let it drop below 1: for fewer than 20 documents the floor would be 0,
# and recent scikit-learn releases reject an integer min_df of 0.
min_doc_count = max(1, int(MIN_DOC_FRACTION * len(docs)))

vectorizer = TfidfVectorizer(min_df=min_doc_count)
# Rows follow the iteration order of `docs`; columns follow `feature_names`.
vectorized_docs = vectorizer.fit_transform(list(docs.values()))
feature_names = vectorizer.get_feature_names_out()

In [29]:
import pandas as pd

# Densify the TF-IDF matrix into a DataFrame: one row per entry of `docs`
# (default 0-based integer index) and one column per vocabulary term.
relevance_matrix = pd.DataFrame(vectorized_docs.toarray(), columns=feature_names)

In [30]:
# Show the document-by-term TF-IDF weights (pandas elides the middle rows/columns).
relevance_matrix

Unnamed: 0,account,accuracy,aerodynamic,agreement,air,aircraft,along,also,analysis,angle,...,velocity,viscous,wall,wave,well,wind,wing,within,work,zero
0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.106986,...,0.091967,0.000000,0.0,0.000000,0.115602,0.0,0.314344,0.0,0.0,0.000000
1,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.173953,0.0,0.146606,0.000000,0.0,0.000000,0.0,0.0,0.000000
2,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000
3,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.101934,0.000000,0.000000,...,0.100769,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000
4,0.0,0.0,0.169776,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,0.0,0.0,0.000000,0.153576,0.0,0.0,0.167553,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000
1396,0.0,0.0,0.000000,0.156934,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.171591
1397,0.0,0.0,0.000000,0.000000,0.0,0.0,0.069438,0.052567,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000
1398,0.0,0.0,0.000000,0.000000,0.0,0.0,0.174196,0.000000,0.134843,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000


In [31]:
def wordnet_similarity(word1, word2):
    """Return the highest WordNet path similarity between two words.

    Every synset of ``word1`` is compared with every synset of ``word2`` via
    ``path_similarity``; pairs for which that call returns ``None`` are
    ignored.

    Args:
        word1: First word to look up in WordNet.
        word2: Second word to look up in WordNet.

    Returns:
        The maximum similarity over all synset pairs, or ``None`` when either
        word has no synsets or no pair yields a similarity value.
    """
    senses_a = wordnet.synsets(word1)
    senses_b = wordnet.synsets(word2)

    best = None
    for sense_a in senses_a:
        for sense_b in senses_b:
            score = sense_a.path_similarity(sense_b)
            if score is None:
                continue
            # Strict comparison keeps the first maximal score, like max().
            if best is None or score > best:
                best = score
    return best

In [32]:
# Build the term-by-term WordNet similarity table as a list of rows.
# Only the diagonal and the cells to its right are computed; the cells to the
# left of the diagonal are filled with 0 rather than mirrored.
similarity_matrix = []
term_count = len(feature_names)

for row_idx, w1 in enumerate(feature_names):
    print(w1)  # progress trace: one line per vocabulary term
    computed = [
        wordnet_similarity(w1, feature_names[col_idx])
        for col_idx in range(row_idx, term_count)
    ]
    similarity_matrix.append([0] * row_idx + computed)

account
accuracy
aerodynamic
agreement
air
aircraft
along
also
analysis
angle
applicable
application
applied
approach
approximate
approximately
approximation
arbitrary
aspect
assumed
assumption
attack
author
available
axial
based
blunt
body
boundary
buckling
calculated
calculation
case
certain
change
characteristic
circular
coefficient
compared
comparison
compressible
condition
cone
configuration
consideration
considered
constant
corresponding
critical
curve
cylinder
cylindrical
data
deflection
density
derived
described
design
detail
determine
determined
developed
development
diameter
difference
different
differential
dimensional
direction
discussed
displacement
distance
distribution
drag
due
dynamic
edge
effect
either
elastic
energy
equation
equilibrium
exact
example
expansion
experiment
experimental
expression
external
factor
field
finite
first
flat
flight
flow
fluid
force
form
formula
found
free
friction
function
gas
general
give
given
good
gradient
heat
heating
high
higher
however


In [33]:
# Wrap the list of rows in a DataFrame labelled by vocabulary term on both axes.
# NOTE: this rebinds `similarity_matrix` from a list of lists to a DataFrame.
# Cells left of the diagonal hold the 0 placeholders written when the rows were built.
similarity_matrix = pd.DataFrame(similarity_matrix, index=feature_names, columns=feature_names)

In [34]:
# Show the term-by-term similarity table (pandas elides the middle rows/columns).
similarity_matrix

Unnamed: 0,account,accuracy,aerodynamic,agreement,air,aircraft,along,also,analysis,angle,...,velocity,viscous,wall,wave,well,wind,wing,within,work,zero
account,1.0,0.166667,0.333333,0.333333,0.333333,0.083333,0.333333,0.333333,0.166667,0.250000,...,0.125000,0.333333,0.166667,0.250000,0.333333,0.333333,0.250000,0.333333,0.333333,0.333333
accuracy,0.0,1.000000,0.142857,0.200000,0.333333,0.071429,0.142857,0.142857,0.142857,0.142857,...,0.125000,0.142857,0.142857,0.142857,0.142857,0.125000,0.142857,0.142857,0.142857,0.142857
aerodynamic,0.0,0.000000,1.000000,0.125000,0.333333,0.083333,0.333333,0.333333,0.142857,0.250000,...,0.125000,0.333333,0.166667,0.250000,0.333333,0.250000,0.250000,0.333333,0.333333,0.333333
agreement,0.0,0.000000,0.000000,1.000000,0.200000,0.066667,0.125000,0.125000,0.200000,0.111111,...,0.142857,0.125000,0.142857,0.142857,0.125000,0.200000,0.125000,0.125000,0.200000,0.125000
air,0.0,0.000000,0.000000,0.000000,1.000000,0.090909,0.333333,0.333333,0.166667,0.250000,...,0.125000,0.333333,0.200000,0.250000,0.333333,0.500000,0.250000,0.333333,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wind,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.333333,0.250000,0.333333,0.250000
wing,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.250000,0.250000,0.250000
within,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.333333,0.333333
work,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.333333


In [35]:
# Relabel the rows 1..N instead of 0..N-1.
# The labels are assigned absolutely rather than by adding 1 to the current
# index, so re-running this cell does not shift them a second time.
relevance_matrix.index = pd.RangeIndex(start=1, stop=len(relevance_matrix) + 1)

In [37]:
# Persist the similarity table; the term labels are written as the first CSV column.
similarity_matrix.to_csv("wordnet_similarities.csv")