In [1]:
import nltk
import numpy as np
from fuzzywuzzy import fuzz

from collections import defaultdict
from numpy.linalg import norm
from numpy import dot
from scipy import stats

In [2]:
import os

#download tokenizer
current_directory = os.getcwd()
path=current_directory+'/nltk_data'
nltk.download('punkt_tab',download_dir=path)
nltk.download('wordnet',download_dir=path)
nltk.download('omw-1.4',download_dir=path)
nltk.data.path.append(path)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/azureuser/npl2024/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/azureuser/npl2024/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/azureuser/npl2024/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## 2)

In [3]:
#read files
with open('resources/abstracts3.txt','r') as f:
    data=f.read().lower().strip()

punctuation=['"','!','.',',',"'",'(',')',';','``',"''",'?','_',':','-']
punctuation=[]

data=data.split('-next-')
data=[ab.strip().split('\n')[:] for ab in data][:]

#remove punctuation from abstracts
for punct in punctuation:
    for i in range(len(data)):
        data[i][2]=data[i][2].replace(punct,'')

#split authors and keywords
for i in range(len(data)):
    data[i][1]=data[i][1].split(',')
    data[i][3]=data[i][3].split(',')
data

[['machine learning and its applications for plasmonics in biology',
  ['gwiyeong moon',
   ' jongha lee',
   ' hyunwoong lee',
   ' hajun yoo',
   ' kwanhwi ko',
   ' seongmin im',
   ' donghyun kim'],
  'machine learning (ml) has drawn tremendous interest for its capacity to extract useful information that may be overlooked with conventional analysis techniques and for its versatility in a wide range of research domains, including biomedical sensing and imaging. in this perspective, we provide an overview focused on the uses and benefits of ml in areas of plasmonics in biology. ml methodologies for processing data from plasmonic biosensing and imaging systems by supervised and unsupervised learning to achieve enhanced detection and quantification of target analytes are described. in addition, deep learning-based approaches to improve the design of plasmonic structures are presented. data analysis based on ml for classification, regression, and clustering by dimension reduction is pre

In [4]:
len(data)

30

In [5]:
#custom dictionary class to conveniently add new items
class inverted_index_dict:
    def __init__(self) -> None:
        self.dict=defaultdict()

    def add_item(self,token,doc):
        if self.dict.__contains__(token):
            if doc not in self.dict[token]:
                self.dict[token].append(doc)
        else:
            self.dict[token]=[doc]

inverted_index=inverted_index_dict()

for j,ab in enumerate(data):
    for i,kw in enumerate(ab[3]):
        inverted_index.add_item(kw.strip(),j)

inverted_index.dict

defaultdict(None,
            {'machine learning': [0,
              1,
              2,
              4,
              5,
              6,
              7,
              8,
              9,
              10,
              11,
              14,
              15,
              16,
              17,
              21,
              23,
              24,
              25,
              26,
              27,
              28],
             'plasmonics': [0],
             'data analysis': [0],
             'structure design': [0],
             'deep learning': [0, 4, 8, 10, 13, 18],
             'biosensors': [0],
             'imaging': [0],
             'mathematical modeling': [1],
             'reinforcement learning': [1],
             'systems biology': [1],
             'simulation': [1],
             'systematic literature review': [1],
             'healthcare analytics': [2],
             'artificial intelligence': [2, 4, 17, 21],
             'medical research': [2],
             

In [6]:
#ordering
kk=list(inverted_index.dict.items())
kk=sorted(kk,key=lambda x: x[0])
kk

[('adaptive mpc', [20]),
 ('additive manufacturing', [5]),
 ('ai', [10]),
 ('algorithms', [2]),
 ('allostery', [9]),
 ('analytical threshold', [10]),
 ('ann', [10]),
 ('antibodies', [25]),
 ('artificial intelligence', [2, 4, 17, 21]),
 ('artificial neural networks', [10]),
 ('at', [10]),
 ('atomic simulation', [28]),
 ('battery cell manufacturing', [14]),
 ('bayesian networks', [10]),
 ('bayesian optimization', [14]),
 ('bibliometric analysis', [2, 26]),
 ('big data', [5]),
 ('biopharmaceuticals', [23]),
 ('bioprocesses', [23]),
 ('biosensing system', [6]),
 ('biosensors', [0]),
 ('biosensors integration', [6]),
 ('bn', [10]),
 ('cahn-hilliard model', [3]),
 ('capillary electrophoresis', [10]),
 ('carbon quantum dot', [7]),
 ('carbon quantum dots', [19]),
 ('cart', [10]),
 ('catalysis', [28]),
 ('ce', [10]),
 ('chaos', [13]),
 ('cheminformatics', [18]),
 ('chronic lymphocytic leukemia', [21]),
 ('circular economy', [24]),
 ('classification', [13, 29]),
 ('classification and regression 

In [7]:
'''
as suggested by the TA, if we have multiple words as keyword (e.g. artificial intelligence),
we split the keyword as 'artificial' and 'intelligence'
'''
#concatenate keywords
concatenated_keywords=set([word.strip() for i,ab in enumerate(data) for word in ab[3]])
T='Advancement in Physic and Biology Science by Machine learning'
output='T: artificial intelligence\n'
#flatten each word in keyword
keywords=[k.split() for k in concatenated_keywords]
keywords=[p for x in keywords for p in x]
keywords=set(keywords)
X=[[k,0] for k in keywords]

for i,kw in enumerate(keywords):
    simil=min(nltk.distance.edit_distance('artificial',kw),nltk.distance.edit_distance('intelligence',kw))
    X[i][1]=simil
    output+=f'{kw}: {simil}\n'
X=sorted(X,key=lambda x: x[0])
print(output)

T: artificial intelligence
software: 9
biology: 9
vector: 9
photoelectrochemical: 15
failure: 9
neighbours: 10
catalysis: 8
analysis: 8
applications: 9
analytics: 8
cell: 8
human: 9
sensors: 10
engineering: 9
functional: 7
density-based: 11
artificial: 0
advancements: 10
neighbour: 9
development: 8
logistic: 8
predictive: 7
optimization: 8
thermal: 7
cahn-hilliard: 9
nucleotide: 9
extrapolation: 11
electrochemical: 11
microalloying: 10
intelligent: 2
discovery: 10
heavy: 10
t-sne: 9
encapsulation: 11
extreme: 9
manufacturing: 10
synthesis: 9
biopharmaceuticals: 13
chemistry: 9
supply: 10
spatial: 7
values: 10
data-driven: 9
tree: 9
maximum: 9
pgs: 10
microfluidics: 10
machine: 9
relation: 8
learning-based: 10
horizon: 9
controlled: 9
hid: 9
pg: 10
energy: 9
battery: 9
model: 9
identification: 8
lymphocytic: 10
reinforcement: 10
k-nearest: 10
language: 9
electropherogram: 14
classification: 8
deep: 10
repeat: 8
numerical: 6
strength: 9
models: 10
dynamics: 8
economy: 10
allostery: 9
sof

In [8]:
print(X)
len(X),len(keywords)

[['adaptive', 8], ['additive', 7], ['advancements', 10], ['adversarial', 7], ['affinity', 6], ['ai', 8], ['algorithms', 8], ['allele', 8], ['allostery', 9], ['alloy', 9], ['analysis', 8], ['analytical', 6], ['analytics', 8], ['and', 9], ['ann', 9], ['antibodies', 6], ['applications', 9], ['artificial', 0], ['at', 8], ['atomic', 6], ['batteries', 8], ['battery', 9], ['bayes', 10], ['bayesian', 8], ['bibliometric', 11], ['big', 9], ['binding', 8], ['biology', 9], ['biopharmaceuticals', 13], ['bioprocesses', 11], ['biosensing', 9], ['biosensors', 10], ['bn', 10], ['cahn-hilliard', 9], ['calculations', 10], ['capillary', 9], ['carbon', 9], ['carlo', 9], ['cart', 8], ['catalysis', 8], ['ce', 9], ['cell', 8], ['chain', 9], ['chains', 9], ['change', 10], ['chaos', 9], ['cheminformatics', 12], ['chemistry', 9], ['chronic', 8], ['circular', 8], ['classification', 8], ['clean', 9], ['climate', 9], ['clustering', 9], ['cnn', 9], ['component', 10], ['computational', 9], ['configuration', 11], ['co

(303, 303)

## 3)

In [9]:
Y=[]
concatenated_keywords=set([word.strip() for i,ab in enumerate(data) for word in ab[3]])

split_kk=[[k[0].split(),k[1]] for k in kk]
for kws,ids in split_kk:
    for kw in kws:
        for id in ids:
            simils=[]
            for word in data[id][0].split():
                simils.append(nltk.distance.edit_distance(word,kw))
            simil=min(simils)
            Y.append([kw,id,simil])
#each element: keyword, document it appears in, edit distance with the title of that document

In [10]:
Y=sorted(Y,key=lambda x: x[0])
i=0
new_Y=[]
w=Y[0][0]
distances=[]
while i<len(Y):
    if Y[i][0]!=w:
        new_Y.append([w,min(distances)])
        w=Y[i][0]
        distances=[Y[i][2]]
    else:
        distances.append(Y[i][2])
    i+=1
new_Y.append([w,min(distances)])
new_Y

[['adaptive', 6],
 ['additive', 0],
 ['advancements', 9],
 ['adversarial', 8],
 ['affinity', 0],
 ['ai', 1],
 ['algorithms', 7],
 ['allele', 5],
 ['allostery', 2],
 ['alloy', 0],
 ['analysis', 0],
 ['analytical', 5],
 ['analytics', 0],
 ['and', 2],
 ['ann', 2],
 ['antibodies', 3],
 ['applications', 0],
 ['artificial', 4],
 ['at', 1],
 ['atomic', 5],
 ['batteries', 3],
 ['battery', 0],
 ['bayes', 4],
 ['bayesian', 6],
 ['bibliometric', 0],
 ['big', 0],
 ['binding', 5],
 ['biology', 0],
 ['biopharmaceuticals', 12],
 ['bioprocesses', 5],
 ['biosensing', 3],
 ['biosensors', 0],
 ['bn', 1],
 ['cahn-hilliard', 10],
 ['calculations', 8],
 ['capillary', 7],
 ['carbon', 0],
 ['carlo', 4],
 ['cart', 3],
 ['catalysis', 6],
 ['ce', 2],
 ['cell', 1],
 ['chain', 3],
 ['chains', 4],
 ['change', 0],
 ['chaos', 4],
 ['cheminformatics', 11],
 ['chemistry', 1],
 ['chronic', 0],
 ['circular', 0],
 ['classification', 0],
 ['clean', 4],
 ['climate', 0],
 ['clustering', 0],
 ['cnn', 2],
 ['component', 7],
 [

## 4)

In [11]:
npX=np.array([x[1] for x in X])
npY=np.array([y[1] for y in new_Y])
stats.pearsonr(npX,npY)

PearsonRResult(statistic=0.16754636796107703, pvalue=0.0034433203496025537)

## 5)

In [12]:
#get all unique keywords
concatenated_keywords=set([word.strip() for i,ab in enumerate(data) for word in ab[3]])
keywords=[k.split() for k in concatenated_keywords]
keywords=[p for x in keywords for p in x]
keywords=list(set(keywords))
M=np.zeros((len(data),len(keywords)))

for i,kw in enumerate(keywords):
    for j,ab in enumerate(data):
        abstract=data[j][2]
        num=abstract.count(kw)
        M[j,i]=num

M

array([[0., 2., 0., ..., 0., 0., 0.],
       [0., 3., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## 6)

In [14]:
def boolean_matching(query,db=M,kws=keywords):
    qkeys=query.split()
    result=np.ones(len(data))
    for qkey in qkeys:
        i=kws.index(qkey)
        doc_res=db[:,i]
        doc_res[doc_res>0]=1
        result=np.logical_and(result,doc_res)
    return result

query='Advancement in Physic and Biology Science by Machine learning'
query=query.lower()
boolean_matching(query)

ValueError: 'advancement' is not in list

## 7)

In [15]:
n=len(data)
df=np.zeros(len(keywords))
for i,col in enumerate(M.T):
    df[i]=len(col[col>0])

idf=np.zeros(len(keywords))
for i,f in enumerate(df):
    if df[i]==0:
        idf[i]=0
    else:
        idf[i]=np.log10(n/f)


tfidf_M=M*idf

def tfidf_query_indexer(query,tfidf=tfidf_M,kws=keywords,df=df):
    qkeys=query.split()
    indexed_query=np.zeros(len(keywords))

    for qkey in qkeys:
        if qkey not in keywords:
            continue
        i=kws.index(qkey)
        tf=qkeys.count(qkey)
        idf=np.log10(n/df[i])

        indexed_query[i]=tf*idf
    return indexed_query



query='Advancement in Physic and Biology Science by Machine learning'
indexed_query=tfidf_query_indexer(query)


In [16]:
similiraty_scores=[]*len(data)

for row in tfidf_M:
    sim_score=dot(indexed_query,row) / (norm(indexed_query)*norm(row))
    similiraty_scores.append(sim_score)

print(similiraty_scores)

print(f"Most similar document : {np.argmax(similiraty_scores)}")


[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]
Most similar document : 0


  sim_score=dot(indexed_query,row) / (norm(indexed_query)*norm(row))


# 8

In [17]:
Z = []*len(data)

similarity_fw_abs_title = 0

for doc in data:
    title=doc[0]
    abstract=doc[2]
    ratio = fuzz.partial_ratio(title, abstract)
    if ratio > 80:
        similarity_fw_abs_title +=1
    Z.append(ratio)

print(Z)

print(f"There are {similarity_fw_abs_title} accepted matches with fuzzywuzzy on {len(data)} documents")


[65, 51, 74, 52, 58, 74, 57, 72, 54, 66, 61, 54, 50, 59, 52, 55, 61, 61, 59, 56, 53, 51, 59, 59, 52, 64, 46, 55, 56, 83]
There are 1 accepted matches with fuzzywuzzy on 30 documents


We see that all our documents' title don't match precisely with the content of the corresponding abstract. In general, with our data, the words present in title don't often appear in the abstract.
To get more matches, we could lower the threshold to 70%, and we would get 3 matches, but this only depends on the data you select. 