In [1]:
import numpy as np
import sklearn as skl
import csv
import nltk
import pandas as pd

In [2]:
def pourcentage_similarity(text1: str, text2: str) -> float:
    '''
    Express the similarity between two strings as a ratio in [0, 1].

    Both texts are tokenized with NLTK, English stop words are dropped
    (case-insensitively) and the remaining tokens are Porter-stemmed.
    The score is the number of distinct stems shared by both texts divided
    by the size of the larger of the two stem sets.

    Parameters
    ----------
    text1, text2 : str
        The texts to compare.

    Returns
    -------
    float
        ``len(common stems) / max(len(stems1), len(stems2))``, or ``0.0``
        when neither text contains any token that is not a stop word.
    '''
    stop_words = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.stem.PorterStemmer()

    def stem_set(text):
        # NLTK's English stop-word list is lower-case, so the membership test
        # is done on the lower-cased token; otherwise "The" would be kept
        # while "the" is removed.
        return {
            stemmer.stem(word)
            for word in nltk.tokenize.word_tokenize(text)
            if word.lower() not in stop_words
        }

    set1 = stem_set(text1)
    set2 = stem_set(text2)

    largest = max(len(set1), len(set2))
    if largest == 0:
        # Both texts are empty or made only of stop words: nothing in common.
        return 0.0
    return len(set1 & set2) / largest

In [3]:
# Load the node table and work on the array returned by `.values`.
node_inf_raw = pd.read_csv("./node_information.csv")
node_inf = node_inf_raw.values
for row in node_inf:
    # A float in column 3 is treated as a missing value (presumably NaN from
    # pandas -- TODO confirm); otherwise the string is split on ", ".
    raw_list = row[3]
    row[3] = [] if type(raw_list) is float else raw_list.split(", ")

    # Same missing-value convention for column 4, replaced by an empty string.
    if type(row[4]) is float:
        row[4] = ''

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Compute the TF-IDF vector of each paper.
# Each document is the text in column 5 of node_inf, tokenized and
# Porter-stemmed, then joined back into a single space-separated string.
stemmer = nltk.stem.PorterStemmer()
corpus = []
for element in node_inf:
    tokens = nltk.tokenize.word_tokenize(element[5])
    corpus.append(' '.join(stemmer.stem(token) for token in tokens))

vectorizer = TfidfVectorizer(stop_words="english")
# Row k of the matrix corresponds to row k of node_inf.
features_TFIDF = vectorizer.fit_transform(corpus)

In [5]:
# Show the (TF-IDF weight, term) pairs of the first document, in vocabulary
# order, keeping only the non-zero weights.
# get_feature_names() was removed in scikit-learn 1.2, so use
# get_feature_names_out() when it exists. The vocabulary is fetched once
# instead of once per non-zero entry.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# Densify only the first row; its length is the vocabulary size, so no
# hard-coded column count is needed.
first_doc_weights = features_TFIDF[0].toarray().ravel()
np.array([(first_doc_weights[j], feature_names[j]) for j in np.flatnonzero(first_doc_weights)])

array([['0.13688111128682906', 'attent'],
       ['0.08436599717492181', 'base'],
       ['0.11147182590838096', 'calabi'],
       ['0.12728624340033676', 'case'],
       ['0.09954676368861036', 'compactif'],
       ['0.21213108193839983', 'compactifi'],
       ['0.0673234645918907', 'consid'],
       ['0.10137106310287669', 'current'],
       ['0.08026433021458468', 'differ'],
       ['0.06461814715134687', 'dimens'],
       ['0.05854793091052995', 'discuss'],
       ['0.08318396443734093', 'exist'],
       ['0.08609793558213376', 'geometri'],
       ['0.07820564378508763', 'given'],
       ['0.10518541031065444', 'heterot'],
       ['0.3928312136226238', 'hypermultiplet'],
       ['0.1125887849517985', 'iia'],
       ['0.10359050035090166', 'iib'],
       ['0.10009008961862523', 'instanton'],
       ['0.2508273480981552', 'k3xt2'],
       ['0.1308665876556487', 'lectur'],
       ['0.0716148980114334', 'limit'],
       ['0.12514324371806376', 'mix'],
       ['0.2826577236716669', 'mod

In [6]:
# Element-wise product of the TF-IDF rows of documents 0 and 1.
ex = features_TFIDF[0].multiply(features_TFIDF[1])

In [20]:
# Dot product of the TF-IDF rows of documents 0 and 4 (a 1x1 sparse result).
# NOTE(review): presumably a cosine similarity if the rows are L2-normalised -- confirm.
ex2 = features_TFIDF[0] @ features_TFIDF[4].T

In [21]:
# Read the single entry of the 1x1 product by index. Unlike float(ex2.data),
# this also works when the product is exactly zero (no stored element) and
# does not rely on converting a 1-d array to a scalar.
float(ex2[0, 0])

0.0774826131202842

In [22]:
# Display the sparse matrix summary (shape, dtype, stored elements, format).
features_TFIDF

<27770x19155 sparse matrix of type '<class 'numpy.float64'>'
	with 1199139 stored elements in Compressed Sparse Row format>

In [24]:
# Number of documents in the corpus (one stemmed string per row of node_inf).
len(corpus)

27770