In [1]:
import ast
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [2]:
def preprocessor(text):
    """Normalise raw text into a list of stemmed tokens.

    The input is coerced to ``str``, lower-cased, stripped of every character
    in ``string.punctuation`` (ASCII punctuation only, so characters such as
    curly quotes survive as tokens), tokenised with ``word_tokenize``,
    filtered against the NLTK English stop-word list and finally stemmed with
    the Porter stemmer.

    Args:
        text: Any value; it is converted with ``str()`` before processing.

    Returns:
        list[str]: Stemmed tokens in their original order, stop words removed.
    """
    # A set gives constant-time membership tests; the stop words were
    # previously kept in a list that was scanned linearly for every token.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Punctuation is removed *before* tokenising, so a contraction such as
    # "don't" reaches the stop-word check as "dont".
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = str(text).lower().translate(strip_punctuation)

    return [stemmer.stem(word) for word in word_tokenize(cleaned) if word not in stop_words]

In [3]:
def concat(*args):
    """Join the string form of every argument, each followed by one space.

    Args:
        *args: Values to combine; each one is converted with ``str()``.

    Returns:
        str: The pieces in argument order. Every piece, including the last,
        is followed by a single space; with no arguments the result is ``""``.
    """
    return "".join(str(piece) + " " for piece in args)

In [4]:
def document_preprocessor(doc):
    """Load a CSV of documents and attach a tokenised ``words`` column.

    The file is read as UTF-8 and missing values are replaced with empty
    strings. For each row the ``abstract``, ``title`` and ``author`` values
    are combined with ``concat`` and the combined text is passed through
    ``preprocessor``.

    Args:
        doc: Path (or other source accepted by ``pd.read_csv``) of a CSV that
            contains ``abstract``, ``title`` and ``author`` columns.

    Returns:
        pandas.DataFrame: The loaded frame with an added ``words`` column
        holding the output of ``preprocessor`` for each row.
    """
    text_columns = ['abstract', 'title', 'author']

    def join_row(row):
        # Same field order as the column list: abstract, then title, then author.
        return concat(row['abstract'], row['title'], row['author'])

    frame = pd.read_csv(doc, encoding="utf-8").fillna('')
    # First store the combined raw text, then replace it with its token list.
    frame['words'] = frame[text_columns].apply(join_row, axis=1)
    frame['words'] = frame['words'].apply(preprocessor)
    return frame

In [6]:
def update_inverted_index(document_list, index=None, output_path='output.txt'):
    """Add the terms of ``document_list`` to an inverted index and save it.

    Each term is mapped to the set of document ids that contain it, where a
    document's id is its 0-based position within ``document_list``. The index
    is updated in place and its ``repr`` is then written to ``output_path``
    as UTF-8 text, replacing any previous content of that file.

    NOTE(review): if the index already holds ids that came from a different
    list, these positions may collide with them — confirm the intended id
    scheme with the caller.

    Args:
        document_list: Iterable of documents, each an iterable of hashable
            terms (e.g. the token lists produced by ``preprocessor``).
        index: Dict mapping term -> set of document ids to update. When
            omitted, the notebook-level ``inverted_index`` global is used, so
            that name must already be defined.
        output_path: File the updated index is written to.

    Returns:
        None. The index is mutated in place.
    """
    if index is None:
        # Backward-compatible default: fall back to the notebook-level index.
        index = inverted_index

    for doc_id, doc in enumerate(document_list):
        for term in doc:
            # setdefault creates the posting set on first sight of a term.
            index.setdefault(term, set()).add(doc_id)

    # Persist the index as a Python literal so it can be read back with
    # ast.literal_eval.
    with open(output_path, 'w', encoding="utf-8") as output:
        output.write(f"{index}")

In [7]:
# Read back the text stored in output.txt, the file update_inverted_index()
# writes the index to. The file must already exist for this cell to succeed.
with open('output.txt', encoding="utf-8") as file:
    content = file.read()

In [9]:
# Rebuild the inverted index from the text read from output.txt.
# literal_eval only accepts Python literals, so the file content is parsed
# rather than executed as code.
inverted_index = ast.literal_eval(content)
# Tokenise both corpora; each list holds the 'words' value of every CSV row.
updated_document = document_preprocessor("update.csv")
updated_document_list = updated_document['words'].to_list()
documents = document_preprocessor("documents.csv")
list_of_documents = documents['words'].to_list()

In [10]:
# Preview only the first two documents; echoing the whole list floods the cell output.
updated_document_list[:2]

[['rethink', 'foundat', 'refuge', 'law', 'romit', 'bhandari'],
 ['grow',
  'global',
  'demand',
  'sustain',
  'environment',
  'friendli',
  'energi',
  'stimul',
  'rapid',
  'adopt',
  'renew',
  'offshor',
  'energi',
  'howev',
  'infrastructur',
  'need',
  'green',
  'energi',
  'product',
  'iron',
  'impact',
  'upon',
  'marin',
  'speci',
  'biodivers',
  'consequ',
  'posit',
  'review',
  'paper',
  'seek',
  'comprehens',
  'synthesis',
  'prevail',
  'bodi',
  'knowledg',
  'impact',
  'offshor',
  'energi',
  'develop',
  'broad',
  'rang',
  'marin',
  'speci',
  'interpretivist',
  'philosoph',
  'stanc',
  'induct',
  'reason',
  'adopt',
  'use',
  'scientometr',
  'analysi',
  'conduct',
  'rich',
  'synthesi',
  'extant',
  'literatur',
  'oper',
  'perspect',
  'scopu',
  'databas',
  'utilis',
  'search',
  'key',
  'term',
  'phenomena',
  'investig',
  'use',
  'vo',
  'viewer',
  'softwar',
  'identif',
  'trend',
  'use',
  'scientometr',
  'map',
  'analys

In [11]:
# Preview only the first two documents; echoing the whole list floods the cell output.
list_of_documents[:2]

[['rethink', 'foundat', 'refuge', 'law', 'romit', 'bhandari'],
 ['despit',
  'mani',
  'benefit',
  'cycl',
  'still',
  'widespread',
  'percept',
  'ride',
  'bicycl',
  'public',
  'road',
  'unsaf',
  'substanti',
  'increas',
  'cycl',
  'research',
  'past',
  'decad',
  'littl',
  'work',
  'explor',
  'challeng',
  'greater',
  'uptak',
  'cycl',
  'ridercent',
  'perspect',
  'explor',
  'research',
  'undertook',
  'larg',
  'intern',
  'survey',
  'experienc',
  'cyclist',
  'rider',
  'perspect',
  'explor',
  'use',
  'indepth',
  'process',
  'call',
  'critic',
  'decis',
  'method',
  'result',
  'reveal',
  'wide',
  'rang',
  'selfreport',
  'cycl',
  'experi',
  'respond',
  'classifi',
  'either',
  'strong',
  'fearless',
  'enthus',
  'confid',
  'actual',
  'differ',
  'respect',
  'threaten',
  'incid',
  'rider',
  'countermeasur',
  'present',
  'illustr',
  'overal',
  'similarli',
  'experienc',
  'cyclist',
  'respond',
  'threaten',
  'incid',
  'overarch'

In [17]:
# Collect the updated documents that have no identical token list in the
# existing corpus, printing each one's position in updated_document_list.
# Token lists are converted to tuples so they can be looked up in a set
# instead of being compared against every existing document in turn.
known_documents = {tuple(tokens) for tokens in list_of_documents}
list_difference = []
# enumerate yields the true position of each document; list.index() would
# report the first occurrence, which is wrong when a document appears twice.
for position, element in enumerate(updated_document_list):
    if tuple(element) not in known_documents:
        list_difference.append(element)
        print(position)

95
422
927
1943
2153
2162
2686
2881
3178
3237
3291
3516
3853
4127
4146
4387
5128
5164
7985
9328
9828
17312
19611
20724
23072
23916
24115
25939
26003
26089
26368
27061
28394
29076
29093
29581
29909
30340
30760
30789
30834
30885
30894
30964
30998


In [14]:
# Number of updated documents with no identical token list in list_of_documents.
len(list_difference)

45

In [15]:
# Number of rows loaded from update.csv.
len(updated_document_list)


31025

In [16]:
# Number of rows loaded from documents.csv, for comparison with the updated corpus.
len(list_of_documents)

31018

In [18]:
# Inspect position 95 of the updated corpus — the first position reported by the comparison loop.
updated_document_list[95]

['purpos',
 'author',
 'investig',
 'whether',
 'individu',
 '“',
 'complet',
 'contribut',
 '”',
 'enabl',
 'onlin',
 'crowdfund',
 'campaign',
 'meet',
 'exceed',
 'target',
 'tend',
 'larger',
 'rel',
 'term',
 'made',
 'nearer',
 'fund',
 'deadlin',
 'contribut',
 'like',
 'disproportion',
 'impact',
 'upon',
 'campaign',
 'outcom',
 'author',
 'assess',
 'whether',
 'invest',
 'pattern',
 'observ',
 'consist',
 'theori',
 'impact',
 'philanthropi',
 'designmethodologyapproach',
 'author',
 'use',
 'campaignlevel',
 'data',
 'incorpor',
 'observ',
 'campaign',
 'reward',
 'allornoth',
 'aon',
 'keepital',
 'kia',
 'donat',
 'equitybas',
 'platform',
 'knowledg',
 'author',
 'coverag',
 'data',
 'unparallel',
 'elsewher',
 'crowdfund',
 'literatur',
 'use',
 'data',
 'author',
 'analyz',
 'whether',
 'complet',
 'contribut',
 'tend',
 'vari',
 'conting',
 'upon',
 'proxim',
 'deadlin',
 'form',
 'crowdfund',
 'find',
 'author',
 'find',
 'complet',
 'contribut',
 'tend',
 'vari',
 '

In [19]:
# Same position in the existing corpus, for side-by-side comparison with updated_document_list[95].
list_of_documents[95]

['long',
 'noncod',
 'rna',
 'lncrna',
 'class',
 'noncod',
 'rna',
 'play',
 'signific',
 'role',
 'sever',
 'biolog',
 'process',
 'accur',
 'identif',
 'subclassif',
 'lncrna',
 'crucial',
 'explor',
 'characterist',
 'function',
 'genom',
 'code',
 'potenti',
 'comput',
 'cpc',
 'tool',
 'fail',
 'accur',
 'identifi',
 'classifi',
 'predict',
 'biolog',
 'function',
 'plant',
 'speci',
 'studi',
 'novel',
 'comput',
 'framework',
 'call',
 'lncrna',
 'identif',
 'function',
 'predict',
 'tool',
 'lift',
 'develop',
 'implement',
 'least',
 'absolut',
 'shrinkag',
 'select',
 'oper',
 'lasso',
 'optimis',
 'iter',
 'random',
 'forest',
 'classif',
 'select',
 'optim',
 'featur',
 'novel',
 'positionbas',
 'classif',
 'pbc',
 'method',
 'subclassifi',
 'lncrna',
 'differ',
 'class',
 'bayesianbas',
 'function',
 'predict',
 'approach',
 'annot',
 'lncrna',
 'transcript',
 'use',
 'lasso',
 'lift',
 'select',
 '31',
 'optim',
 'featur',
 'achiev',
 '1530',
 'improv',
 'predict',
 'acc

In [20]:
# Load update.csv as-is (no fillna or tokenising) so it can be written back out unchanged.
data = pd.read_csv('update.csv')

In [21]:
# Replace documents.csv with the contents of update.csv; this overwrites the
# previous corpus file. index=False keeps the DataFrame's row index out of the
# file, so the saved CSV has the same columns as update.csv instead of gaining
# an extra unnamed index column.
data.to_csv('documents.csv', index=False)

In [22]:
# Re-run the preprocessing on the freshly overwritten documents.csv so the
# comparison below uses the new file rather than the earlier in-memory lists.
documents = document_preprocessor("documents.csv")
list_of_documents = documents['words'].to_list()

In [23]:
# Repeat the comparison against the refreshed corpus: collect the updated
# documents that have no identical token list in list_of_documents and print
# each one's position in updated_document_list.
# Token lists are converted to tuples so they can be looked up in a set
# instead of being compared against every existing document in turn.
known_documents = {tuple(tokens) for tokens in list_of_documents}
list_difference = []
# enumerate yields the true position of each document; list.index() would
# report the first occurrence, which is wrong when a document appears twice.
for position, element in enumerate(updated_document_list):
    if tuple(element) not in known_documents:
        list_difference.append(element)
        print(position)

In [24]:
# Number of updated documents still missing after documents.csv was rewritten from update.csv.
len(list_difference)

0