# Initialization

Import Dependencies

In [1]:
#mount drive
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import nltk
import math
import operator
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
import string
import json
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
np.random.seed(2018)
nltk.download('wordnet')
import random

Mounted at /content/drive
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Dataset and Dataframes

In [8]:
#json file has complete data with column attributes
dataset = pd.read_csv('/content/drive/MyDrive/AI Assignment/dataset/journals_26800.csv', thousands=",")

headlines = dataset['Title'].to_numpy()
short_desc = dataset['Scope'].to_numpy()

print(headlines[781])
print(short_desc[781])

#total docs actually present in corpus
print(len(dataset))
total_reviews = len(dataset)

JAMA Ophthalmology
JAMA Ophthalmology, published continuously since 1869, is an international, peer-reviewed ophthalmology and visual science journal. In 2019, the journal celebrates 150 years of continuous publication. JAMA Ophthalmology is a member of the JAMA Network, a consortium of peer-reviewed, general medical and specialty publications.
26800


# Preprocessing data

Stemming and tokenization

In [3]:
'''
Preprocessing function - Porter stemmer
'''
def preprocess_docs(data):
    processed_data=[]
    stemmer = PorterStemmer()
    for d in data:
        tempf=[]
        temp = str(d)
        temp = ''.join(c for c in temp if c not in string.punctuation)
        temp = nltk.word_tokenize(temp)
        temp = [w.lower() for w in temp]
        temp = [t for t in temp if t not in nltk.corpus.stopwords.words('english')]
        for word in temp:
          tempf.append(stemmer.stem(word))
        processed_data.append(tempf)
    return processed_data

In [7]:

headlines = preprocess_docs(headlines)
short_desc = preprocess_docs(short_desc)

print(headlines[781])
print(short_desc[781])

['jama', 'ophthalmolog']
['jama', 'ophthalmolog', 'publish', 'continu', 'sinc', '1869', 'intern', 'peerreview', 'ophthalmolog', 'visual', 'scienc', 'journal', '2019', 'journal', 'celebr', '150', 'year', 'continu', 'public', 'jama', 'ophthalmolog', 'member', 'jama', 'network', 'consortium', 'peerreview', 'gener', 'medic', 'specialti', 'public']


# Master Dictionary - Index Construction

Constructing Master Dictionary

In [None]:
"""
example of an entry to master dictionary
master_dict = 
{
  'dog':
  {
    doc_number:
    {
      'tf':10, 
      'df':15, 
      'tf-idf':0.6
    }
  }
}
"""

master_dict = {}


#construct with headlines first
review = headlines

tf_factor = 5

# updating tf using headlines with weighted tf
for doc_num in range(len(review)):
  curr_review = review[doc_num]

  for curr_term in curr_review:

    if curr_term in master_dict:

      if doc_num in master_dict[curr_term]:
        master_dict[curr_term][doc_num]['tf'] += 1*tf_factor
      else:
        #put the doc_id for that term in master if doc_id not present
        master_dict[curr_term].update({doc_num:{'tf':1*tf_factor, 'df':0, 'tf-idf':0.1}})

    # put the term in master
    else:
      master_dict.update({curr_term:{doc_num:{'tf':1*tf_factor, 'df':0, 'tf-idf':0.1}}})


# updating tf using short description next
review = short_desc
for doc_num in range(len(review)):
  curr_review = review[doc_num]

  for curr_term in curr_review:

    if curr_term in master_dict:

      if doc_num in master_dict[curr_term]:
        master_dict[curr_term][doc_num]['tf'] += 1
      else:
        #put the doc_id for that term in master if doc_id not present
        master_dict[curr_term].update({doc_num:{'tf':1, 'df':0, 'tf-idf':0.1}})

    # put the term in master
    else:
      master_dict.update({curr_term:{doc_num:{'tf':1, 'df':0, 'tf-idf':0.1}}})


#master_dict
print(master_dict['cancer'])
 

updating df and tf-idf values

In [11]:



for curr_term in master_dict:

  for doc_num in master_dict[curr_term]:

    # len(master_dict[curr_term]) stores total doc numbers that term has or total docs in which that term present
    master_dict[curr_term][doc_num]['df'] = len(master_dict[curr_term])

    # calculates tf-idf using formula
    master_dict[curr_term][doc_num]['tf-idf'] = (master_dict[curr_term][doc_num]['tf'])*(math.log10(total_reviews/master_dict[curr_term][doc_num]['df']))




KeyError: ignored

L2 Norm Normalisation of tf-idf

In [12]:
norm_factor = {}

for curr_term in master_dict:

  for doc_num in master_dict[curr_term]:

    if doc_num in norm_factor:
      norm_factor[doc_num] += master_dict[curr_term][doc_num]['tf-idf']*master_dict[curr_term][doc_num]['tf-idf']
    else:
      norm_factor.update({doc_num: 0.2})
      norm_factor[doc_num] = master_dict[curr_term][doc_num]['tf-idf']*master_dict[curr_term][doc_num]['tf-idf']

# square root the norm factors

for doc_num in norm_factor:
  norm_factor[doc_num] = math.sqrt(norm_factor[doc_num])

# normalise tf-idf of master dictionary

for curr_term in master_dict:

  for doc_num in master_dict[curr_term]:

    master_dict[curr_term][doc_num]['tf-idf'] = master_dict[curr_term][doc_num]['tf-idf']/norm_factor[doc_num]



In [13]:
print(master_dict['cancer'])

#total unique words in corpus
print(len(master_dict))

{0: {'tf': 11, 'df': 495, 'tf-idf': 0.7001447668840078}, 9: {'tf': 7, 'df': 495, 'tf-idf': 0.6758749980547284}, 50: {'tf': 12, 'df': 495, 'tf-idf': 0.7801909034961719}, 136: {'tf': 8, 'df': 495, 'tf-idf': 0.7199476832629027}, 255: {'tf': 8, 'df': 495, 'tf-idf': 0.6332248202179284}, 264: {'tf': 7, 'df': 495, 'tf-idf': 0.47252204973718676}, 310: {'tf': 13, 'df': 495, 'tf-idf': 0.632868768544402}, 312: {'tf': 13, 'df': 495, 'tf-idf': 0.7492354203692209}, 358: {'tf': 9, 'df': 495, 'tf-idf': 0.4290672422622632}, 373: {'tf': 8, 'df': 495, 'tf-idf': 0.7625515090956372}, 391: {'tf': 8, 'df': 495, 'tf-idf': 0.6655585542935982}, 468: {'tf': 9, 'df': 495, 'tf-idf': 0.5909866062186152}, 492: {'tf': 7, 'df': 495, 'tf-idf': 0.374800006597582}, 512: {'tf': 8, 'df': 495, 'tf-idf': 0.5722886139707346}, 517: {'tf': 5, 'df': 495, 'tf-idf': 0.20025669085783512}, 528: {'tf': 7, 'df': 495, 'tf-idf': 0.6582763290530127}, 642: {'tf': 11, 'df': 495, 'tf-idf': 0.7377017935499602}, 694: {'tf': 6, 'df': 495, 'tf-

Save my files and load

In [4]:
import pickle

#save objects
def store_data():

  pickle_out = open("/content/drive/My Drive/IR Assignment/saved objects/master_dict.pickle","wb")
  pickle.dump(master_dict, pickle_out)
  pickle_out.close()

#store_data()


#load objects
def load_data():

  pickle_in = open("/content/drive/My Drive/IR Assignment/saved objects/master_dict.pickle","rb")
  temp_master_dict = pickle.load(pickle_in)
  pickle_in.close()

  return temp_master_dict

master_dict = load_data()



# Extracting results from query

Function to get results

In [9]:
def getQResults(myquery, srch_in):

  sample_query = [myquery]

  #preprocessing query before evaulation
  sample_query = preprocess_docs(sample_query)
  query = sample_query[0]
  print(query)

  query_dict = {}

  for query_term in query:

  # query term should be present in my master dictionary
    if query_term in master_dict:

      if query_term in query_dict:
        query_dict[query_term]['tf'] += 1
        query_dict[query_term]['df'] = len(master_dict[query_term])
        query_dict[query_term]['tf-idf'] = (1+math.log10(query_dict[query_term]['tf']))*(math.log10(total_reviews/query_dict[query_term]['df']))
      else:
        query_dict.update({query_term:{'tf':1, 'df':0, 'tf-idf':0.1}})
        query_dict[query_term]['df'] = len(master_dict[query_term])
        query_dict[query_term]['tf-idf'] = (1+math.log10(query_dict[query_term]['tf']))*(math.log10(total_reviews/query_dict[query_term]['df']))


  #normalise tf-idf for query
  q_norm_factor = 0;

  for term in query_dict:
    q_norm_factor += query_dict[term]['tf-idf']*query_dict[term]['tf-idf']

  q_norm_factor = math.sqrt(q_norm_factor)

  for term in query_dict:
    query_dict[term]['tf-idf'] = query_dict[term]['tf-idf']/q_norm_factor

  
  #calculating cosine similarity


  result = {} #to store all the documents as result and their cosine similarity score

  for query_term in query_dict:

    for doc_num in master_dict[query_term]:

      if doc_num in srch_in:

        if doc_num in result:
          result[doc_num] += query_dict[query_term]['tf-idf']*master_dict[query_term][doc_num]['tf-idf']
        else:
          result.update({doc_num: 0.1})
          result[doc_num] = query_dict[query_term]['tf-idf']*master_dict[query_term][doc_num]['tf-idf']


  sorted_result_lda = sorted(result.items(), key=itemgetter(1),reverse=True)
  
  final_list = list()

  for ind in range(len(sorted_result_lda)):
    final_list.append(sorted_result_lda[ind][0])

  return final_list


In [7]:
getQResults('Nature Biology')

TypeError: ignored

### Experta

In [10]:
!pip install experta
from experta import *

Collecting experta
  Downloading https://files.pythonhosted.org/packages/03/5d/c06fad9dadbec34d95f548bca648ec0de2afd6f8eb2247194150ad38ee8f/experta-1.9.4-py3-none-any.whl
Collecting schema==0.6.7
  Downloading https://files.pythonhosted.org/packages/5d/42/32c059aa876eb16521a292e634d18f25408b2441862ff823f59af273d720/schema-0.6.7-py2.py3-none-any.whl
Collecting frozendict==1.2
  Downloading https://files.pythonhosted.org/packages/4e/55/a12ded2c426a4d2bee73f88304c9c08ebbdbadb82569ebdd6a0c007cfd08/frozendict-1.2.tar.gz
Building wheels for collected packages: frozendict
  Building wheel for frozendict (setup.py) ... [?25l[?25hdone
  Created wheel for frozendict: filename=frozendict-1.2-cp37-none-any.whl size=3150 sha256=1a696fd74956f9ad1bb3582b5b6fc24b82c585a206fbbf2950988bc9230dc6ca
  Stored in directory: /root/.cache/pip/wheels/6c/6c/e9/534386165bd12cf1885582c75eb6d0ffcb321b65c23fe0f834
Successfully built frozendict
Installing collected packages: schema, frozendict, experta
Successfully

In [11]:
from operator import itemgetter

In [19]:
class JournalExtractor(KnowledgeEngine):
  @DefFacts()
  def _initial_action(self):  
    print("--Finding required Journals--") 
    yield Fact(action="find_journal")

  def setAtt(self,sub,i_fac,pub,key_w):
    self.subject = sub
    self.i_factor = i_fac
    self.publisher = pub
    self.key_words = key_w

  @Rule(Fact(action="find_journal"), NOT(Fact(subject=W())), salience=100)
  def input_0(self):
    self.declare(Fact(subject=self.subject))

  @Rule(Fact(action="find_journal"), NOT(Fact(i_factor=W())), salience=100)
  def input_1(self):
    self.declare(Fact(i_factor=self.i_factor))

  @Rule(Fact(action="find_journal"), NOT(Fact(publisher=W())), salience=100)
  def input_2(self):
    self.declare(Fact(publisher=self.publisher))
  
  @Rule(Fact(action="find_journal"), NOT(Fact(key_words=W())), salience=100)
  def input_3(self):
    self.declare(Fact(key_words=self.key_words))

  # function for subject
  @Rule(Fact(action="find_journal"),Fact(subject=MATCH.subject), salience=90)
  def action_sub(self,subject):
    res_t = []
    if subject == "All":
      for num in range(len(dataset)):
        res_t.append(num)
      self.temp_result = res_t
      print(f'RULE (Subject->All subjects); got {len(res_t)} documents')
      return
    for i in range(len(dataset)):
      if subject == dataset.iloc[i]['Subject']:
        res_t.append(i)
    self.temp_result = res_t
    print(f'RULE (Subject->{subject}); got {len(res_t)} documents')

  # function for i_factor
  @Rule(Fact(action="find_journal"),Fact(i_factor=MATCH.i_factor),salience=80)
  def action_i_fac(self,i_factor):
    res_t = []
    i_factor = int(i_factor)
    for i in self.temp_result:
      i_th = dataset.iloc[i]['Cites / Doc. (2years)']/100.0
      if i_factor <= i_th:
        res_t.append(i)
    self.temp_result = res_t
    print(f'RULE (Impact factor->{i_factor}); got {len(res_t)} documents')

  # function for Publisher
  @Rule(Fact(action="find_journal"),Fact(publisher=MATCH.publisher),salience=70)
  def action_pub(self,publisher):
    if not publisher == "All":
      res_t = []
      for i in self.temp_result:
        if publisher == dataset.iloc[i]['Publisher']:
          res_t.append(i)
      self.temp_result = res_t
      print(f'RULE (Publisher->{publisher}); got {len(res_t)} documents')
  
  # function for key_words
  @Rule(Fact(action="find_journal"),Fact(key_words=MATCH.key_words),salience=60)
  def action_kx(self,key_words):
    self.result = getQResults(key_words, self.temp_result)

  def getResults(self):
    return self.result

  

In [None]:
engine = JournalExtractor()
engine.setAtt("All","10","All","cancer nature")
#print(engine.subject)
engine.reset()
engine.run()
ans = engine.getResults()

print(ans)

for jour in ans:
  print()
  print(dataset.iloc[jour]['Title'])
  print(dataset.iloc[jour]['Subject'])
  print(dataset.iloc[jour]['Cites / Doc. (2years)']/100.0)
  print(dataset.iloc[jour]['Scope'])
  print()

# Flask Application

In [16]:
!pip install flask-ngrok

Collecting flask-ngrok
  Downloading https://files.pythonhosted.org/packages/af/6c/f54cb686ad1129e27d125d182f90f52b32f284e6c8df58c1bae54fa1adbc/flask_ngrok-0.0.25-py3-none-any.whl
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [23]:
from flask_ngrok import run_with_ngrok
from flask import render_template,request
from flask import Flask

app = Flask(__name__)
run_with_ngrok(app)   #starts ngrok when the app is run

@app.route("/")
def give_query():
    return render_template("index.html")

@app.route('/results',methods=["POST"])
def get_result():
  subj = request.form.get('subj')
  keyw = request.form.get('keyw')
  i_fact = request.form.get('i_fact')
  publ = request.form.get('publ')

  engine = JournalExtractor()
  engine.setAtt(subj,i_fact,publ,keyw)
  #print(engine.subject)
  engine.reset()
  engine.run()
  ans = engine.getResults()

  result_s = []

  for jour in ans:
    single_e = []
    single_e.append(dataset.iloc[jour]['Title'])
    single_e.append(dataset.iloc[jour]['Link'])
    single_e.append(dataset.iloc[jour]['Scope'])
    single_e.append(dataset.iloc[jour]['Cites / Doc. (2years)']/100.0)
    single_e.append(dataset.iloc[jour]['Coverage'])
    result_s.append(single_e)
  

  return render_template("result.html",q=keyw,res=result_s) 
  
def home():
    return "<h1>Running Flask on Google Colab!</h1>"

app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


INFO:werkzeug: * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://41bb7872a3c6.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


INFO:werkzeug:127.0.0.1 - - [22/Apr/2021 14:50:45] "[37mGET / HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [22/Apr/2021 14:50:49] "[37mGET / HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [22/Apr/2021 14:50:50] "[37mGET /static/master.css HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [22/Apr/2021 14:50:50] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [22/Apr/2021 14:50:52] "[37mGET / HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [22/Apr/2021 14:50:52] "[37mGET /static/master.css HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [22/Apr/2021 14:50:53] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


--Finding required Journals--


INFO:werkzeug:127.0.0.1 - - [22/Apr/2021 14:51:09] "[37mPOST /results HTTP/1.1[0m" 200 -


RULE (Subject->Chemistry); got 460 documents
RULE (Impact factor->1); got 302 documents
['water']
