In [7]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [8]:
# Getting patent Info 

def _meta_content(soup, attrs):
    """Return the ``content`` of the first ``<meta>`` tag matching ``attrs``, or None."""
    tag = soup.find("meta", attrs=attrs)
    return tag.get("content") if tag is not None else None


def get_patent_info_google_patents(patent_number, timeout=30):
    """Scrape bibliographic metadata for a patent from its Google Patents page.

    Args:
        patent_number: Patent number as used in the Google Patents URL
            (for example ``"US10666504"``).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with the keys ``"Title"``, ``"Publication Date"``,
        ``"Inventors"`` (list of names), ``"Application Number"`` and
        ``"PDF URL"``. A scalar field whose ``<meta>`` tag is absent from the
        page is ``None``. An empty dict is returned when the page cannot be
        fetched, so callers can test the result for truthiness.
    """
    google_patents_url = f"https://patents.google.com/patent/{patent_number}"
    try:
        response = requests.get(google_patents_url, timeout=timeout)
        # Error pages carry none of the expected meta tags, so stop early.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Could not fetch {google_patents_url}: {exc}")
        return {}

    soup = BeautifulSoup(response.text, 'html.parser')
    inventors = soup.find_all("meta", attrs={"name": "DC.contributor", "scheme": "inventor"})

    return {
        "Title": _meta_content(soup, {"name": "DC.title"}),
        "Publication Date": _meta_content(soup, {"name": "DC.date", "scheme": "issue"}),
        # Skip inventor tags that carry no content instead of raising KeyError.
        "Inventors": [inventor["content"] for inventor in inventors if inventor.get("content")],
        "Application Number": _meta_content(soup, {"name": "citation_patent_application_number"}),
        "PDF URL": _meta_content(soup, {"name": "citation_pdf_url"}),
    }

# Processing to get country code

def get_country_code_from_patent_number(patent_number, country_codes):
    """Split the two-letter office prefix off a patent number and look it up.

    Args:
        patent_number: Patent number whose first two characters are the
            country / office code.
        country_codes: Mapping from upper-case two-letter code to name.

    Returns:
        A ``(code, country_name)`` tuple. ``code`` is the upper-cased prefix;
        ``country_name`` is ``"Country not found"`` when the prefix is not a
        key of ``country_codes``.
    """
    # The lookup table is keyed in upper case, so normalise the prefix;
    # otherwise "us10666504" would not be recognised.
    code = patent_number[:2].upper()
    country_name = country_codes.get(code, "Country not found")

    return code, country_name

# Preprocessing Patent Number

def process_patent_number(patent_number):
    """Return ``patent_number`` with every character that is not an ASCII letter or digit removed."""
    unwanted_characters = re.compile(r'[^a-zA-Z0-9]')
    cleaned_number = unwanted_characters.sub('', patent_number)
    return cleaned_number

In [9]:
# Country Codes for Patents
#
# Maps the two-letter prefix of a patent number to the name of the issuing
# country or regional office. get_country_code_from_patent_number() looks up
# the first two characters of a patent number in this table and falls back to
# "Country not found" for prefixes that are not listed here.

country_codes = {
    "AM": "Armenia",
    "AP": "African Regional Industrial Property Organization",
    "AR": "Argentina",
    "AT": "Austria",
    "AU": "Australia",
    "BA": "Bosnia and Herzegovina",
    "BE": "Belgium",
    "BG": "Bulgaria",
    "BR": "Brazil",
    "BY": "Belarus",
    "CA": "Canada",
    "CH": "Switzerland",
    "CL": "Chile",
    "CN": "China",
    "CO": "Colombia",
    "CR": "Costa Rica",
    "CS": "Czechoslovakia",
    "CU": "Cuba",
    "CY": "Cyprus",
    "CZ": "Czech Republic",
    "DD": "German Democratic Republic",
    "DE": "Germany",
    "DK": "Denmark",
    "DO": "Dominican Republic",
    "DZ": "Algeria",
    "EA": "Eurasian Patent Organization",
    "EC": "Ecuador",
    "EE": "Estonia",
    "EG": "Egypt",
    "EM": "European Union Intellectual Property Office",
    "EP": "European Patent Office",
    "ES": "Spain",
    "FI": "Finland",
    "FR": "France",
    "GB": "United Kingdom",
    "GC": "Gulf Cooperation Council",
    "GE": "Georgia",
    "GR": "Greece",
    "GT": "Guatemala",
    "HK": "The Hong Kong Special Administrative Region of the People’s Republic of China",
    "HN": "Honduras",
    "HR": "Croatia",
    "HU": "Hungary",
    "ID": "Indonesia",
    "IE": "Ireland",
    "IL": "Israel",
    "IN": "India",
    "IS": "Iceland",
    "IT": "Italy",
    "JO": "Jordan",
    "JP": "Japan",
    "KE": "Kenya",
    "KG": "Kyrgyzstan",
    "KR": "Korea (South)",
    "KZ": "Kazakhstan",
    "LT": "Lithuania",
    "LU": "Luxembourg",
    "LV": "Latvia",
    "MA": "Morocco",
    "MC": "Monaco",
    "MD": "Republic of Moldova",
    "ME": "Montenegro",
    "MN": "Mongolia",
    "MO": "Macao",
    "MT": "Malta",
    "MW": "Malawi",
    "MX": "Mexico",
    "MY": "Malaysia",
    "NI": "Nicaragua",
    "NL": "Netherlands",
    "NO": "Norway",
    "NZ": "New Zealand",
    "OA": "African Intellectual Property Organization",
    "PA": "Panama",
    "PE": "Peru",
    "PH": "Philippines",
    "PL": "Poland",
    "PT": "Portugal",
    "RO": "Romania",
    "RS": "Serbia",
    "RU": "Russian Federation",
    "SA": "Saudi Arabia",
    "SE": "Sweden",
    "SG": "Singapore",
    "SI": "Slovenia",
    "SK": "Slovakia",
    "SM": "San Marino",
    "SU": "Soviet Union (USSR)",
    "SV": "El Salvador",
    "TH": "Thailand",
    "TJ": "Tajikistan",
    "TN": "Tunisia",
    "TR": "Turkey",
    "TT": "Trinidad and Tobago",
    "TW": "Chinese Taipei",
    "UA": "Ukraine",
    "US": "United States of America",
    "UY": "Uruguay",
    "UZ": "Uzbekistan",
    "VN": "Viet Nam",
    "WO": "World Intellectual Property Organization (WIPO)",
    "YU": "Yugoslavia / Serbia and Montenegro",
    "ZA": "South Africa",
    "ZM": "Zambia",
    "ZW": "Zimbabwe"
}

# Prefixes that find_research_points() scores with the higher patent impact
# factor; every other prefix gets the lower factor.
# NOTE(review): "EM" is listed above as the European Union Intellectual
# Property Office, while "EP" (European Patent Office) is not in this list --
# confirm that "EM" rather than "EP" is really intended.
good_country_codes = ["US","EM","RU","KR","AU","JP"]

In [10]:
# Getting Journal Name and Authors From DOI number


def get_journal_and_authors_from_doi(doi, timeout=30):
    """Look up a DOI on the Crossref REST API.

    Args:
        doi: DOI to look up (for example ``"10.1177/03611981211064994"``).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A ``(journal_name, author_names)`` tuple. ``journal_name`` is the
        first ``container-title`` entry, or ``None`` when the record has
        none. ``author_names`` is a list of ``"given family"`` strings and is
        empty when the record lists no authors. ``(None, [])`` is returned
        when the request fails or the response cannot be parsed.
    """
    url = f"https://api.crossref.org/works/{doi}"

    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Error 2. Status code: {response.status_code}")
            return None, []

        message = response.json().get('message') or {}

        container_titles = message.get('container-title') or []
        journal_name = container_titles[0] if container_titles else None

        author_names = []
        for author in message.get('author') or []:
            # Records may omit 'given' or 'family' (single-name or
            # organisational authors); use what is there rather than
            # discarding the whole record.
            name_parts = [author.get('given'), author.get('family')]
            full_name = " ".join(part for part in name_parts if part) or author.get('name')
            if full_name:
                author_names.append(full_name)
        return journal_name, author_names
    except (requests.RequestException, ValueError, AttributeError, TypeError):
        # Network failure, invalid JSON, or an unexpectedly shaped payload.
        print("Error 1.")
        return None, []
    
# Getting the Link of Journal Data page from main ScimagoJR page

from bs4 import BeautifulSoup
from urllib.parse import quote_plus

def search_page(journal_name, timeout=30):
    """Find the relative link to a journal's page via the ScimagoJR search.

    Args:
        journal_name: Journal title to search for; ``None`` or a blank string
            short-circuits to ``None``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``href`` of the first link inside the ``search_results`` div, or
        ``None`` when there is no journal name, the request fails, the status
        code is not 200, or the page contains no result link.
    """
    if not journal_name or not journal_name.strip():
        return None

    search_name = quote_plus(journal_name.strip())
    search_url = f"https://www.scimagojr.com/journalsearch.php?q={search_name}"
    try:
        r = requests.get(search_url, timeout=timeout)
    except requests.RequestException as exc:
        print(journal_name, f"Request failed: {exc}")
        return None

    if r.status_code != 200:
        print(journal_name, 'Page not found 0')
        return None

    soup = BeautifulSoup(r.content, 'html5lib')
    # Check each lookup step explicitly: a missing results div used to raise
    # AttributeError, which the former `except TypeError` did not catch.
    results = soup.find('div', class_="search_results")
    first_link = results.find("a") if results is not None else None
    if first_link is None or not first_link.get('href'):
        print(journal_name, "No results were found")
        return None
    return first_link['href']

#Getting Quartile from Journal Data

import requests
from bs4 import BeautifulSoup
import pandas as pd

def quartile_list(search_result, timeout=30):
    """Read the most recent quartile rating(s) from a ScimagoJR journal page.

    Args:
        search_result: Relative link to the journal page as returned by
            ``search_page``; ``None`` short-circuits to ``None``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A NumPy array with the unique ``Quartile`` values taken from the
        latest ``Year`` row of every ``Category`` in the quartile table, or
        ``None`` when there is no link, the page cannot be fetched, or the
        page has no quartile table.
    """
    # Local import keeps this function self-contained; StringIO lets pandas
    # parse the HTML that was already downloaded.
    from io import StringIO

    if search_result is None:
        return None

    result_url = f'https://www.scimagojr.com/{search_result}'
    try:
        response = requests.get(result_url, timeout=timeout)
        # Without this call the HTTP error handler below could never trigger.
        response.raise_for_status()
    except requests.RequestException:
        print(result_url, "Page not found")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    quartile_th = soup.find('th', string='Quartile')
    if not quartile_th:
        print(result_url, "No quartile found on page")
        return None
    if not quartile_th.find_parent('table'):
        print(result_url, "No quartile category found")
        return None

    # Parse the response body we already have instead of letting pandas
    # download the same URL a second time.
    df = pd.read_html(StringIO(response.text), match='Quartile')[0]
    latest_rating_indices = df.groupby('Category').Year.agg('idxmax')
    latest_ratings = df.loc[latest_rating_indices]
    return latest_ratings['Quartile'].unique()

# Getting DOI data from Google Spreadsheet

import gspread
from oauth2client.service_account import ServiceAccountCredentials
import numpy as np
def _strip_doi_prefix(value):
    """Reduce a DOI cell to the bare DOI by dropping a resolver URL or ``doi:`` prefix."""
    if not isinstance(value, str):
        # Non-text cells cannot carry a prefix; pass them through untouched.
        return value
    return re.sub(
        r'^\s*(?:https?://(?:www\.|dx\.)?doi\.org/?|doi:)\s*',
        '',
        value,
        flags=re.IGNORECASE,
    ).strip()


def get_doi_list_from_spreadsheet(sheetName):
    """Read the DOIs that still need processing from a Google spreadsheet.

    The first worksheet of ``sheetName`` is loaded into a DataFrame. Rows
    whose ``CATEGORY`` cell is non-empty are counted as done, and that many
    leading entries of the ``DOI NUMBER`` column are skipped.
    NOTE(review): this assumes the processed rows form a contiguous block at
    the top of the sheet -- confirm against how the sheet is filled in.

    Args:
        sheetName: Title of the spreadsheet to open.

    Returns:
        A ``(doi_list, doi_done)`` tuple: the remaining DOIs with any
        ``https://doi.org/``-style or ``doi:`` prefix removed, and the number
        of rows already categorised.
    """
    scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
    credentials = ServiceAccountCredentials.from_json_keyfile_name("facultyrecruitmentupgradation-5e06eb0edef3.json", scope)

    gc = gspread.authorize(credentials)
    worksheet = gc.open(sheetName).sheet1

    doi_dataframe = pd.DataFrame(worksheet.get_all_records())
    categories = np.array(doi_dataframe['CATEGORY']).tolist()
    doi_done = sum(1 for category in categories if category != '')

    pending_dois = np.array(doi_dataframe['DOI NUMBER']).tolist()[doi_done:]
    doi_list = [_strip_doi_prefix(doi) for doi in pending_dois]
    return doi_list,doi_done
# Presumably the title of the Google spreadsheet this notebook works on.
# NOTE(review): no call visible in this notebook reads this variable; it looks
# intended as the `sheetName` argument of get_doi_list_from_spreadsheet() and
# enter_into_sheet() -- confirm.
sheetName = "ApplicationsData"


# Post Processing Incorrect Entries

def post_process_invalid_inputs(Journal_name,doi):
    """Shorten a journal title to the text in front of its first colon.

    Returns ``(shortened_name, doi)`` for a non-empty ``Journal_name`` and
    ``(None, None)`` otherwise.
    """
    if not Journal_name:
        return None, None

    # Everything from the first ':' onwards is dropped.
    leading_part = Journal_name.partition(':')[0]
    return leading_part, doi
    

# Writing into Google Spreadsheet 
  
def enter_into_sheet(df,sheetName,start):
    """Replace a run of rows in ``Sheet1`` of a Google spreadsheet with ``df``.

    ``len(df)`` rows starting at row ``start`` are deleted, then the rows of
    ``df`` (every cell converted to ``str``) are inserted at ``start``.

    Args:
        df: DataFrame whose rows are written, in positional order.
        sheetName: Title of the spreadsheet to open.
        start: 1-based row number at which the replacement begins.
    """
    n = len(df)
    if n == 0:
        # Nothing to write; skipping also avoids a delete_rows call whose end
        # row would lie before its start row.
        return

    scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
    credentials = ServiceAccountCredentials.from_json_keyfile_name('facultyrecruitmentupgradation-5e06eb0edef3.json', scope)
    gc = gspread.authorize(credentials)
    spreadsheet = gc.open(sheetName)

    worksheet = spreadsheet.worksheet('Sheet1')
    worksheet.delete_rows(start, start + n - 1)

    # Positional access (iloc) works for any index; label access with
    # range(n) only worked when the frame had the default 0..n-1 index.
    data_sheet = [list(map(str, np.array(df.iloc[i]).tolist())) for i in range(n)]
    worksheet.insert_rows(data_sheet, start)


# Converting Quartile Category to score

def quartile_to_score(quartile):
    """Convert quartile label(s) into a score using the best quartile present.

    Args:
        quartile: ``None``, a single label string such as ``"Q1"``, or a
            collection of labels (list, tuple, NumPy array, ...). For a
            string the labels are matched as substrings.

    Returns:
        ``2`` if Q1 is present, otherwise ``1`` for Q2, otherwise ``0.25``
        for Q3, and ``0`` for anything else (including ``None`` and empty
        input).
    """
    if quartile is None:
        return 0

    if isinstance(quartile, str):
        labels = quartile
    else:
        try:
            # Copy into a plain list: truth-testing or `==` on a multi-element
            # NumPy array raises ValueError, membership on a list does not.
            labels = [str(label) for label in quartile]
        except TypeError:
            # Not iterable, so it cannot contain any quartile label.
            return 0

    # Ordered best-first so the highest-ranked quartile wins.
    for label, score in (('Q1', 2), ('Q2', 1), ('Q3', 0.25)):
        if label in labels:
            return score
    return 0


In [11]:
# Getting book points 

def get_books_points(author_name, api_key, timeout=30):
    """Score the books the Google Books API returns for an author.

    Every returned volume that lists at least one author and at least one
    ``industryIdentifiers`` entry contributes ``factor / number_of_authors``,
    where ``factor`` is 1 for exactly one identifier entry and 5 otherwise.

    Args:
        author_name: Name used in the ``inauthor:`` search query.
        api_key: Google Books API key sent as the ``key`` query parameter.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The summed score as a number. ``0`` is returned when no books are
        found or the request fails, so the result can always be added to the
        other point totals.
    """
    base_url = "https://www.googleapis.com/books/v1/volumes"
    params = {
        "q": f"inauthor:{author_name}",
        "key": api_key
    }
    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        # Report HTTP errors as errors rather than as "No books found".
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"An error occurred: {e}")
        return 0

    books = data.get('items')
    if not books:
        print(f"No books found by {author_name}")
        return 0

    book_sum = 0
    for book in books:
        volume_info = book.get('volumeInfo', {})
        no_of_authors = len(volume_info.get('authors', []))
        no_of_identifiers = len(volume_info.get('industryIdentifiers', []))
        if no_of_identifiers <= 0 or no_of_authors <= 0:
            continue
        book_factor = 1 if no_of_identifiers == 1 else 5
        contribution = 1 / no_of_authors
        book_sum += contribution * book_factor
    return book_sum




In [12]:
# Research points

def find_research_points(patent_list, doi_list, author_name ):
    """Compute an author's total research points from papers, patents and books.

    Paper points: for every DOI, the journal's quartile score divided by the
    number of authors. Patent points: for every patent, an impact factor
    (5 when the office prefix is in ``good_country_codes``, otherwise 3)
    divided by the number of inventors. Book points: the result of
    ``get_books_points`` for ``author_name``.

    Args:
        patent_list: Patent numbers; non-alphanumeric characters are removed
            before the lookup.
        doi_list: DOIs of the author's papers.
        author_name: Author name used for the book search.

    Returns:
        The sum of paper, patent and book points. The three partial totals
        are also printed.
    """
    # Local import keeps the environment lookup below self-contained.
    import os

    # Paper Points
    # paper_contribution = 1/number of authors
    paper_points = 0
    for doi in doi_list:
        journal_name, author_names = get_journal_and_authors_from_doi(doi)
        quartile = quartile_list(search_page(journal_name))

        # `is None` rather than `== None`: comparing a NumPy array with `==`
        # is element-wise and cannot be used as an `if` condition.
        if quartile is None:
            # Retry once with the title cut at its first colon. Doing this
            # only on failure avoids repeating a search that already worked.
            journal_name = post_process_invalid_inputs(journal_name, doi)[0]
            quartile = quartile_list(search_page(journal_name))

        number_of_auth = len(author_names)
        if number_of_auth == 0:
            # No author data: the contribution is undefined, so count nothing.
            continue

        if quartile is not None and not isinstance(quartile, str):
            # A plain list is safe to truth-test when several quartiles apply.
            quartile = list(quartile)
        paper_contribution = 1 / number_of_auth
        paper_points += quartile_to_score(quartile) * paper_contribution

    # Patent Points
    good_patent_impact = 5
    bad_patent_impact = 3
    patent_points = 0
    for patent_number in patent_list:
        patent_number = process_patent_number(patent_number)
        patent_info = get_patent_info_google_patents(patent_number)
        if not patent_info:
            continue
        number_of_inventors = len(patent_info.get('Inventors') or [])
        if number_of_inventors == 0:
            # No inventor data: avoid dividing by zero.
            continue
        inventor_contribution = 1 / number_of_inventors
        country_code = get_country_code_from_patent_number(patent_number, country_codes)[0]
        if country_code in good_country_codes:
            patent_points += good_patent_impact * inventor_contribution
        else:
            patent_points += bad_patent_impact * inventor_contribution

    # Book Points
    # The API key is read from the environment instead of being hardcoded in
    # the notebook. When GOOGLE_BOOKS_API_KEY is unset the key is None, which
    # requests leaves out of the query string -- assumes the Books API accepts
    # unauthenticated public searches; confirm quota needs.
    api_key = os.environ.get("GOOGLE_BOOKS_API_KEY")
    # `or 0` guards against a None result so the sum below cannot fail.
    book_points = get_books_points(author_name, api_key) or 0
    print('Paper Points : ',paper_points,"\nBook Points : ",book_points,"\nPatent Points : ",patent_points)
    research_points = paper_points + patent_points + book_points
    return research_points
    
# Example run: one patent number, one DOI and one author name are scored with
# find_research_points(). The function prints the per-category breakdown
# itself; the total it returns is printed here.
patent_list = ["US10666504"]
doi_list = ["10.1177/03611981211064994"]
auth_name = "Arup Lal Chakraborty"
print(find_research_points(patent_list, doi_list, auth_name))

Transportation Research Record: Journal of the Transportation Research Board Page not found 0
Paper Points :  0.25 
Book Points :  1.0 
Patent Points :  1.25
2.5
