In [52]:
import pandas as pd
import numpy as np
import requests
import datetime
import os
from datetime import datetime as dtime
from ast import literal_eval
import time
from pyaltmetric import Altmetric

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [53]:
# Alle Zeilen und Spalten des DataFrames anzeigen
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', None)
# np.set_printoptions(threshold=sys.maxsize)

In [54]:
def getSSdata(paperId):
  
  #######################
  #     SEND REQUEST    #
  #######################
  try:
    api_key = 'rGUKYOEpCP2FKQK88CLuB1izZBvDiQwA5SsSZ5vo'
    url = "https://api.semanticscholar.org/graph/v1/paper/"+paperId.replace(' ', '').strip()+"?fields=title,referenceCount,citationCount,influentialCitationCount,externalIds,authors.hIndex,authors.paperCount,authors.citationCount,authors.affiliations,authors.paperCount,authors.name,fieldsOfStudy,publicationTypes,publicationDate,year,journal,isOpenAccess"
    req = requests.get(url=url, params={'x-api-key': api_key}).json()

    # Wait 5 Minutes and retry if API-Server sends 'Too Many Requests'
    while req.get('message') == 'Too Many Requests':
      print('[Too Many Requests]: Waiting 5 minutes.')
      time.sleep((60*5)+5) # Wait 5 Minutes + 5 Extra-seconds for tolerance
      req = requests.get(url=url, params={'x-api-key': api_key}).json()

    ############################
    #     PROCESS RESPONSE     #
    ############################
    df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in req.items() ]));

    # Features aus verschachteltem JSON-Array extrahieren und in separate Spalten teilen
    for col in ['DOI', 'CorpusId']:
      try:
        df[col] = df.loc[col].externalIds
      except:
        df[col] = np.nan
    
    df['paperId'] = df['paperId'].iloc[0]
    df['title'] = df['title'].iloc[0]
    df = df[df['externalIds'].isna()]
    df['journal_name'] = df.loc['name'].journal if df.loc['name'].journal else np.nan

    # Belegte Seiten des Journals in dem das Paper veröffentlicht wurde berechnen
    try:
        # Manche Angaben werden in ihrer Schreibweise verkürzt... z.B.: 153-57, dies soll wie 153-157 behandelt werden, da sonst negative Zahlen bei der folgenden Berechnung entstehen.
        df['journal_pages_occupiedCount'] = extractOccupiedJournalPagesCount(df.loc['pages'].journal)
    except:
        df['journal_pages_occupiedCount'] = 0

    # MV filtern
    df = df[~df['authors'].isna()][['DOI', 'CorpusId', 'paperId', 'title', 'referenceCount', 'citationCount', 'influentialCitationCount', 'fieldsOfStudy', 'publicationTypes', 'journal_name', 'journal_pages_occupiedCount', 'isOpenAccess', 'publicationDate', 'year', 'authors']]

    # Author Name extrahieren
    df['author_names'] = df['authors'].apply(lambda x: x['name'])
    df['author_names'] = str(df['author_names'].to_list())
    df['author_names'] = df['author_names'].apply(literal_eval)

    # # Author Affiliation extrahieren
    # df['authors_affiliations'] = df['authors'].apply(lambda x: x['affiliations'])
    # df['authors_affiliations'] = str(df[~df['authors_affiliations'].isna()]['authors_affiliations'].to_list())
    # df['authors_affiliations'] = df['author_affiliations'].apply(literal_eval)

    # h-Index aller Autoren extrahieren
    df['hIndex'] = df['authors'].apply(lambda x: x['hIndex'])

    # Avg, min, max h-Index über alle Autoren berechnen, runden und in Ganzzahl umwandeln
    df['authors_sum_hIndex'] = df['hIndex'].sum().round(0).astype(int)
    # df['authors_mean_hIndex'] = df['hIndex'].mean().round(0).astype(int)
    # df['authors_max_hIndex'] = df['hIndex'].max().round(0).astype(int)
    # df['authors_min_hIndex'] = df['hIndex'].min().round(0).astype(int)

    # paperCount aller Autoren extrahieren
    df['author_paperCount'] = df['authors'].apply(lambda x: x['paperCount'])

    # Avg, min, max paperCount über alle Autoren berechnen, runden und in Ganzzahl umwandeln
    df['authors_sum_paperCount'] = df['author_paperCount'].sum().round(0).astype(int)
    # df['authors_mean_paperCount'] = df['author_paperCount'].mean().round(0).astype(int)
    # df['authors_max_paperCount'] = df['author_paperCount'].max().round(0).astype(int)
    # df['authors_min_paperCount'] = df['author_paperCount'].min().round(0).astype(int)

    # citationCount aller Autoren extrahieren
    df['author_citationCount'] = df['authors'].apply(lambda x: x['citationCount'])

    # Avg, min, max citationCount über alle Autoren berechnen, runden und in Ganzzahl umwandeln
    df['authors_sum_citationCount'] = df['author_citationCount'].sum().round(0).astype(int)
    # df['authors_mean_citationCount'] = df['author_citationCount'].mean().round(0).astype(int)
    # df['authors_max_citationCount'] = df['author_citationCount'].max().round(0).astype(int)
    # df['authors_min_citationCount'] = df['author_citationCount'].min().round(0).astype(int)

    df['authorsCount'] = len(df['author_names'])

    # Redundante bzw. irrelevante Features entfernen
    df = df.iloc[0][['DOI', 'CorpusId', 'paperId', 'title', 'author_names', 'isOpenAccess', 'publicationTypes', 'journal_name', 'journal_pages_occupiedCount', 'publicationDate', 'year', 'fieldsOfStudy', 'referenceCount', 'citationCount', 'authorsCount', 'authors_sum_hIndex', 'authors_sum_paperCount', 'authors_sum_citationCount']]

    # Datentyp in Ganzzahl umwandeln
    df['referenceCount'] = df['referenceCount'].astype(int)
    df['citationCount'] = df['citationCount'].astype(int)
    # df['influentialCitationCount'] = df['influentialCitationCount'].astype(int)
    if df['year']:
      df['year'] = df['year'].astype(int)

    # print('- Collected Semantic Scholar metadata.')
    return df
  except Exception as e:
    print(e)
    return pd.DataFrame()

### Get List of Nobel Prize winners <br> see: https://www.nobelprize.org/about/api-examples/

In [55]:
base_url = 'http://api.nobelprize.org/v1/prize.json'
search_query = '?year=1900&yearto='+str(datetime.datetime.now().year)
# search_query = '?category=medicine&year=1990&yearto=1994'

In [56]:
nobelprize_data = requests.get(base_url+search_query).json()

In [57]:
nobelprize_df = pd.DataFrame(nobelprize_data.get('prizes'))

In [58]:
def extractAuthorNames(laureates_col):
    laureates = []
    if laureates_col:
        try:
            for i in laureates_col:
                surname = i.get('surname')
                firstname = i.get('firstname')
                if surname:
                    full_name = firstname[0] + '. ' + surname
                    laureates.append(full_name)
                elif surname==None:
                    laureates.append(firstname)
                return laureates
        except:
            pass

In [59]:
nobelprize_df['laureates'] = nobelprize_df['laureates'].apply(lambda x: extractAuthorNames(x))

In [60]:
nobelprize_df.drop('overallMotivation', axis=1, inplace=True)

In [61]:
# List of Nobelprize winners
nobelprize_laureates = nobelprize_df.explode('laureates')['laureates'].unique()

In [62]:
# nobelprize_laureates

In [63]:
def isNobelPrizeLaureate(authors):
    # year = df['year']
    # fos = df['fieldsOfStudy']
    isNobelPrizeLaureate = False

    for author in authors:
        if author.lower() in [str(item).lower() for item in nobelprize_laureates]:
            isNobelPrizeLaureate = True
    
    # print('- Checked if paper has Nobel Prize Laureate as contributer.')
    return isNobelPrizeLaureate

In [64]:
def getNobelPrizeLaureateCount(authors):
    nobelPrizeLaureateCount = 0

    for author in authors:
        if author.lower() in [str(item).lower() for item in nobelprize_laureates]:
            nobelPrizeLaureateCount+=1
    
    # print('- Checked if paper has Nobel Prize Laureate as contributer.')
    return nobelPrizeLaureateCount

In [65]:
# ss_df['hasNobelPrizeLaureate'] = hasNobelPrizeLaureate(ss_df)

### Altmetric

In [66]:
def getAMdata(paper_doi):

    try:
        # Altmetric-Objekt erstellen
        a = Altmetric()

        # In Altmetric-DB nach Paper mit entsprechender DOI suchen (API antwortet mit JSON-Array)
        altmetric_json_data = a.doi(paper_doi)

        # DataFrame erstellen und in geeignetes Format transformieren
        df = pd.DataFrame.from_dict(altmetric_json_data.items()).transpose()

        # DataFrame filtern
        df.columns = df.iloc[0]

        # Altmetric Score aufrunden
        df = df.iloc[1]
        df['score'] = np.ceil(df['score']).astype(int)

        # Alle Spalten extrahieren, die 'count' oder/und 'score' enthalten (z.B. gibt es nicht bei jedem Paper die Information wie viele Reddit, Wikipedia etc. Einträge vorhanden sind)
        col_list = []
        for col in df.keys():
            if any(x in col for x in ['count', 'score', 'doi']):
                col_list.append(col)

        # DataFrame nach den extrahierten Spalten filtern
        df = df[col_list]

        # Spalten umbennenen, damit zum Schluß nach dem Konkatenieren leichter nachvollzogen werden kann, aus welcher Quelle die Daten stammen.
        for col in df.keys():
            if col in col_list:
                df.rename({col:'altmetric_'+col}, inplace=True)

        # print('- altmetric_score: ' + str(df['altmetric_score']))
        return df
    except Exception as e:
        # return pd.DataFrame()
        # print(e)
        print('---------- No Data found for <' + str(paper_doi) + '> ----------')
        pass

### Paperlist

In [67]:
hamza_paper_list = [
    # Agricultural and Food Sciences - Prizewinners                      (Hamza)
    '9151910efb402f695ec52f006c5628fb29791d49',
    'c1bb306d07f67d1989783ad32a636dfca44d1edf',
    '06c541de524d2352116ef3c6a0af318a1b83cf1c',
    '083a8f455e8bdf3c9d72ac81c8215db5b09def68',
    'b2406cc0373f521897a046108a48c21e7c08f9ca',
    '8e076cc26a432ca104dbcb592814405264817363',
    '42b777abf798e34df20df87a43d489af7fd4f2c2',
    '66b460039fdb0b87c82030cebd3cc224b2763da1',
    'e932c974074cc4a38f02fa7d674932eef81faf61',
    '613c5dae158288ec4aae6bca752ec1a56d6cdfbc',
    # Art - Citation Count
    '16b4fd36ff5ca603fed13a326054cd0373ea442c',
    '08f383a84d6d3f6f44c1a3f0296455baeea2930d',
    'ede81710759ae46afd4634bc847e4c01adeb000c',
    'e6ac191117f8e556f85a4b53debeb6acfa51381d',
    'ce1323ffc943f3ccccf7b9d0e74db4d57fc290cd',
    '3c9dc71914a48574811458ce0056831a0fa31f52',
    '405dea794fe583b910f8ff4b147d91350e4f33f3',
    '2c411a12f33f15451e1659a3435391962c0cc144',
    '55545d2c73cce5d3f2ad42f04b6cbcee5a2d4144',
    # Biology - Prizewinners
    '1a7af416aa387606d3645ff3b4513a3d5b95e3d8',
    '672def367c49ba4463825d5a771d6a5f48fefb99',
    '0107ab33dc0e27934ca11c826cc65852b1ea53fb',
    '105ddd9b36341cc5135c8fd7fd8dc4d621b758cf',
    'bdeb51634bca711d7bb0d31d168262279b74520f',
    # Biology - Citation Count
    'a411f6a0e6473137ac1a538f7cee65722fa3584f',
    'fd495d6cf7c3169bc58550fdf32be6e16e2800f8',
    '82e320e06b1c717b0d924d257aa7b6710f53a38e',
    '61a28477008aef4f7cabacdd6ec9f42e3b1d9214',
    '30eecc8a7b7346a5e0c3a6648b0e156faad3a786',
    # Business - Prizewinners
    'c619e6fbebab17037b6aff18890f79e7a0c85841',
    'bbf2d2cb15c0c6db231a09ad4f51e2b24b20b5dc',
    'e2d9ef600adcf0a2e8252777cf5e4d917e99b212',
    'c130a72466976096230a31c23f4201f13ce33ea5',
    # Business - Citation Count
    '12be757eabdaf30457b14e06b510a76d7c4e0328',
    '8f53f3d9b99b00a44b8a1554c01bde274dd1ebe9',
    'eb7c608c63e71dac1d0c45711ad52989c54ea1fc',
    '7bd9a25227ca4378b86a78994817988863a54b60',
    'e87c3be7aaa1c7ce91f94fda0815339ac02af787',
    'a5b65b05c765025b30503a3705a13a5c490c2fde',
    # Computer Science - Prizewinners
    '8964838cb60e31c13fda81e10fa75f476e10a126',
    # 'DOI:10.1145/323779.323751', NONE-TYPE
    '8854e412a9367a76deb2168407bb3aa065009abd',
    '07a152ad1c17b35396d8b372cbde16e89705c7ec',
    'ebefbee5e96fc2217d651266eb35a0759672e7ca',
    # Computer Science - Citation Count
    '8ae2c7b50bf64d8d647afad24d95c12a79656da3',
    # 'DOI:10.1007/978-1-84882-935-0', NONE-TYPE
    '7c72b917a38b09e6d3ab19d28a4344ba54edb6ae',
    # 'DOI:10.2307/2322693', NONE-TYPE
    'd2d36f50543c65594548953674c26c96295afdd5',
    # Chemistry - Prizewinners
    'b5b2a56539b2da468790b5d690efb7d345344e65',
    'f2259c97773f8e7f5502b588e275546dad142086',
    '58aae2c2c9b059124f7168af4611059d6c4fa224',
    '2e2414231b6c9c8e707d48c3820ceea0e12dc3c7',
    'e3fc26b64a26bc22336ec3e95c540f0042dbce61',
    # Chemistry - Citation Count
    '05b9e01b36be8f688cdbd7131b89920fa46ac048',
    '88e23044396b7c63e831ad0195b6184ea3a12097',
    'e0b46e54a742c077006418a7e132e8105371829f',
    'ea77a4ba7d6cab863de3e8c5e11fcfc850d51c02',
    'c5d7b0fafdd72f57f2b0bfdd0ce3608a2528b666',
    # Economics - Prizewinners
    'a09892f1d156cdc07532aec355bb55df18c5885e',
    'b35af322333811bd16eb5b569466ad76909c0a20',
    '2d28a52e59005b2d3e23c9366be880a960dc1eaf',
    '00167b90b8fbf57d22b661c32ee8fb39020af12f',
    'fbadde2bf6e2b657dc81c5453f920a979d6967f7',
    # Economics - Citation Count
    'b6d7e6a2763da443c3386edfe70bf46c07059da1',
    '72910077a29caf411dbb03148997c72b47e65ab0',
    'c04677651cb0545ccf080abeb0958fc907f70c4a',
    'a6462e546ad432ddbe3e7d9e5854f029204f57e6',
    '6732435ab6e0eb4e0d14662cf724f3be54807a35',
    # Education - Prizewinners
    '0494a3422778413abca5a53932d8344347c033e3',
    '9930b98abeb97374c73cf359cb61521b6ad3d535',
    'dc3e4d8f60dccde3e0ea99bc6ed768e14fc4eb3d',
    '2096ac46a56a1fbeb1f4008894c3ca62b34c4893',
    '5f9ce9c565eadc52ca79ec205f64c59e488d56f8',
    # Education - Citation Count
    '94e8a06db15e275e82b98529e475c6c17a342300',
    'cee6c4973596fb320af3d871bf1920c8bc376adc',
    'bc00faabf97a200b359ffbf221d293e20fe03a89',
    '8d108f98cd67a98b3f48dade99c2cd13028d28b9',
    'd8c69675f42bffffda030e61c2893deca92513fe'
]

In [68]:
maurice_paper_list = [
    # Engineering - Prizewinners (Charles Stark Draper Prize, ASME Medal)                                 (Maurice)
    '39aad6b860cab181a77ec32809f55d4ebb348335',
    '16b52fed612e06f0d2a038e981e81e18d0eea14a',
    'ce65f83ca6cbe47f96194ad0e992ec8b7f500bbc',
    '34e22b27e9ad9421f8fa541da3a28b9909823fc8',
    '392bca415565c3a38c52446c95d6a1e88ec49e1a',
    '5723cb47076163a974f002bdd03b001976ae8185',
    # Engineering - Citation Count
    '896172e1d3a5d8a06ce6ef0187f53f1cf3803755',
    'a905bb6b0da059d854548f7c34b0cddad9a3b529',
    '0b9193580334b1529287aed6a91d1221f09126dd',
    '989081b60ae3cf4ecc7a0f739038bfaf61ea9b9e',
    'f4b83189b475ec53a85ff99be51b7c3ac93c846d',
    # Environmental Science - Prizewinners (Tyler Prize for Environmental Achievement)
    '79b4b48c6481becde78bd6287386d6507dac27b1',
    'e334406eda2f5e5f9f4783b5b741d4d324b0ac64',
    '45c675bb104824664e6b5dba9ff65c5c58e39bb5',
    '9dc534fff352b413de81dfddfba9f4d0990f079c',
    'fdc354d08984f0b36be818b8a287a767cad727ed',
    # Environmental Science - Citation Count
    'a765ee092121aadc1ecc0c36a3240f9e9ea178e8',
    '88df709c39fd7b19104d7cf878c990da617192d1',
    'd571963a597742e6aa80f43db9446517501b3e20',
    '9f5407ba528658f641d236bf9d601c7b3ed81e2f',
    'c1e25afc8eba2f505f38c68843ee23d97470dbaa',
    'b49fcb19d397ad2120e5219f3d0aef33879abb6a',
    # Geography - Prizewinners (Vautrin Lud Prize)
    '0554e5abd44b7c0edad0575b8a327c32f443d942',
    '79d8c6b940ed9524f11a59ce891968d17b335ed5',
    '10f3a1d8ecf0df2993a3cf679ff31e01ce32cff7',
    '05eba6fe8603ae0a564022fb4fe9faed0454afd1',
    '7a9cce42bbc8c8ab6bd6196c1f5fabbf87f19e5b',
    # Geography - Citation Count
    'c0d6f96ffc156bab46fe3097f8a50de5a62eb9f4',
    '2bfd55d8612c872122499417b9de1c70cda9f562',
    '6aabdd609b09fbd1c9244a8db33155b3feaf604a',
    '310ba0bc8e14e96c1a0566b1dae57c77a960de12',
    '497e2542e47e4c0fe8b4051f13ea6e0c6fdd6bde',
    # Geology - Prizewinners (Vetlesen Prize, Wollaston Medal, Penrose Medal)
    '075d5f246c30a6ec5acf8f3f2ef415d20981fa81',
    'eaa7d6df21e3a097ab902e19ff2f2905aafb7ab3',
    '8d5fc8de8f5e05b1911c4dac9826e64575c97e22',
    'f5f465ae3fb26f4256c8005b667794e0a6377d31',
    '67f5af9e020107e032b26d3ea293e5c48b84ced9',
    # Geology - Citation Count
    'af8d81f43ea661762b5c1debda4320afe2c28265',
    '2693cac78b1de1ef98ac90b8b704525985359f60',
    '55cd56bab78f7e686810acb7dd244a8cf0c18ccf',
    '37c13877e62e988868293c19239fe8389a213c7d',
    '13a2ead81b81a2abefc31a25f2af26a374ca1cb2',
    # History - Prizewinners (Pulitzer Prize for History)
    '9d3556bf7d2bc0e43f37aae1117e1fb2ea1b8b3c',
    'd510f874940b5b7d9ced2196182119156c3a4aa6',
    '3cd456ad75f18addaf2e98e821854a7749f58dcd',
    'a756098d7ca6017a31192216a14d67d163c8f70b',
    'a899f63b436aa9d06141bb5ea8edc69f32cb8eb9',
    # History - Citation Count
    '84ed742c2ecda9f3960575b92ce84796d6061906',
    '796aaef0da73624ae5439d42465c1d7098a3f430',
    '4415d3f35bcc2525c03630924780b9f5ef06eacd',
    'df3d2f2c490ec5279b171189087ba7902fdd6603',
    'd801a133aa8717c34dcfd4ea86f3d5f91dbdb0fd',
    'e3df8744a7843a48b6591c1cf439db12a1257bd2',
    # Law - Prizewinners (War schwierig... Stockholm Prize in Criminology und zusätzliche Paper wie in Law - Citation Count und Anthony Bradley Prize von Edinburgh Law School)
    '96d7859d70bd6e9eeb4827f892592f117c49c1fd',
    '615f6a06bab74056c43997940cca02eaefe56721',
    '94abff3a3142809bd205254e34776647f83657d6',
    '1c26a95b13c69234a7f6426add92df5c53e035ea',
    'fbf58d40593b7207d3d5fe9068681e22a05d9d87',
    'be2774f0fa504ac0c9387645f7c79a93a804b8db',
    # Law - Citation Count
    '711a36f214aaa6438bcb7250e7b902992b3fb878',
    'e102c6fa4a720e32583e25822bb63d34174af1af',
    # 'Corpus ID: 32200690', NONE-TYPE
    '16cbfba326248c9051a97bce4d6edd67e95c6be8',
    'd3a0d52d8a0e94f8a7cec5d7d5db684024573537',
    # Linguistics - Prizewinners (E.W. Beth Dissertation Prize, Linguapax Prize)
    'ccfa90f05ddf7c5fe2ea83b64a58285eace8e135',
    '02fe377dc583cd10d0f5e042627f1e43c6531a8f',
    'dae0cf248c7f3482663baa55330de22662d39a80',
    'd6bb4f3b28e97c20d8c58af5cd9751fe1ef905d9',
    'fdcc6cdc5be85455e138c0d97f58e6084394a0e4',
    '26e408cd6a0213bcbed83290bcb2edfc7346c859',
    # Linguistics - Citation Count
    '3ac11ca7c2e3e81159c67d8e2f753574e72046c1',
    '7ba33b812df45064ab0ddda8059a9da4258ffb1e',
    '4bcac104568f8aef9d82abc76250c11e8a309297',
    'b1e48825c647aeeb9c37250a93e863fb4eb77124',
    '8ec7a7b28f5237fa429e380f266f33548f3cff59',
    # Materials Science - Prizewinners (Von Hippel Award)
    'd224a1027b47cece2982c400162e053192423b1b',
    'b1bde690f0500785e0fe93ca6e01300ae649aa9d',
    'fe297418763154fa4a17273011c11da2c58e42d2',
    '3e88d99a60d1ed795d8a07fe90587f2828132506',
    '19128a8a410bab123ea5ab69fa8a5efd0be2d1f7',
    # Materials Science - Citation Count
    '300f6e755d2e1fd1e3c42804c724f988b7b0e1a1',
    '49c4bc07b0047425117d89935c27aa4d22c5c174',
    'bb29163f8f6344695310348c66cec412acbf428b',
    '7ad70a564c4c8898f61eddbcecf9e90f71e9d4b6',
    'd0ed27e58992eb1d289d35f7b902e5b26688af38'
]

In [69]:
ramses_paper_list = [
    # Mathematics - Prizewinners                                     (Ramses)
    '37b0ec9dc369dcda843cf88755bf44522a11a7c9',
    'ece822593c903e234820c6c7da060654aacaed38',
    '23b18110e67223e6b17a530053dc677ba501c636',
    '7d2333730401777adad2f30afe6af807fc22e31c',
    '53be251f8eaf26027db63a46052b1d8a7dd86684',
    # Mathematics - Citation Count
    'c0cd4b4844c31a27a7900a754d0a91b160b00e55',
    'c9c852bc0e8774336734ff339c1592b2cf2d849f',
    '9d0c4389436d3ab8ed329dba8c58e8ac6737fd3b',
    '92d5f6f2d13484c688ca4c08c1279229ba266089',
    'e60bd78ca8621c3938f740947d985c2ea5ff5d67',
    # Medicine - Prizewinners
    'be75109902f5689f7114e9e0fa783a12ceaa9b3a',
    '9632ca098599970f1795dcc5d4d5ae5a8d2f182d',
    '29039649db9dc86bb6396b18c4f99fc525606dd9',
    '335ac605e1f46efe2e5e62d75b376c70cdbb59ec',
    'af55a840fb0c6e4004c4acfff588c5a7862251bd',
    # Medicine - Citation Count
    '951865c6d89e9e7822700dad42caf8bd29896d48',
    'c416071f485c1d220792c56682cd09a5443eced1',
    'f134abeaf9bfd41f29b97aec675ec31895bf541d',
    'a0b2f25195269092afe516ce68caec8dfad33ac9',
    '6365c7e578b3ba30b85560515f8b7956a2a915cd',
    # Philosophy - Prizewinners
    '59711cdc12249347d94ce81b4168c1363b0bf994',
    'a627cb00f25a916df243e16ea0ae7d0268f951ca',
    '3e7505d16145d3aa154c28caaab83d424b907a0d',
    '1796b1d9fdf737db18552dcc6b88dfda4ed03aca',
    'ef8cf4de553248d1ffe2e79a01a33265284187cc',
    # Philosophy - Citation Count
    'c19d65d5097d23cdf0bcd11f1c58c8924254b5d2',
    '7fe9b067babc0812c1f31b04626397111635fa9d',
    '72ef83b0d5bb0be184b7730949d6ef7241a8b02d',
    '69cfb48c45b59243d60342b796dbac35e9efd6bc',
    'a9ca9fe75c9cbe15754481a5baa6d85593c8fae0',
    # Physics - Prizewinners
    '7658efad8951194bc7c7e8a3709c5a30660bc893',
    'd0c9b48422ff2239477b183c2211e20eaea795ab',
    '8cec28449f2f21e79c46b0e51fe9f8669c5c500e',
    '286f60799abe6a47f7da040cb875de170eb56bb4',
    '1a4e35f65bb43146795b46ab4e3f525348349044',
    'e6ecd94a6256798f2cfaf3cfeda4906eb3bca1e7',
    # Physics - Citation Count
    '47552b2aa5f1137bc46b01daa91b9175837cc380',
    'e1d1e7dc2606b1d60cb85057ab7c5bbd52067661',
    '8823290166bf24efc7cfda4d251f959b5f5f8e4c',
    'b9bdcbf59cf6f65b60b47004cfa85f1eeb3bbad7',
    '060a3c0274fb07daeeb85aa46b191f862abd8a09',
    # Political Science - Prizewinners
    '97efee042acc9673a81e6fa106f3680d4a03852c',
    '2fadc276a5aec2c417337f6bf29c976bddd2f03f',
    '40a8e9a91a059277fbe9cfd12e9877013024fc75',
    '87392ddf2fd4102df9b3bf411edbeb83e065075d',
    '6074dc981d7f07b866039a7b856d51ce966331bc',
    # Political Science - Citation Count
    '1e863a09164d94eb1553bb0a100be974dfe904ff',
    '82baedbe57a232a83bea6d88f16d66cf4acdca64',
    # 'DOI:10.1201/9781482246582', NONE-TYPE
    '74d0e14c7aa617a43875d6cef35c4622d1305949',
    '4323935e72706086995184a39c7f21e66eebe200',
    # Psychology - Prizewinners
    '9e607237e70fa3129da500ea1137a0d5d8633ddb',
    'bc9e4479b0ff89a10ca0e1e88a6b5af8ed3e1354',
    '36b332ac960663cea94abc5dbf515d831bfaff36',
    'c7fbea9d039e93fc721f0cd1538b5ac04b4f0e73',
    'f63d07713b1d7ce3cd8110e7ba839e2d265e9b59',
    # Psychology - Citation Count
    '98882d201890680e156fef2b111630f401168092',
    'a5bca3e6a7fa2135e566f040926a9687cc3ae7f9',
    'bbbff45c27dccd114818ef334075db0a34b0e4fa',
    '6ea24fcbbe780ce565251757b0a8cec5c753b90d',
    'c15e53dffbfe2c6283238e7eeade33256808dbbe',
    # Sociology - Prizewinners
    '4f4eefbc67c748af2eafeeae3466bb9b5b23ebe1',
    '3e99c32b4e16d72ea22e9121a43b0b2795623da5',
    'bc6afc8a0406620d51bf897f5cd1114d0edf13b5',
    '1f6a9d39bd40e131983aaf03f384fd4f10151de8',
    '6586435c044fdf48a74e86dea02902dc235038cf',
    # Sociology - Citation Count
    '96131e374caf8b0b4cf7e205246d9feeb69d09a4',
    '6e29a07906289012958cfa8d05d466cea1f050a1',
    '61090d286c2b456efbacfbbc718b1365a80ad39d',
    'ee1f75f1978fb173cef9909aea610fa9513afc4f',
    'bd3346568eac5ff36de480128ec8ae051482ffe4'
]

### Helper Functions

In [70]:
# Extrahiere Category aus Dict
def extractCategory(row):
    cat_list = []
    if row != None:
        for value in row:
            cat_list.append(value.get('category'))
    
    return cat_list

In [71]:
# Extrahiere Authoren-Liste
def extractAuthorNames(row):
    author_list = []
    if row != None:
        for value in row:
            author_list.append(value.get('name'))

    return author_list

In [72]:
# Extrahiere AuthorenId-Liste
def extractAuthorIds(row):
    authorId_list = []
    if row != None:
        for value in row:
            authorId_list.append(value.get('authorId'))

    return authorId_list

In [73]:
# Extrahiere Affiliations-Liste
def extractAuthorAffiliations(row):
    affiliation_list = []
    if row != None:
        for value in row:
            affiliation_list.append(value.get('affiliations'))

    return affiliation_list

In [74]:
# Extrahiere Journalseiten, welche durch das Paper eingenommen werden
def extractOccupiedJournalPagesCount(journalPages):
    # Belegte Seiten des Journals in dem das Paper veröffentlicht wurde berechnen
    # Manche Angaben werden in ihrer Schreibweise verkürzt... z.B.: 153-57, dies soll wie 153-157 behandelt werden, da sonst negative Zahlen bei der folgenden Berechnung entstehen.

    try:
        split_pages = journalPages.strip().replace('–', '-').split('-')

        try:
            page_from = split_pages[0]
            page_to = split_pages[1]
            pagesOccupiedCount = int(page_to)-int(page_from)

            if pagesOccupiedCount<0:
                # Seitenzahl ist negativ (Seitenzahl von page_from war größer als Seitenzahl von page_to!)
                digit_delta = len(page_from)-len(page_to)
                # Z.B.: 133-38 wird geschaut aus wie vielen Zahlen beide aufgespaltenen Strings bestehen. Es wird die Länge verglichen, in dem Beispiel hat Zahl an [Index 0] 1 Stelle mehr als die Zahl an [Index 1].
                # Es wird nun von der Zahl an [Index 0] der Differenz entsprechend viele Zahlen von Links nach Rechts entnommen und an die Zahl an [Index 1] vorne angefügt.
                # So wird aus der Seitenzahl 38 nach diesem Muster [1]33 - [1]38 (die 1 der ersten Zahl wird vorne an die zweite Zahl angefügt).
                page_fixed = page_from[0:digit_delta]+page_to
                pagesOccupiedCount = int(page_fixed)-int(page_from)
                return pagesOccupiedCount
            else:
                return pagesOccupiedCount
                
        except:
            return 1

    except:
        return 0

### Get Metadata of papers in paperlist

In [75]:
cwd = os.path.abspath('./datasets/altmetric_yearly_top100/')

files = os.listdir(cwd)

altmetric_yearly_top100_df = pd.DataFrame()
for file in files:
    if file.endswith('.xlsx') or file.endswith('.xls'):
        altmetric_yearly_top100_df = altmetric_yearly_top100_df.append(pd.read_excel('./datasets/altmetric_yearly_top100/' + file).rename({'doi': 'DOI'}, axis=1), ignore_index=True)
    elif file.endswith('.csv'):
        altmetric_yearly_top100_df = altmetric_yearly_top100_df.append(pd.read_csv('./datasets/altmetric_yearly_top100/' + file, sep=',').rename({'doi': 'DOI'}, axis=1), ignore_index=True)

In [76]:
altmetric_yearly_top100_df = altmetric_yearly_top100_df.sample(325)

In [77]:
altmetric_list = altmetric_yearly_top100_df[(~(altmetric_yearly_top100_df['Altmetric Attention Score'].isna())) & (altmetric_yearly_top100_df['DOI']!=0) & (~altmetric_yearly_top100_df['DOI'].isna()) & (~altmetric_yearly_top100_df['DOI'].str.contains('http', na=False))]['DOI'].unique().tolist()

In [78]:
def getPaperMetadata(paper_list):
    start_ts = dtime.now().replace(microsecond=0)
    api_key = 'rGUKYOEpCP2FKQK88CLuB1izZBvDiQwA5SsSZ5vo'
    url = 'https://api.semanticscholar.org/graph/v1/paper/batch?fields=title,referenceCount,citationCount,influentialCitationCount,externalIds,authors.name,fieldsOfStudy,s2FieldsOfStudy,year,journal'
    req = requests.post(url=url, json={'ids': paper_list}, params={'x-api-key': api_key}).json()

    try:
        df = pd.DataFrame(req)
    except:
        print(req)

    df['authorIds'] = df['authors'].apply(extractAuthorIds)
    df['authorsCount'] = df['authorIds'].apply(lambda x: len(x))
    df['authors'] = df['authors'].apply(extractAuthorNames)
    
    try:
        df.insert(1, column='DOI', value=df['externalIds'].apply(lambda x: x.get('DOI')), allow_duplicates = False)
    except:
        pass

    try:
        df.insert(2, column='CorpusId', value=df['externalIds'].apply(lambda x: x.get('CorpusId')), allow_duplicates = False)
    except:
        pass

    try:
        df['occupiedJournalPages'] = df['journal'].apply(lambda x: extractOccupiedJournalPagesCount(x.get('pages')))
    except:
        pass

    try:
        df['journalName'] = df['journal'].apply(lambda x: x.get('name'))
    except:
        pass

    try:
        df['s2FieldsOfStudy'] = df['s2FieldsOfStudy'].apply(lambda x: extractCategory(x))
    except:
        pass

    try:
        df['fieldsOfStudy'] = [ [] if x is None or x is np.NaN else x for x in df['fieldsOfStudy'] ]
    except:
        pass
    
    try:
        # Combine FoS & s2FoS
        df['fieldsOfStudy'] = df['fieldsOfStudy']+df['s2FieldsOfStudy']
    except:
        pass

    try:
        # Remove duplicates from the created list
        df['fieldsOfStudy'] = df['fieldsOfStudy'].apply(lambda row: sorted(list(dict.fromkeys(row))))
    except:
        pass
    
    end_ts = dtime.now().replace(microsecond=0)
    print('Collected Metadata of ' + str(df.shape[0]) + ' scientific papers. [Time elapsed: ' + str(end_ts-start_ts) + ']')

    return df.drop(['externalIds', 's2FieldsOfStudy', 'journal'], axis=1)

### Erweitern des Datensatzes mit Metadaten von Papern aus dem Quellenverzeichnis der "Handpicked-Paper"

In [79]:
def getReferenceDataFrame(paperId):
    api_key = 'rGUKYOEpCP2FKQK88CLuB1izZBvDiQwA5SsSZ5vo'
    offset = 0
    df = pd.DataFrame()
    start_ts = dtime.now().replace(microsecond=0)

    # Get References Data
    while offset != None:
        try:
            # print('Offset: '+str(offset))
            url = "https://api.semanticscholar.org/graph/v1/paper/"+paperId.replace(' ','').strip()+"/references?fields=title,referenceCount,citationCount,influentialCitationCount,isInfluential,fieldsOfStudy,s2FieldsOfStudy,year,journal,authors,externalIds&limit=1000&offset="+str(offset)
            req = requests.get(url=url, params={'x-api-key': api_key}).json()
        
            # Wait 5 Minutes and retry if API-Server sends 'Too Many Requests'
            while req.get('message') == 'Too Many Requests':
                print('[Too Many Requests]: Waiting 5 minutes.')
                time.sleep((60*5)+5) # Wait 5 Minutes + 5 Extra-seconds for tolerance
                url = "https://api.semanticscholar.org/graph/v1/paper/"+paperId.replace(' ','').strip()+"/references?fields=title,referenceCount,citationCount,influentialCitationCount,isInfluential,fieldsOfStudy,s2FieldsOfStudy,year,journal,authors,externalIds&limit=1000&offset="+str(offset)
                req = requests.get(url=url, params={'x-api-key': api_key}).json()
            
            df = pd.concat([df, pd.DataFrame(req.get('data'))])
            offset = req.get('next')
        except Exception as e:
            # print('An error was returned from the API-Server while querying the paper with paperId: ' + str(paperId) + '.. Proceeding with next row.')
            print(e)
    try:
        try:
            df = df[['isInfluential']].reset_index(drop=True).join(pd.json_normalize(df['citedPaper']))
        except:
            # No References found.
            return
            # df = pd.json_normalize(df['citedPaper'])

        # Remove rows with missing Ids
        if df[df['paperId'].isna()].shape[0]>0:
            # missing_row_count = df[df['paperId'].isna()].shape[0]
            # print('Removed ' + str(missing_row_count) + ' rows which had missing paperId and external Ids.')
            df = df[~df['paperId'].isna()]

        df['s2FieldsOfStudy'] = df['s2FieldsOfStudy'].apply(lambda x: extractCategory(x))

        # df['fieldsOfStudy'] = [ [] if x is None else x for x in df['fieldsOfStudy'] ] + df['s2FieldsOfStudy']
        df['fieldsOfStudy'] = [ [] if x is None or x is np.NaN else x for x in df['fieldsOfStudy'] ]

        # Combine foS & s2FoS
        df['fieldsOfStudy'] = df['fieldsOfStudy']+df['s2FieldsOfStudy']

        # Remove duplicates from the created list
        df['fieldsOfStudy'] = df['fieldsOfStudy'].apply(lambda row: sorted(list(dict.fromkeys(row))))

        # Belegte Seiten des Journals in dem das Paper veröffentlicht wurde berechnen
        try:
            # Manche Angaben werden in ihrer Schreibweise verkürzt... z.B.: 153-57, dies soll wie 153-157 behandelt werden, da sonst negative Zahlen bei der folgenden Berechnung entstehen.
            df['occupiedJournalPages'] = df['journal.pages'].apply(lambda journalPages: extractOccupiedJournalPagesCount(journalPages))
        except:
            df['occupiedJournalPages'] = 0

        # Author Name extrahieren
        df['authorsCount'] = df['authors'].apply(lambda x: len(x))
        df['authorIds'] = df['authors'].apply(extractAuthorIds)
        df['authors'] = df['authors'].apply(extractAuthorNames)

        # Author Affiliation extrahieren
        # df['authors_affiliations'] = df['authors'].apply(lambda x: extractAuthorAffiliations(x))

        # Dtype conversion
        # df['year'] = df['year'].astype(int)
        df['referenceCount'] = df['referenceCount'].astype(int)
        df['citationCount'] = df['citationCount'].astype(int)
        df['influentialCitationCount'] = df['influentialCitationCount'].astype(int)

        # if missing_row_count>0:
        #     print('Collected ' + str(df.shape[0]) + ' row(s), removed ' + str(missing_row_count) + ' row(s) due to missing Id. (For Details check: www.semanticscholar.org/paper/' + paperId + ') [Time elapsed: ' + str((end_ts-start_ts)) + ']')
        # else:
        #     print('Collected ' + str(df.shape[0]) + ' row(s). [Time elapsed: ' + str((end_ts-start_ts)) + ']')

        df.insert(0, 'Ref_paperId', paperId)
        try:
            df.insert(loc=1, column='DOI', value=df['externalIds.DOI'])
        except:
            pass

        for col in df.columns:
            if 'externalIds' in col:
                df.drop(col, axis=1, inplace=True)
        
        potential_cols_to_be_dropped = ['isInfluential', 's2FieldsOfStudy','publicationVenue.id', 'publicationVenue.issn', 'publicationVenue.name',	'publicationVenue.type', 'publicationVenue.alternate_issns', 'publicationVenue.url', 'publicationVenue.alternate_urls', 'journal.volume', 'journal.pages', 'journal', 'publicationVenue.alternate_names', 'publicationVenue']
        for col in potential_cols_to_be_dropped:
            try:
                df.drop([col], axis=1, inplace=True)
            except:
                pass
            
        end_ts = dtime.now().replace(microsecond=0)
        print('Collected Metadata of ' + str(df.shape[0]) + ' scientific papers. [Time elapsed: ' + str(end_ts-start_ts) + ']')

        return df.rename({'journal.name': 'journalName'}, axis=1)

    except Exception as e:
        print(e)

### Erweitern der Datensätze (basierend auf Quellenverzeichnis der handpicked-papers + handpicked-papers selbst) um Autoren Infos

In [80]:
def getAuthorData(author_list):
    try:
        api_key = 'rGUKYOEpCP2FKQK88CLuB1izZBvDiQwA5SsSZ5vo'
        url = 'https://api.semanticscholar.org/graph/v1/author/batch?fields=name,paperCount,citationCount,hIndex'
        req = requests.post(url=url, json={'ids': author_list}, params={'x-api-key': api_key})

        # Wait 5 Minutes and retry if API-Server sends 'Too Many Requests'
        while req.status_code == 429: # Too Many Requests
            print('[Too Many Requests]: Waiting 5 minutes.')
            time.sleep((60*5)+5) # Wait 5 Minutes + 5 Extra-seconds for tolerance
            req = requests.post(url=url, json={'ids': author_list}, params={'x-api-key': api_key})
        
        try:
            df = pd.DataFrame(req.json())
            return df
        except Exception as e:
            print(e)

    except Exception as e:
        print(e)
        return pd.DataFrame()


### __Get the Data__

In [81]:
influential_papers_list = hamza_paper_list+maurice_paper_list+ramses_paper_list+altmetric_list

__Get Semantic Scholar Paper and References Data__

In [82]:
if os.path.exists('./exports/ss_df.csv'):
    # Sicherungskopie einlesen, falls der Code nach der Abfrage der Referenzen abgestürzt sein sollte.
    ss_df = pd.read_csv('./exports/ss_df.csv', sep=',')
    print('Loaded ss_df from disk. (' + str(ss_df.shape[0]) + ' rows; ' + str(ss_df.shape[1]) + ' columns)')
else:
    try:
            ss_df = getPaperMetadata(influential_papers_list)
            print('Collecting Semantic Scholar Reference Data...')
            reference_df = pd.DataFrame()
            start_ts = dtime.now().replace(microsecond=0)
            for idx, paper in enumerate(influential_papers_list):
                # print('[' + str(idx+1) + '] ------------------------ Paper: ' + str(paper) + ' ------------------------')
                reference_df = pd.concat([reference_df, getReferenceDataFrame(paper)])
            reference_df = reference_df.drop_duplicates(subset=['paperId'])
            end_ts = dtime.now().replace(microsecond=0)
            print('- Colleted ' + str(reference_df.shape[0]) + ' paper(s). [Time elapsed: ' + str((end_ts-start_ts)) + ']')
    except Exception as e:
        print('- Failed to collect Semantic Scholar References metadata.')

    ss_df = ss_df.append(reference_df)
    ss_df.insert(0, 'Ref_paperId', ss_df.pop('Ref_paperId'))
    print('- Appended Reference DataFrame to "ss_df" for future processing.')

    if os.path.exists('./exports/') == False:
        os.mkdir('./exports/')

    # Remove rows with missing DOIs since it's impossible to query other APIs like Altmetric for these cases
    ss_df.dropna(subset=['DOI'], inplace=True)

    # Remove rows with duplicate subsets of paperId and DOI since it can be possible for multiple handpicked-papers cite the same paper as one of their sources
    ss_df.drop_duplicates(['paperId', 'DOI'], keep='first', inplace=True)

    ss_df.to_csv('./exports/ss_df.csv', encoding='utf-8-sig', index=False)

Loaded ss_df from disk. (19508 rows; 15 columns)


__Get Altmetric Data__

In [83]:
if os.path.exists('./exports/altmetric_df.csv'):
    # Sicherungskopie einlesen, falls der Code nach der Abfrage der Referenzen abgestürzt sein sollte.
    altmetric_df = pd.read_csv('./exports/altmetric_df.csv', sep=',')
    print('Loaded altmetric_df from disk. (rows: ' + str(altmetric_df.shape[0]) + '; columns: ' + str(altmetric_df.shape[1]) + ')')

    # Before Merging make DOIs uppercase
    ss_df['DOI'] = ss_df['DOI'].str.upper()
    altmetric_df['altmetric_doi'] = altmetric_df['altmetric_doi'].str.upper()

    # Merge Altmetric with Semantic Scholar Reference Data
    ss_df = ss_df.merge(altmetric_df, how='left', left_on='DOI', right_on='altmetric_doi').drop(['altmetric_doi'], axis=1)
else:
    try:
        print('- Collecting Altmetric Data...')
        start_ts = dtime.now().replace(microsecond=0)
        altmetric_list = []
        for idx, paper in enumerate(ss_df[~ss_df['DOI'].isna()]['DOI'].unique()):
            print('[' + str(idx+1) + '] ---------- Collecting data for <' + str(paper) + '> ----------')
            altmetric_list = altmetric_list+[getAMdata(paper)]
        altmetric_df = pd.DataFrame([i for i in altmetric_list if i is not None])
        end_ts = dtime.now().replace(microsecond=0)
        print('- Colleted ' + str(altmetric_df.shape[0]) + ' paper(s) Altmetric data. [Time elapsed: ' + str((end_ts-start_ts)) + ']')
        
        # Export Altmetric_df to csv
        altmetric_df.to_csv('./exports/altmetric_df.csv', encoding='utf-8-sig', index=False)

        # Before Merging make DOIs uppercase
        ss_df['DOI'] = ss_df['DOI'].str.upper()
        altmetric_df['altmetric_doi'] = altmetric_df['altmetric_doi'].str.upper()

        # Merge Altmetric with Semantic Scholar Reference Data
        ss_df = ss_df.merge(altmetric_df, how='left', left_on='DOI', right_on='altmetric_doi').drop(['altmetric_doi'], axis=1)
    except:
        print('Failed to collect Altmetric data.')

    # Drop potential duplicates
    ss_df.drop_duplicates(['paperId', 'DOI'], inplace=True)

try:
    # Convert String-Column to List
    ss_df['authorIds'] = ss_df['authorIds'].apply(literal_eval)
except:
    pass

Loaded altmetric_df from disk. (rows: 12607; columns: 23)


__Get Authors Data__

In [84]:
if os.path.exists('./exports/authors_df.csv'):
    # Sicherungskopie einlesen, falls der Code nach der Abfrage der Referenzen abgestürzt sein sollte.
    authors_df = pd.read_csv('./exports/authors_df.csv', sep=',')
    print('Loaded authors_df from disk. (rows: ' + str(authors_df.shape[0]) + '; columns: ' + str(authors_df.shape[1]) + ')')
else:
    try:
        print('- Collecting Semantic Scholar Author Data...')
        authors_df = pd.DataFrame()
        # Tell Pandas to recognize 'authorIds' as List, otherwise it will be handled as if it was a String and then the following code snippet won't work
        try:
            ss_df['authorIds'] = ss_df['authorIds'].apply(literal_eval)
        except:
            pass
        ssAuthors = ss_df['authorIds'].explode().unique().tolist()
        # Remove missing values from list
        ssAuthors = [x for x in ssAuthors if x not in [None, np.nan, 'nan']]

        for authors in range(0, len(ssAuthors), 1000):
            authors_df = authors_df.append(getAuthorData(ssAuthors[authors:authors+1000]))
            print('Collected Authors [' + str(authors_df.shape[0]) + '/' + str(len(ssAuthors)) + ']')
        
        # try:
        #     print('Checking if the author is a Nobel Prize laureate.')
        #     authors_df['isNobelPrizeLaureate'] = authors_df['name'].apply(isNobelPrizeLaureate)
        # except:
        #     print('Error occured checking Nobel Prize Laureates.')

        authors_df.to_csv('./exports/authors_df.csv', encoding='utf-8-sig', index=False)
    except Exception as e:
        print(e)
        print('Failed to collect author data from Semanticscholar.org')

authors_df['authorId'] = authors_df['authorId'].astype(str)

Loaded authors_df from disk. (rows: 67364; columns: 5)


In [85]:
authors_df.sample(5)

Unnamed: 0,authorId,name,paperCount,citationCount,hIndex
57672,1892901,T. Agócs,75,1963,14
55007,101130934,K. Soroka,3,175,2
15975,39099576,T. Omata,182,3395,29
58211,1403427003,Z. Saz-Parkinson,66,988,15
40692,8391563,V. Smolčić,188,14934,54


__Calculate aggregated Author KPIs__

In [86]:
# .explore() macht aus einer Liste mit Werten neue separate Zeilen [1, 2, 3] ->  drei separate Zeilen, die bis auf die für .explode() genutzte Spalte identisch sind.
ss_df_exploded = ss_df.explode('authorIds')

In [87]:
# Semantic Scholar DataFrame mit Authoren DataFrame mergen
ss_df_exploded = ss_df_exploded[['paperId', 'authorIds']].merge(authors_df.rename({'paperCount':'authors_sum_paperCount', 'citationCount':'authors_sum_citationCount', 'hIndex':'authors_sum_hIndex'}, axis=1), how='left', left_on=['authorIds'], right_on=['authorId']).sort_values(by=['paperId']).drop(['authorIds', 'name'], axis=1)

In [88]:
# Eventuelle Duplikate entfernen
ss_df_exploded.drop_duplicates(subset=['paperId', 'authorId'], inplace=True)

In [89]:
ss_df_exploded[ss_df_exploded['paperId']=='e932c974074cc4a38f02fa7d674932eef81faf61']

Unnamed: 0,paperId,authorId,authors_sum_paperCount,authors_sum_citationCount,authors_sum_hIndex
2276,e932c974074cc4a38f02fa7d674932eef81faf61,36933375,142.0,12784.0,58.0
2277,e932c974074cc4a38f02fa7d674932eef81faf61,50455279,187.0,8791.0,50.0
2278,e932c974074cc4a38f02fa7d674932eef81faf61,2106288317,17.0,772.0,7.0


In [90]:
# Author KPIs auf Basis der paperId aggregieren
ss_df_exploded = ss_df_exploded[['paperId', 'authors_sum_paperCount', 'authors_sum_citationCount', 'authors_sum_hIndex']].groupby(['paperId']).sum().reset_index()

In [91]:
ss_df_exploded[ss_df_exploded['paperId']=='e932c974074cc4a38f02fa7d674932eef81faf61']

Unnamed: 0,paperId,authors_sum_paperCount,authors_sum_citationCount,authors_sum_hIndex
17821,e932c974074cc4a38f02fa7d674932eef81faf61,346.0,22347.0,115.0


In [92]:
# Aggregierte Author KPIs wieder zum ss_df zurückführen
ss_df = ss_df.merge(ss_df_exploded, how='left', on=['paperId'])

In [93]:
# Count number of Nobel Prize Laureates which contributed to a specific scientific paper and append column
ss_df['nobelPrizeLaureatesCount'] = ss_df['authors'].apply(getNobelPrizeLaureateCount)

In [94]:
ss_df.shape

(19539, 41)

In [95]:
if os.path.exists('./exports/') == False:
    os.mkdir('./exports/')

In [96]:
ss_df.to_csv('./exports/ss_df_complete.csv', encoding='utf-8-sig', index=False)

__Prepare DataFrame for ML__

In [97]:
# Create DataFrame for ML with only relevant KPIs
ss_df_reduced = ss_df[['paperId', 'referenceCount', 'citationCount', 'authorsCount', 'occupiedJournalPages', 'authors_sum_paperCount', 'authors_sum_citationCount', 'authors_sum_hIndex', 'altmetric_score', 'influentialCitationCount']]

In [98]:
# Fill Missing Values with 0
ss_df_reduced = ss_df_reduced.fillna(0)

In [99]:
# Typecasting
ss_df_reduced.iloc[:, 1:] = ss_df_reduced.iloc[:, 1:].astype(int)

In [102]:
ss_df_reduced

Unnamed: 0,paperId,referenceCount,citationCount,authorsCount,occupiedJournalPages,authors_sum_paperCount,authors_sum_citationCount,authors_sum_hIndex,altmetric_score,influentialCitationCount
0,c5d7b0fafdd72f57f2b0bfdd0ce3608a2528b666,0,14055,1,0,410,64780,105,0,302
1,eaa7d6df21e3a097ab902e19ff2f2905aafb7ab3,174,2802,2,0,530,41428,126,3,332
2,79b4b48c6481becde78bd6287386d6507dac27b1,107,2477,2,0,709,81901,189,29,319
3,e87c3be7aaa1c7ce91f94fda0815339ac02af787,103,5112,2,0,139,55855,67,26,554
4,d224a1027b47cece2982c400162e053192423b1b,41,5737,6,0,3487,192943,392,43,92
...,...,...,...,...,...,...,...,...,...,...
19534,209575bae1041c8be7a2c3f3ff38ab19065d277d,51,339,4,9,138,7533,54,13,26
19535,6b21acc0d164190b9111da7cafd7630f70490c7b,28,868,1,3,86,12291,43,418,98
19536,7357771111c6160e44a98174ee19e04c3bdd81a5,22,229,6,5,880,27901,164,29,10
19537,14f2cdbf971aa34b7d6a911fb0fbdf05c4474b3f,30,39,1,3,144,7963,53,4,0


In [103]:
# Export für Dashboard
ss_df_reduced.to_csv('./exports/ss_df_reduced.csv', encoding='utf-8-sig', index=False)

In [None]:
ss_df_reduced

In [None]:
# Normalisierung mit Min-Max-Scaler
from sklearn.preprocessing import MinMaxScaler

normScaler = MinMaxScaler(feature_range=(0, 100))

In [None]:
# Transform 'ml_df' with Min-Max-Scaler
ss_df_normalized = pd.DataFrame(normScaler.fit_transform(ss_df_reduced.drop(['paperId'], axis=1)))
ss_df_normalized

In [None]:
# Rename columns
ss_df_normalized.columns = ss_df_reduced.columns[1:]

In [None]:
# Test if column contains min of 0 and max of 100
ss_df_normalized['citationCount'].min(), ss_df_normalized['citationCount'].max()

In [None]:
# Join back paperId to normalized df
ss_df_normalized = ss_df_reduced[['paperId']].join(ss_df_normalized)

### __Gewichtung berechnen__

In [None]:
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import train_test_split

In [None]:
# Train/Test-Split
train_data, test_data = train_test_split(ss_df_reduced.drop(['paperId'], axis=1),test_size=0.2,random_state=42)
train_data.shape, test_data.shape

In [None]:
# Tabular Predictor laden und Speicherpfad einstellen
save_path = './trained_models/weights'

if os.path.exists(save_path):
    predictor = TabularPredictor.load(save_path)
    print('Loaded model from disk.')
else:
    predictor = TabularPredictor(label='influentialCitationCount', path=save_path).fit(train_data, verbosity=0, presets='medium_quality')

In [None]:
# Permutation Feature Importance ausgeben
feature_importance = predictor.feature_importance(data=test_data)[['importance']]

In [None]:
# Anteilsmäßigen Einfluss der Features berechnen
feature_importance['total_share'] = round(abs(feature_importance['importance']/feature_importance['importance'].sum()), 3)

In [None]:
weight_df = feature_importance[['total_share']].rename({'total_share':'Gewichtung'}, axis=1).T

In [None]:
weight_df

In [None]:
# Export Weight-Table for Dashboard
weight_df.to_csv('./exports/weights.csv', encoding='utf-8-sig', index=False)

In [None]:
weight_df