# Install packages

In [17]:
!pip install pandas          # for rectangular data manipulation
!pip install pybliometrics   # wrapper for Scopus API
!pip install networkx        # network stats 
!pip install matplotlib      # visualization
!pip install nltk            # natural language processing tools
!pip install scipy           # more stats tools
!pip install seaborn         # more visualization

Collecting seaborn
  Downloading seaborn-0.11.2-py3-none-any.whl (292 kB)
     |████████████████████████████████| 292 kB 4.7 MB/s            
Installing collected packages: seaborn
Successfully installed seaborn-0.11.2


# Load packages and modules, check access

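Scopus API access is regulated by two layers of authentication: a personal API key and an institutional entitlement (typically verified through the IP address of a subscribing institution or an institutional token). The following may therefore not work, even with the appropriate personal API key.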

In [19]:
import scipy
import seaborn
from itertools import combinations
import networkx as nx
import matplotlib.pyplot as plt
from datetime import datetime
import pandas as pd
from string import digits, punctuation

import nltk
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.decomposition import LatentDirichletAllocation as LDA

from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.decomposition import LatentDirichletAllocation as LDA

If authorization succeeds, the following block should print the elapsed query time and the number of results for a search of all _American Journal of Sociology_ articles published in 2010.

In [20]:
from pybliometrics.scopus import ScopusSearch  # bread-and-butter tool from pybib

# Access check: time a single small query (ISSN 0002-9602, publication year 2010).
# If the API key or the institutional entitlement is missing, the ScopusSearch
# call raises (e.g. Scopus401Error) before anything is printed.
start = datetime.now().replace(microsecond=0)
s = ScopusSearch("ISSN (0002-9602) AND PUBYEAR IS 2010")
end = datetime.now().replace(microsecond=0)

print(end - start)            # elapsed wall-clock time, truncated to whole seconds
print(len(s.results or []))   # number of hits; `results` may be None when there are none

Scopus401Error: The requestor is not authorized to access the requested view or fields of the resource

# Network graph of a single paper

This toy example should produce a simple graph of the co-authors of a single paper. But first, check the number of hits from a search for the title as a string:

In [None]:
# Search for the paper by its title and report how many records match.
s = ScopusSearch('The Proliferation of Criminal Background Check Laws in the United States')
print(len(s.results or []))   # `results` may be None when the query has no hits

Now extract the authors from the search object, and define co-authorship as edges in a graph.

In [8]:
# One list of author IDs per search hit; hits without author IDs are skipped.
authors = [hit.author_ids.split(';') for hit in (s.results or []) if hit.author_ids]

# Every unordered pair of authors on the same paper becomes one edge.
edges = [pair for ids in authors for pair in combinations(ids, 2)]

G = nx.Graph()
G.add_edges_from(edges)
# Summary printed by hand so it does not depend on nx.info(), which newer
# networkx releases no longer provide.
print(f"Graph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

# The figure size is a property of the figure, not of savefig(), so it is set
# when the figure is created and the graph is drawn onto that explicit axes.
fig, ax = plt.subplots(figsize=(50, 50))
nx.draw(G, ax=ax, node_size=2)
fig.savefig('netword1.pdf', bbox_inches='tight')

Scopus401Error: The requestor is not authorized to access the requested view or fields of the resource

## Another network graph example

In [9]:
s = ScopusSearch('Class Mobility and Political Preferences: Individual and Contextual Effects')

# One list of author IDs per search hit; hits without author IDs are skipped.
authors = [hit.author_ids.split(';') for hit in (s.results or []) if hit.author_ids]

# Every unordered pair of authors on the same paper becomes one edge.
edges = [pair for ids in authors for pair in combinations(ids, 2)]

G = nx.Graph()
G.add_edges_from(edges)
# Summary printed by hand so it does not depend on nx.info(), which newer
# networkx releases no longer provide.
print(f"Graph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

# The figure size is a property of the figure, not of savefig(), so it is set
# when the figure is created and the graph is drawn onto that explicit axes.
fig, ax = plt.subplots(figsize=(50, 50))
nx.draw(G, ax=ax, node_size=2)
fig.savefig('netword2.pdf', bbox_inches='tight')

Scopus401Error: The requestor is not authorized to access the requested view or fields of the resource

# Citation analysis and visualization

In [10]:
from pybliometrics.scopus import AbstractRetrieval, CitationOverview
from scipy import stats  # explicit submodule import; `import scipy` alone may not expose scipy.stats

## Save yearly citation counts in a DataFrame
# Each paper contributes one Series (index: year, value: citation count) named
# after the paper's title. The Series are collected in a list and turned into
# a frame once, instead of growing a DataFrame row by row with append().
eids = ('2-s2.0-84900013243', '2-s2.0-84977703147', '2-s2.0-84977736029')
yearly_counts = []
for eid in eids:
    ab = AbstractRetrieval(eid)                 # abstract retrieval for finding dates
    year = ab.coverDate[:4]                     # publication year = first four characters of the cover date
    co = CitationOverview(eid, start=year)      # citation overview from the publication year onwards
    yearly_counts.append(pd.Series(dict(co.cc), name=ab.title))
data = pd.DataFrame(yearly_counts)

## Prepare data for plotting
# Transpose so that each row is a year and each column a paper; years in which
# a paper has no entry stay NaN.
data = data.T.astype(float).sort_index()
data.index.name = 'year'
data = data.reset_index()

## Plot and save
fig, ax = plt.subplots(figsize=(15, 7))
melted = data.melt(id_vars='year', var_name='paper', value_name='citations')
seaborn.barplot(data=melted, x='year', y='citations', hue='paper', ax=ax)
fig.savefig('citations.pdf', bbox_inches='tight')

## Two-sample Kolmogorov-Smirnov test on the citation distributions
# NOTE(review): after reset_index() column 0 holds the years, so the first two
# papers are columns 1 and 2. Comparing those two is assumed to be the intent;
# confirm which pair of papers should be compared.
stats.ks_2samp(data.iloc[:, 1].dropna(), data.iloc[:, 2].dropna())

Scopus401Error: The requestor is not authorized to access the requested view or fields of the resource

# LDA on Abstracts

Note that the AbstractRetrieval tool is not necessary here, since abstracts are part of the "description" in search results.

In [11]:
s = ScopusSearch('SOURCE-ID(21100422153)')
# Keep only hits that actually carry an abstract ("description").
abstracts = [hit.description for hit in (s.results or []) if hit.description]
print(len(abstracts))

stemmer = nltk.stem.SnowballStemmer('english')

# Translation table that deletes punctuation and digits before tokenizing.
# NOTE(review): the extra character was "@", which `punctuation` already
# contains; the copyright sign is assumed to be what was meant - confirm.
_strip_chars = str.maketrans("", "", punctuation + digits + "©")


def tokenize_and_stem(text):
    """Return the lower-cased, stemmed word tokens of ``text``.

    Punctuation, digits and the copyright sign are removed first. Tokenizing
    uses ``nltk.word_tokenize``, which needs NLTK's tokenizer data to be
    installed (see ``nltk.download``).
    """
    cleaned = text.translate(_strip_chars).lower()
    return [stemmer.stem(token) for token in nltk.word_tokenize(cleaned)]


# Backward-compatible alias for the earlier, misspelled name.
tokensize_and_stem = tokenize_and_stem

params = {
    "stop_words": "english",
    "tokenizer": tokenize_and_stem,
    "ngram_range": (1, 2),
    "max_df": 0.6,
}
vectorizer = CV(**params)
matrix = vectorizer.fit_transform(abstracts)

# Newer scikit-learn releases provide get_feature_names_out() and no longer
# have get_feature_names(); use whichever the installed version offers.
_get_terms = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
terms = _get_terms()
print(len(terms))

# Fixed seed so that the fitted topics are the same on every run.
lda = LDA(n_components=3, random_state=0).fit(matrix)
for topic in lda.components_:
    # The nine highest-weighted terms of the topic, strongest first.
    print(",".join(terms[i] for i in topic.argsort()[:-10:-1]))

ModuleNotFoundError: No module named 'nltk'