In [None]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
from matplotlib import pyplot as plt
import warnings
import spacy
import swifter
from tqdm import tqdm
import spacy

# Cytowania artykułów naukowych z dziedziny AI w dokumentach legislacyjnych - EDA


### Zbiór danych

>**S2ORC** is a general-purpose corpus for NLP and text mining research over scientific papers. We've curated a unified resource that combines aspects of **citation graphs** (i.e. rich paper metadata, abstracts, citation edges) [...]. Corpus covers 136M+ paper nodes with 12.7M+ full text papers and connected by 467M+ citation edges by unifying data from many different sources covering many different academic disciplines and identifying open-access papers using services.
*https://github.com/allenai/s2orc/*


<img src="img/Altmetric_logo.png" width="200">
<img src="img/openalex-logo.png" width="200">

In [None]:
# Read the article table from the CSV export and show it as the cell output.
df = pd.read_csv("data/df_all.csv")
df

In [None]:
# List the column labels of the loaded frame.
df.columns

In [None]:
# Titles of the five rows with the highest "score" value.
df[["title", "score"]].sort_values(by="score", ascending=False).head(5)

In [None]:
# Show the raw "mag_field_of_study" column before it is parsed in the next cell.
df["mag_field_of_study"]

In [None]:
# Convert the raw "mag_field_of_study" strings into lists of field names and
# add one 0/1 indicator column per field.
#
# Missing values are filled with a placeholder whose first and last characters
# are dropped by the parser below, which leaves the field name "None".
MISSING_FIELDS_PLACEHOLDER = "NNonee"


def _split_fields(raw):
    """Return the field names encoded in one raw cell value.

    The first and last characters are dropped (presumably the surrounding
    "[" and "]" -- TODO confirm against the CSV), single quotes are removed
    and the rest is split on commas.  Each piece is stripped because splitting
    on "," leaves a leading space on every field after the first one; without
    the strip those fields never matched their indicator column.  Empty pieces
    are discarded.
    """
    pieces = raw[1:-1].replace("'", "").split(",")
    return [piece.strip() for piece in pieces if piece.strip()]


df["mag_field_of_study"] = (
    df["mag_field_of_study"].fillna(MISSING_FIELDS_PLACEHOLDER).map(_split_fields)
)

# Sorted so that the column order (and every plot built from it) is the same
# on each run; a bare set has no stable iteration order for strings.
categories = sorted({field for fields in df["mag_field_of_study"] for field in fields})

# One indicator column per field, assigned column-wise instead of through
# chained indexing (df[k][i] = 1), which is unreliable in pandas.
for category in categories:
    df[category] = df["mag_field_of_study"].map(
        lambda fields, name=category: int(name in fields)
    )

### Wstępna analiza

##### Liczba artykułów a dziedzina

In [None]:
# Bar chart of the number of articles flagged with each field indicator.
plt.figure(figsize=(8, 6))
# Per-field totals of the 0/1 indicator columns; `sums` is reused by a later cell.
sums = [df[field].sum() for field in categories]
# x and y are passed by keyword: current seaborn releases no longer accept
# them as positional arguments.
ax = sns.barplot(x=categories, y=sums)
plt.xticks(rotation=90);
plt.title("Number of articles by field", size=15);

##### Procent cytowanych artykułów a dziedzina

In [None]:
# Histogram of the "cited_by_policies_count" column with a log-scaled y axis.
df["cited_by_policies_count"].hist()
plt.gca().set_yscale("log")
plt.gca().set_title("cited_by_policies_count distribution");

In [None]:
# Print the titles of articles with more than 15 policy citations.
for title in df.loc[df["cited_by_policies_count"] > 15, "title"]:
    print(title)

# Share of articles in each field that have at least one policy citation.
# The mean of the boolean flag over the rows of a field equals
# (cited rows in the field) / (rows in the field), so this cell no longer
# depends on `sums` computed in another cell; a field without rows gives NaN.
is_cited = df["cited_by_policies_count"] > 0
percs = [is_cited[df[field] == 1].mean() for field in categories]

plt.figure(figsize=(8, 6))
# x and y are passed by keyword: current seaborn releases no longer accept
# them as positional arguments.
ax = sns.barplot(x=categories, y=percs)
plt.xticks(rotation=90);
plt.title("Percentage of articles cited in policies by field", size=15);

#### Cytowania na Twitterze

In [None]:
# Histogram of the "cited_by_tweeters_count" column with a log-scaled y axis.
df["cited_by_tweeters_count"].hist()
plt.yscale('log')
# The title now names the column that is actually plotted.
plt.title("cited_by_tweeters_count distribution");

In [None]:
# Mean "cited_by_tweeters_count" per field, using only rows with fewer than
# 250 tweeters.
df_twt = df[df["cited_by_tweeters_count"] < 250]
# .mean() replaces sum()/len(): same value, but a field with no remaining rows
# yields NaN instead of raising ZeroDivisionError.
AverageTweets = [
    df_twt.loc[df_twt[field] == 1, "cited_by_tweeters_count"].mean()
    for field in categories
]
plt.figure(figsize=(8, 6))
# x and y are passed by keyword: current seaborn releases no longer accept
# them as positional arguments.
ax = sns.barplot(x=categories, y=AverageTweets)
plt.xticks(rotation=90);
plt.title("Average number of Tweeter citations by field", size=15);

In [None]:
# Titles of the rows whose "Art" indicator column equals 1.
df.loc[df["Art"] == 1, "title"]

#### Długość abstraktów

In [None]:
# Keep only rows that have an abstract and record the abstract length in
# characters.  The explicit .copy() makes df_abs an independent frame, so
# adding a column does not trigger SettingWithCopyWarning or depend on
# whether the filter returned a view.
df_abs = df[df["abstract"].notna()].copy()
df_abs["len_abstract"] = df_abs["abstract"].str.len()
df_abs["len_abstract"].hist()

#### Macierz korelacji

In [None]:
# Correlation heatmap of the count/score columns plus the abstract length.
df_cont = df_abs.loc[
    :,
    [
        "cited_by_posts_count",
        "cited_by_tweeters_count",
        "cited_by_policies_count",
        "readers_count",
        "score",
        "len_abstract",
    ],
]
plt.figure(figsize=(8, 6))
ax = sns.heatmap(data=df_cont.corr(), annot=True)

#### Najczęstsze nazwy własne

In [None]:
from spacy.tokens import DocBin

en = spacy.load('en_core_web_sm')
# The parsed documents are read from a serialized DocBin instead of being
# recomputed; the equivalent computation was
#   df['abstract'].swifter.progress_bar(True).apply(en)
doc_bin = DocBin().from_disk("data/df_all.spacy")
docs = list(doc_bin.get_docs(en.vocab))
# Assign the list directly.  Wrapping it in np.array() lets NumPy try to
# unpack every Doc as a nested sequence of tokens (Docs support len() and
# indexing), which is slow and can fail on documents of different lengths.
# NOTE(review): assumes the DocBin holds one Doc per row of df, in row order.
df['doc'] = docs

In [None]:
# Rows with at least one policy citation.
df_cited = df.loc[df["cited_by_policies_count"] > 0]

In [None]:
# Entity strings that the entity-collection cells below skip
# (they test `str(entity) not in stop_words_ents`).
stop_words_ents = [
    "ROC",
    "OA",
    "MODIS",
    "MR",
    "J48",
    "DR",
    "NC",
    "MD",
    "TF",
    "Random Forests",
    "Multilayer Perceptron",
    "F1",
    "Random",
    "Gradient Boosting",
    "Kernel",
    "Gabor",
    "Background",
]

In [None]:
# Collect the text of every named entity in all documents, grouped by its
# label (GPE, ORG, NORP, PERSON); entities listed in stop_words_ents are skipped.
gpe, org, norp, person = [], [], [], []
_targets_by_label = {"GPE": gpe, "ORG": org, "NORP": norp, "PERSON": person}
for doc in df['doc']:
    for ent in doc.ents:
        ent_text = str(ent)
        if ent_text in stop_words_ents:
            continue
        target = _targets_by_label.get(ent.label_)
        if target is not None:
            target.append(ent_text)

In [None]:
# Same grouping as for the full corpus, restricted to the documents in
# df_cited: entity texts by label, skipping anything in stop_words_ents.
gpe_c, org_c, norp_c, person_c = [], [], [], []
_cited_targets_by_label = {
    "GPE": gpe_c,
    "ORG": org_c,
    "NORP": norp_c,
    "PERSON": person_c,
}
for doc in df_cited['doc']:
    for ent in doc.ents:
        ent_text = str(ent)
        if ent_text in stop_words_ents:
            continue
        target = _cited_targets_by_label.get(ent.label_)
        if target is not None:
            target.append(ent_text)

In [None]:
from collections import Counter

# Ten most frequent (entity, count) pairs per label, for the whole corpus
# ("_all") and for the policy-cited subset ("_cited").
com_gpe_all = Counter(gpe).most_common(10)
com_gpe_cited = Counter(gpe_c).most_common(10)

com_org_all = Counter(org).most_common(10)
com_org_cited = Counter(org_c).most_common(10)

com_per_all = Counter(person).most_common(10)
com_per_cited = Counter(person_c).most_common(10)

In [None]:
# Side-by-side table of the most common entities.  Each list is wrapped in a
# Series so that a list with fewer than ten entries (for example a label with
# few distinct entities in the cited subset) is padded with NaN instead of
# making the DataFrame constructor fail on unequal lengths.
pd.DataFrame({
    "gpe_all": pd.Series(com_gpe_all, dtype=object),
    "gpe_cited": pd.Series(com_gpe_cited, dtype=object),
    "person_all": pd.Series(com_per_all, dtype=object),
    "person_cited": pd.Series(com_per_cited, dtype=object),
    "org_all": pd.Series(com_org_all, dtype=object),
    "org_cited": pd.Series(com_org_cited, dtype=object),
})

### Rozbicie danych na kategorie i ich udział w cytowanych artykułach

Poniżej wykonano analizę analogiczną do tej z artykułu L. Bornmanna i in. *"Policy documents as sources for measuring societal impact"* [1]. Polega ona na porównaniu udziału danej kategorii w całym zbiorze z jej udziałem w zbiorze ograniczonym do artykułów cytowanych w dokumentach legislacyjnych.
<img src="img/screenshot1.png" width="800">

In [None]:
def create_breakdown(df, break_column, filter_out_small_n=False, min_n=10):
    """Compare each category's share in all rows with its share in cited rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``break_column``, ``'title'`` and
        ``'cited_by_policies_count'``.
    break_column : str
        Column whose values define the categories.
    filter_out_small_n : bool, default False
        If True, keep only categories with more than ``min_n`` rows overall.
    min_n : int, default 10
        Threshold used when ``filter_out_small_n`` is True.

    Returns
    -------
    pandas.DataFrame
        Indexed by category, ordered by descending ``all_n``, with columns
        ``all_n`` / ``all_share`` (non-null titles among all rows),
        ``cited_n`` / ``cited_share`` (same, restricted to rows with
        ``cited_by_policies_count > 0``) and ``share_diff_percent``
        (``cited_share - all_share`` in percentage points, 2 decimals).
    """
    df_cited = df[df['cited_by_policies_count'] > 0]

    out_df = (
        df.groupby(break_column)['title']
        .count()
        .sort_values(ascending=False)
        .to_frame('all_n')
    )
    out_df['all_share'] = out_df['all_n'] / out_df['all_n'].sum()

    cited = df_cited.groupby(break_column)['title'].count().to_frame('cited_n')
    # Left join keeps every category; those without cited rows get 0.
    out_df = out_df.join(cited).fillna(0)
    out_df['cited_share'] = out_df['cited_n'] / out_df['cited_n'].sum()

    # Series.round replaces np.round_, which was removed in NumPy 2.0.
    share_diff = out_df['cited_share'] - out_df['all_share']
    out_df['share_diff_percent'] = (share_diff * 100).round(2)

    if filter_out_small_n:
        out_df = out_df[out_df['all_n'] > min_n]
    return out_df

In [None]:
# Share comparison broken down by the "type" column.
create_breakdown(df, break_column="type")

In [None]:
# Share comparison by publisher, with small categories filtered out.
create_breakdown(
    df,
    break_column="publisher",
    filter_out_small_n=True,
)

In [None]:
import ast


def _as_field_list(value):
    """Return one "mag_field_of_study" cell as a list of field names.

    * list/tuple values are returned unchanged (as a list);
    * strings are parsed as a Python list literal when possible, otherwise
      kept as a single-element list;
    * anything else (e.g. NaN) becomes ``['unknown']``.

    The previous code replaced missing values with the *string*
    "['unknown']", which ``explode`` cannot split into a category.
    """
    if isinstance(value, (list, tuple)):
        return list(value)
    if isinstance(value, str):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return [value]
        return list(parsed) if isinstance(parsed, (list, tuple)) else [value]
    return ['unknown']


df["mag_field_of_study"] = df["mag_field_of_study"].map(_as_field_list)

# One row per (article, field) pair, then the usual share comparison.
create_breakdown(df.explode('mag_field_of_study'), 'mag_field_of_study', filter_out_small_n=True)

In [None]:
# Replace df with the pickled version of the table; columns added to the
# previous df by earlier cells are not carried over by this assignment.
# NOTE(review): read_pickle can execute arbitrary code - only load trusted files.
df = pd.read_pickle("data/df_all.pkl")
create_breakdown(
    df.explode("institutions"),
    break_column="institutions",
    filter_out_small_n=True,
)

In [None]:
# Institution breakdown ordered by the largest gain in share among cited articles.
(
    create_breakdown(
        df.explode("institutions"),
        break_column="institutions",
        filter_out_small_n=True,
    )
    .sort_values(by="share_diff_percent", ascending=False)
)

### Topic modeling

In [None]:
import pandas as pd
import plotly.express as px
import swifter
from tqdm import tqdm
import spacy
import joblib
from wordcloud import WordCloud
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore
import matplotlib.pyplot as plt
import pyLDAvis.gensim_models
import numpy as np
import warnings 
# Silence DeprecationWarning messages for the rest of the notebook.
warnings.filterwarnings("ignore", category=DeprecationWarning)

pyLDAvis.enable_notebook()

en = spacy.load('en_core_web_sm')

from spacy.tokens import DocBin
# The parsed documents are read from a serialized DocBin instead of being
# recomputed; the equivalent computation was
#   df['abstract'].swifter.progress_bar(True).apply(en)
doc_bin = DocBin().from_disk("data/df_all.spacy")
docs = list(doc_bin.get_docs(en.vocab))
# Assign the list directly.  Wrapping it in np.array() lets NumPy try to
# unpack every Doc as a nested sequence of tokens (Docs support len() and
# indexing), which is slow and can fail on documents of different lengths.
# NOTE(review): assumes the DocBin holds one Doc per row of df, in row order.
df['doc'] = docs



In [None]:
# Lemmas excluded from the topic-model vocabulary in addition to spaCy's own
# stop words (the lemma-extraction cell tests `t.lemma_ not in custom_stopwords`).
# Order and repeated entries are kept exactly as they were.
custom_stopwords = [
    "result", "feature", "base", "learning", "model", "datum",
    "method", "propose", "classification", "algorithm", "approach",
    "accuracy", "dataset",
    "different", "study", "paper", "predict", "high", "learn", "use",
    "task", "training",
    "present", "compare", "large", "technique", "high", "time", "set",
    "machine", "test",
    "problem", "apply", "new", "identify", "train", "develop", "good",
    "level", "image", "prediction", "classifier", "performance", "SVM",
    "system",
    "Support", "information", "Vector", "area", "achieve", "demonstrate",
    "show", "Random", "provide",
    "Machine", "sample", "class", "application", "obtain", "include",
    "process", "evaluate",
    "work", "well", "obtain", "non", "analysis", "Machine", "sample",
    "improve", "perform",
    "network", "number", "domain", "target", "label", "find", "novel",
    "low", "experiment", "tool", "value",
]

In [None]:
from collections import Counter

# Set copy of the stop-word list: membership is tested once per token.
_stopword_set = set(custom_stopwords)


def _content_lemmas(doc):
    """Return the lemmas of the tokens of ``doc`` that pass all filters.

    A token is kept when it is alphabetic, is not a spaCy stop word, is not
    punctuation, has ``ent_type == 0`` (no entity type assigned) and its
    lemma is not one of the custom stop words.
    """
    return [
        token.lemma_
        for token in doc
        if token.is_alpha
        and not token.is_stop
        and not token.is_punct
        and token.ent_type == 0
        and token.lemma_ not in _stopword_set
    ]


df['lemmas'] = df['doc'].apply(_content_lemmas)
dictionary = Dictionary(df['lemmas'])
encoded_docs = df['lemmas'].swifter.apply(dictionary.doc2bow)
# Count lemmas by iterating over them.  Series.sum() built one giant list by
# repeated concatenation (quadratic in corpus size) and returns 0 for an
# empty frame, which Counter cannot consume.
word_counts = Counter(lemma for lemmas in df['lemmas'] for lemma in lemmas)

In [None]:
# Word cloud sized by the lemma frequencies in word_counts.
wc = WordCloud(width=800, height=400).generate_from_frequencies(word_counts)
plt.figure(figsize=(10, 8))
plt.imshow(wc)

In [None]:
from gensim.models import CoherenceModel

# Fixed seed so the sweep is comparable between runs.  NOTE(review): with
# several worker processes LdaMulticore may still not be bit-for-bit
# reproducible - confirm if exact repeatability matters.
RANDOM_STATE = 42

# Candidate topic counts: 3, 5, 7, 9.
n_topics_options = range(3, 10, 2)

# One LDA model per candidate.  id2word ties the model to the same vocabulary
# that produced encoded_docs and that CoherenceModel receives below.
models = [
    LdaMulticore(
        encoded_docs,
        num_topics=topics_number,
        id2word=dictionary,
        random_state=RANDOM_STATE,
    )
    for topics_number in n_topics_options
]

# Coherence score of each model (CoherenceModel's default measure).
cvs = [
    CoherenceModel(model, texts=df['lemmas'], dictionary=dictionary).get_coherence()
    for model in models
]

px.line(x=list(n_topics_options), y=cvs)

In [None]:
# Final model: 7 topics, 10 passes over the corpus.  A fixed random_state
# makes the fit comparable between runs (NOTE(review): multi-worker training
# may still vary slightly), and id2word ties the model to the same vocabulary
# that is handed to pyLDAvis.
lda = LdaMulticore(
    encoded_docs,
    num_topics=7,
    passes=10,
    id2word=dictionary,
    random_state=42,
)
vis = pyLDAvis.gensim_models.prepare(lda, encoded_docs, dictionary=dictionary)
vis

In [None]:
# Write the prepared pyLDAvis visualisation to lda.html.
pyLDAvis.save_html(vis, 'lda.html')

### Bibliografia

[1] L. Bornmann, R. Haunschild, and W. Marx, ‘Policy documents as sources for measuring societal impact: how often is climate change research mentioned in policy-related documents?’, Scientometrics, vol. 109, no. 3, pp. 1477–1495, Dec. 2016, doi: 10.1007/s11192-016-2115-y.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=429117f5-ba6c-4f05-a14a-fe89a5192802' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>