In [None]:
# Importing of libraries
import pandas as pd
import json
from datetime import datetime as dt
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
from textblob.en.np_extractors import FastNPExtractor
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import csv
import pickle
from wordcloud import WordCloud

# List all files in the input directory
import os
# Walk the Kaggle input directory and print the full path of every file found
input_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Overview of the data
import pandas as pd

# Read only the first record of the line-delimited JSON snapshot as a preview
data = pd.read_json(
    path_or_buf='../input/arxiv/arxiv-metadata-oai-snapshot.json',
    lines=True,
    nrows=1,
)
data.head()

In [None]:
# Inspect the 'versions' column of the preview row (v1 = publication date)
print(data.loc[:, 'versions'])

In [None]:
# Load the complete dataset, keeping only the attributes needed for the analysis
def _parse_created(created):
    """Return the calendar date contained in a version's ``created`` string.

    Characters 5..15 of the string are expected to hold "<day> <month abbr> <year>"
    separated by single spaces.
    NOTE(review): assumes values shaped like 'Mon, 2 Apr 2007 19:18:42 GMT' -
    confirm against the dataset.

    Raises:
        ValueError: if the sliced text does not match '%d:%b:%Y'.
    """
    day_month_year = created[5:16].split(" ")[0:3]
    return dt.strptime(":".join(day_month_year), '%d:%b:%Y').date()


data = {'title': [], 'categories': [], 'abstract': [], 'publication_date': []}

with open('../input/arxiv/arxiv-metadata-oai-snapshot.json', encoding='utf-8') as f:
    for line in f:
        # Decode each JSON line exactly once instead of once per attribute
        paper = json.loads(line)
        data['title'].append(paper['title'])
        data['categories'].append(paper['categories'])
        data['abstract'].append(paper['abstract'])
        # The first entry of 'versions' is v1, i.e. the original publication
        data['publication_date'].append(_parse_created(paper['versions'][0]['created']))

# Conversion of the dictionary into a dataframe
df = pd.DataFrame(data)

# Check for null values in the entire dataframe
df.isnull().values.any()

In [None]:
# Show the dimensions of the dataframe as a (rows, columns) tuple;
# the first value is the number of entries
df.shape

In [None]:
# Regular expression used to recognise each scientific field in the 'categories' column.
# NOTE(review): '.' is an unescaped regex metacharacter here and matches any character;
# the patterns are kept verbatim so the selection stays identical to the other cells.
field_patterns = {
    'Computer Science': 'cs.[A-Z]',
    'Economics': 'econ.',
    'Engineering': 'eess.',
    'Math': 'math.[A-Z]',
    'Physics': 'astro-|cond-|gr-qc|hep-|math-ph|nlin|nucl-|physics.|quant-',
    'Quant-Biology': 'q-bio.',
    'Quant-Finance': 'q-fin.',
    'Statistics': 'stat.[A-Z]',
}

# Create a dictionary with the number of publications per category.
# Summing the boolean match mask counts the rows without materialising a filtered frame.
number_of_pub = {
    field: int(df['categories'].str.contains(pat=pattern, regex=True).sum())
    for field, pattern in field_patterns.items()
}

# Bar graph to display the number of publications per category
fields = list(number_of_pub.keys())
values = list(number_of_pub.values())

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(fields, values, color='maroon', width=0.4)

# Plain style keeps the large counts on the y-axis out of scientific notation
ax.ticklabel_format(axis='y', style='plain')
ax.set_xlabel("Scientific Fields")
ax.set_ylabel("Number of Papers")
fig.savefig('pub_per_cat.pdf')
plt.show()

In [None]:
# Build one publication time series per scientific field and plot them on a 4x2 grid
def publication_trend(pattern, start_after=dt(1999, 12, 31).date(), window=120):
    """Return the smoothed number of publications per day for one field.

    Args:
        pattern: regular expression matched against the 'categories' column of ``df``.
        start_after: only publications dated strictly after this date are kept.
        window: number of consecutive rows averaged by the rolling mean. Each row is
            one distinct publication date, so 120 means 120 dates that have at least
            one publication, not a fixed three-month calendar span.

    Returns:
        A Series indexed by publication date; the first ``window - 1`` values are NaN.
    """
    in_field = df['categories'].str.contains(pat=pattern, regex=True)
    recent = df['publication_date'] > start_after
    per_day = df.loc[in_field & recent].groupby("publication_date")["title"].count()
    return per_day.rolling(window=window).mean()


# Tick positions: every second year from 2001 for most fields
biennial_ticks = [dt(year, 1, 1) for year in range(2001, 2022, 2)]
# Engineering and Economy get yearly ticks from 2016 separately
annual_ticks = [dt(year, 1, 1) for year in range(2016, 2022, 1)]

# (panel title, category pattern, x-ticks), listed row by row of the subplot grid.
# NOTE(review): '.' in the patterns is unescaped and matches any character; the
# patterns are kept verbatim so the selection stays identical to the other cells.
panels = [
    ('Computer Science', 'cs.[A-Z]', biennial_ticks),
    ('Physics', 'astro-|cond-|gr-qc|hep-|math-ph|nlin|nucl-|physics.|quant-', biennial_ticks),
    ('Mathematics', 'math.[A-Z]', biennial_ticks),
    ('Statistics', 'stat.[A-Z]', biennial_ticks),
    ('Economy', 'econ.', annual_ticks),
    ('Electrical Engineering and Systems Science', 'eess.', annual_ticks),
    ('Quantitative Biology', 'q-bio.', biennial_ticks),
    ('Quantitative Finance', 'q-fin.', biennial_ticks),
]

fig, axs = plt.subplots(4, 2, figsize=(15, 15))
date_form = DateFormatter("%Y")

for ax, (panel_title, pattern, ticks) in zip(axs.flat, panels):
    # Formatter and ticks are configured before plotting so that the view limits
    # are still autoscaled to the plotted data afterwards
    ax.xaxis.set_major_formatter(date_form)
    ax.set_xticks(ticks)
    ax.plot(publication_trend(pattern), color='maroon')
    ax.set_title(panel_title)

fig.savefig('pub_trend.pdf')
plt.show()

In [None]:
# Create a Computer Science and Physics DataFrame for Topic Modelling
# (publications dated after 2019-12-31 only).
# NOTE(review): '.' in the patterns is unescaped and matches any character; the patterns
# are kept verbatim so the selected rows stay aligned with the precomputed noun files.
topic_columns = ['abstract', 'categories', 'publication_date']
published_after_2019 = df['publication_date'] > dt(2019, 12, 31).date()

# A single .loc selection followed by .copy() yields independent frames, so adding
# columns to them later does not trigger pandas' SettingWithCopyWarning
is_cs = df['categories'].str.contains(pat='cs.[A-Z]', regex=True)
cs = df.loc[is_cs & published_after_2019, topic_columns].copy()

is_physics = df['categories'].str.contains(pat=('astro-|cond-|gr-qc|hep-|math-ph|nlin|nucl-|physics.|quant-'), regex=True)
physics = df.loc[is_physics & published_after_2019, topic_columns].copy()

In [None]:
# Extract the nouns from the abstracts, remove entities with special characters and convert them to lowercase
def process_data(data):
    """Turn each abstract into a space-joined string of lowercased noun phrases.

    Args:
        data: iterable of abstract texts.

    Returns:
        A list with one string per abstract. Noun phrases found by FastNPExtractor
        are lowercased and joined with spaces; any phrase containing one of the
        special characters is dropped.
    """
    # All entries are single characters, so a set-disjointness test on the
    # phrase is equivalent to checking each character for substring membership
    forbidden = {'\\', '\'', '\"', '}', '{', '§', '$', '%', '/', '?', '[', ']', '(', ')', '.', '^'}
    np_extractor = FastNPExtractor()

    # First pass: extract the noun phrases of every abstract
    phrases_per_abstract = []
    for text in data:
        phrases_per_abstract.append(np_extractor.extract(text))

    # Second pass: normalise, filter and join
    processed = []
    for phrases in phrases_per_abstract:
        kept = [phrase.lower() for phrase in phrases if forbidden.isdisjoint(phrase)]
        processed.append(" ".join(kept))
    return processed

#cs['nouns'] = process_data(cs['abstract'])
#physics['nouns'] = process_data(physics['abstract'])

# Dump the generated features as a csv file
#cs['nouns'].to_csv('cs_nouns.csv')
#physics['nouns'].to_csv('physics_nouns.csv') 

In [None]:
# Alternatively load the already generated DataFrame from a csv file
def read_nouns(filename):
    """Read the second column of a CSV file, skipping its header row.

    Args:
        filename: path of a comma-delimited CSV file whose first line is a header
            and whose rows have at least two columns.

    Returns:
        A list with the value of the second column of every non-empty data row,
        in file order. An empty or header-only file yields an empty list.

    Raises:
        IndexError: if a non-empty data row has fewer than two columns.
    """
    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires for files passed to csv.reader
    with open(filename, newline='') as file:
        csv_reader = csv.reader(file, delimiter=',')
        # Skip the header line (the default avoids StopIteration on an empty file)
        next(csv_reader, None)
        # csv.reader yields [] for blank lines; those carry no data and are skipped
        return [row[1] for row in csv_reader if row]
            
# Attach the precomputed noun strings to the matching DataFrame
for frame, nouns_file in ((cs, '../input/dlbdsmlusl01/cs_nouns.csv'),
                          (physics, '../input/dlbdsmlusl01/physics_nouns.csv')):
    frame['nouns'] = read_nouns(nouns_file)

In [None]:
# Vectorize the data with the TF-IDF method
def vectorize_data(data):
    """Vectorize documents with un-normalised TF-IDF weights.

    Args:
        data: iterable of document strings.

    Returns:
        A tuple ``(matrix, feature_names)`` with the fitted TF-IDF document-term
        matrix and the list of feature names, indexed by matrix column.
    """
    # norm=None disables row normalisation. The former norm=False only worked
    # because it was falsy and is rejected by scikit-learn's parameter validation.
    vectorizer = TfidfVectorizer(lowercase=False, norm=None, stop_words='english')
    matrix = vectorizer.fit_transform(data)
    # get_feature_names() was removed from scikit-learn in favour of
    # get_feature_names_out(); fall back to the old name on old installations
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    return matrix, feature_names

# TF-IDF matrix and vocabulary for each field
cs_vec, cs_feature_names = vectorize_data(data=cs['nouns'])
physics_vec, physics_feature_names = vectorize_data(data=physics['nouns'])

In [None]:
# Use LDA for topic modelling
def topic_modelling(data, n_components=20, random_state=42):
    """Fit a Latent Dirichlet Allocation topic model.

    Args:
        data: document-term matrix to fit the model on.
        n_components: number of topics to learn (default 20, as before).
        random_state: seed passed to the estimator so that repeated runs give
            the same topics; pass None for an unseeded fit.

    Returns:
        The fitted LatentDirichletAllocation instance.
    """
    lda = LatentDirichletAllocation(n_components=n_components, random_state=random_state)
    lda.fit(data)
    return lda

#cs_lda = topic_modelling(cs_vec)
#physics_lda = topic_modelling(physics_vec)

# Save the models
#pickle.dump(cs_lda, open('cs_lda.pk', 'wb'))
#pickle.dump(physics_lda, open('physics_lda.pk', 'wb'))

In [None]:
# Alternatively load the saved models
# NOTE(review): pickle.load can execute arbitrary code contained in the file,
# so only load model files from a trusted source.
# The context managers make sure the file handles are closed after loading.
with open('../input/dlbdsmlusl01/cs_lda.pk', 'rb') as model_file:
    cs_lda = pickle.load(model_file)
with open('../input/dlbdsmlusl01/physics_lda.pk', 'rb') as model_file:
    physics_lda = pickle.load(model_file)

In [None]:
# Plot the top words of each topic
def plot_top_words(model, feature_names, n_top_words, title, filename):
    """Plot the highest-weighted words of every topic as horizontal bar charts.

    Args:
        model: fitted topic model exposing ``components_`` (one weight row per topic).
        feature_names: sequence mapping a column index of ``components_`` to its word.
        n_top_words: number of words shown per topic.
        title: overall figure title.
        filename: path the figure is saved to.
    """
    n_cols = 4
    n_topics = len(model.components_)
    # Ceiling division: enough rows for every topic (20 topics -> 5 rows as before),
    # so models with more than 20 topics no longer raise an IndexError
    n_rows = max(1, -(-n_topics // n_cols))
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows),
                             sharex=True, squeeze=False)
    axes = axes.flatten()
    # The figure title does not depend on the topic, so it is set once
    fig.suptitle(title, fontsize=40)

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending; the reversed slice picks the n_top_words largest
        # weights in descending order
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7, color ='maroon')
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 30})
        # Put the strongest word at the top of each panel
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for side in ('top', 'right', 'left'):
            ax.spines[side].set_visible(False)

    fig.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    fig.savefig(filename)
    plt.show()
    
plot_top_words(
    model=cs_lda,
    feature_names=cs_feature_names,
    n_top_words=5,
    title='Computer Science Topics',
    filename='cs_topics.pdf',
)

In [None]:
plot_top_words(
    model=physics_lda,
    feature_names=physics_feature_names,
    n_top_words=5,
    title='Physics Topics',
    filename='physics_topics.pdf',
)

In [None]:
# Create a Wordcloud with the top categories
def create_wordcloud(data, filename):
    """Render a word cloud of the given strings, save it and display it.

    Args:
        data: iterable of strings; they are joined with spaces into one text.
        filename: path the word cloud image is written to.
    """
    joined_text = " ".join(data)
    generator = WordCloud(max_font_size=70, max_words=6, background_color="azure")
    cloud = generator.generate(joined_text)
    cloud.to_file(filename)

    # Show the image on the current axes without axis decorations
    current_axes = plt.gca()
    current_axes.imshow(cloud, interpolation="bilinear")
    current_axes.axis('off')
    plt.show()
    
create_wordcloud(data=cs['categories'], filename='cs_wordcloud.png')

In [None]:
create_wordcloud(data=physics['categories'], filename='physics_wordcloud.png')