# Import libraries

In [27]:
import gzip
import json

import pandas as pd
#text preprocessing
import re
import spacy
import numpy as np

import nltk
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('stopwords')
import string
#vectorizers+lda
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load data

In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
with gzip.open("/content/drive/MyDrive/TFM/papers-with-abstracts.json.gz", "r") as f:
   data = f.read()
   j = json.loads (data.decode('utf-8'))

df = pd.DataFrame(j)

In [21]:
print(df.columns.values)
#df.drop(columns=['paper_url','arxiv_id', 'url_abs', 'url_pdf',
#                 'proceeding', 'authors', 'tasks', 'date'])

['paper_url' 'arxiv_id' 'title' 'abstract' 'url_abs' 'url_pdf'
 'proceeding' 'authors' 'tasks' 'date' 'methods']


# Preprocess

In [22]:
#Take abstracts
df.dropna(axis=0, subset=['abstract'],inplace=True)
papers = df['abstract'].sample(100)
print(papers.head())

54770     Multi-omic data provides multiple views of the...
200100    The bulk of computational approaches for model...
207576    Deep learning has achieved remarkable success ...
83157                                                      
83725                                                      
Name: abstract, dtype: object


In [23]:
# Convert each article to all lower case
papers = [abstract.lower() for abstract in papers]
# Strip all punctuation from each article
table = str.maketrans('', '', string.punctuation)
papers = [abstract.translate(table) for abstract in papers]
#Remove new line characters 
papers = [abstract.rstrip() for abstract in papers]
#Remove stop words 
stop = set(stopwords.words('english'))
papers = [[word for word in abstract.split() if word not in stop] for abstract in papers]
#Lemmatization
papers = [" ".join([lemma.lemmatize(word) for word in abstract]) for abstract in papers]

# Print the first article as a running example
print(papers[0])

multiomic data provides multiple view patient integrative analysis multiomic data crucial elucidate molecular underpinning disease etiology however multiomic data big p small n problem number feature large number sample small challenging train complicated machine learning model multiomic data alone make generalize well propose framework termed multiview factorization autoencoder network constraint integrate multiomic data domain knowledge biological interaction network framework employ deep representation learning learn feature embeddings patient embeddings simultaneously enabling u integrate feature interaction network patient view similarity network constraint training objective whole framework endtoend differentiable applied approach tcga pancancer dataset achieved satisfactory result predict disease progressionfree interval pfi patient overall survival o event code made publicly available


In [24]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

# LDA


In [25]:
#LDA
#n_components = number topics
n_components = 10
n_features = 1000
n_top_words = 20
#max_df = ignore terms with frequency higher
#min_df = ignore terms with frequency lower
#max_features = will only consider the given features ordered by term frequency across the entire corpus???
#ngram_range =  a tuple which represents the lower and upper boundary for n-gram extractions
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, ngram_range=(1, 2))
tf = tf_vectorizer.fit_transform(papers)
# Creating vocabulary array which will represent all the corpus 
tf_feature_names = tf_vectorizer.get_feature_names_out()

lda = LatentDirichletAllocation(n_components=n_components, random_state=1).fit(tf) #maxiter?
print(lda.perplexity)
print(lda.score)

display_topics(lda, tf_feature_names, n_top_words)

<bound method LatentDirichletAllocation.perplexity of LatentDirichletAllocation(random_state=1)>
<bound method LatentDirichletAllocation.score of LatentDirichletAllocation(random_state=1)>
Topic 0:
model learning result datasets data feature using bias method object sample edge specifically paper proposed research population training dataset sensor
Topic 1:
algorithm learning data proposed topology model using new medical network dynamic structure adaptation performance image paper result complexity distribution system
Topic 2:
method algorithm data problem explanation complex distribution optimization point unit application model present learning local complete also graph two proposed
Topic 3:
phase model pattern datasets text image design language measurement algorithm well reconstruction dataset data experimental training task result use present
Topic 4:
task method paper project challenge data tracking result learning system simulated swarm proposed present technique array dl softw

In [28]:
methods = pd.DataFrame(df['methods'])
methods = methods.methods.apply(lambda y: np.nan if len(y)==0 else y)
methods.dropna(axis=0, inplace=True)
methods

1         [{'name': 'SVM', 'full_name': 'Support Vector ...
8         [{'name': '3D Convolution', 'full_name': '3D C...
14        [{'name': 'Convolution', 'full_name': 'Convolu...
15        [{'name': 'Average Pooling', 'full_name': 'Ave...
16        [{'name': 'Affine Coupling', 'full_name': 'Aff...
                                ...                        
277424    [{'name': 'Causal Inference', 'full_name': 'Ca...
277427    [{'name': 'Dropout', 'full_name': 'Dropout', '...
277430    [{'name': 'Convolution', 'full_name': 'Convolu...
277437    [{'name': 'Adam', 'full_name': 'Adam', 'descri...
277442    [{'name': 'Gaussian Process', 'full_name': 'Ga...
Name: methods, Length: 46205, dtype: object

In [29]:
tasks = np.unique(df['tasks']).tolist()
tasks = [task for task in tasks if bool(task)]
def unique_values_in_list_of_lists(lst):
    result = set(x for l in lst for x in l)
    return list(result)
unique_values_in_list_of_lists(tasks)

['',
 'Multiple Affordance Detection',
 'Image Similarity Detection',
 'Serial Style Transfer',
 'Spacecraft Pose Estimation',
 'Fraud Detection',
 'Setting-1/4',
 'NetHack',
 'Image Defocus Deblurring',
 'Semantic Text Matching',
 'Spike Sorting',
 'Optic Cup Segmentation',
 'Fast Vehicle Detection',
 'Multi-Frame Super-Resolution',
 'Constrained Diffeomorphic Image Registration',
 'Multi-Document Summarization',
 'Multi-Grained Named Entity Recognition',
 'Learning Representation Of Multi-View Data',
 'Dimensionality Reduction',
 'Twitter Sentiment Analysis',
 'Semi-supervised Anomaly Detection',
 'Sentence Summarization',
 '3D Object Recognition',
 'Weakly-supervised panoptic segmentation',
 'UCCA Parsing',
 'Entity Embeddings',
 'Safety Perception Recognition',
 'LOG PARSING',
 'Point Cloud Super Resolution',
 'Multi-Choice MRC',
 'Conversational Response Generation',
 'Referring Expression Segmentation',
 'Transparent Object Detection',
 'Robot Navigation',
 'Overlapped 19-1',
 'P