In [1]:
%run -i 'setup.py'

In [2]:
import sqlite3
import pandas as pd
# Create your connection.
cnx = sqlite3.connect('test.db')
#load database to Pandas Data frame 'df'
df = pd.read_sql_query("SELECT * FROM arxiv_cs_publications", cnx)

In [3]:
#Database Schema
#+---------------------+       
#|arxiv_cs_publications|       
#-----------------------       
#|publication_id(pk)   |       
#|title                |       
#|description          |       
#|subject              |       
#|version              |       
#|created_date         |       
#+---------------------+

#Article Link = "https://arxiv.org/abs/" + version
#Article PDF  = "https://arxiv.org/pdf/" + version

In [4]:
df.count()

entry_id               275
title                  275
description            275
subject                275
publication_id         275
publication_version    275
created_date           275
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275 entries, 0 to 274
Data columns (total 7 columns):
entry_id               275 non-null int64
title                  275 non-null object
description            275 non-null object
subject                275 non-null object
publication_id         275 non-null object
publication_version    275 non-null object
created_date           275 non-null object
dtypes: int64(1), object(6)
memory usage: 15.1+ KB


In [6]:
df.head()

Unnamed: 0,entry_id,title,description,subject,publication_id,publication_version,created_date
0,1,Efficient Estimation of Heat Kernel PageRank f...,"Given an undirected graph G and a seed node s,...",cs.SI,1904.02707,1,2019-04-08 18:43:47.575287
1,2,Bounties in Open Source Development on GitHub:...,Due to the voluntary nature of open source sof...,cs.SE,1904.02724,1,2019-04-08 18:43:47.601873
2,3,Towards Specifying Symbolic Computation.,Many interesting and useful symbolic computati...,cs.LO,1904.02729,1,2019-04-08 18:43:47.621823
3,4,Neural Models of the Psychosemantics of `Most'.,How are the meanings of linguistic expressions...,cs.CL,1904.02734,1,2019-04-08 18:43:47.652527
4,5,On Topological and Metrical Properties of Stab...,"In this paper, we discuss various topological ...",cs.SY,1904.02737,1,2019-04-08 18:43:47.670984


In [9]:
#Check for duplicate versions of a publication
df_checkV = pd.read_sql_query("""SELECT count (publication_id),publication_id,title, publication_version
                              FROM arxiv_cs_publications
                              group by publication_id
                              having count (publication_id)	 > 1""", cnx)

In [10]:
df_checkV.head()

Unnamed: 0,count (publication_id),publication_id,title,publication_version


In [116]:
#count the number of words in each publication description 
df['total_words'] = df['description'].str.split().str.len()

In [32]:
#Get the number of stop words per description 
#In natural language processing, useless words (data), are referred to as stop words. 
#Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”)
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stop = stopwords.words('english')

df['stopwords'] = df['description'].apply(lambda x: len([x for x in x.split() if x in stop]))

df['words_minus_stopwords'] = df["total_words"] - df["stopwords"]

df[['description','stopwords','total_words','words_minus_stopwords']].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,description,stopwords,total_words,words_minus_stopwords
0,"Given an undirected graph G and a seed node s,...",83,239,156
1,Due to the voluntary nature of open source sof...,108,249,141
2,Many interesting and useful symbolic computati...,27,70,43
3,How are the meanings of linguistic expressions...,48,146,98
4,"In this paper, we discuss various topological ...",34,91,57


In [124]:
#All words to lower case
df['description'] = df['description'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['description'].head()

0    mobile robots that manipulate their environmen...
1    multi-task learning, as it is understood nowad...
2    well established libraries typically have api ...
3    an equiangular tight frame (etf) is a sequence...
4    we present a coarse-to-fine approach based sem...
Name: description, dtype: object

In [125]:
#Remove punctuation
df['description'] = df['description'].str.replace('[^\w\s]','')
df['description'].head()

0    mobile robots that manipulate their environmen...
1    multitask learning as it is understood nowaday...
2    well established libraries typically have api ...
3    an equiangular tight frame etf is a sequence o...
4    we present a coarsetofine approach based semia...
Name: description, dtype: object

In [126]:
#Remove Stop words
df['description'] = df['description'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df['description'].head()


0    mobile robots manipulate environments require ...
1    multitask learning understood nowadays consist...
2    well established libraries typically api docum...
3    equiangular tight frame etf sequence unitnorm ...
4    present coarsetofine approach based semiautono...
Name: description, dtype: object

In [127]:
#Top Ten most Frequent Words
freq = pd.Series(' '.join(df['description']).split()).value_counts()[:10]
print(freq)
#Remove top ten most frequent words
freq = list(freq.index)
df['description'] = df['description'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
df['description'].head()

data           176
learning       160
model          144
paper          128
performance    125
proposed       112
using          110
show           107
network        106
problem        102
dtype: int64


0    mobile robots manipulate environments require ...
1    multitask understood nowadays consists one sin...
2    well established libraries typically api docum...
3    equiangular tight frame etf sequence unitnorm ...
4    present coarsetofine approach based semiautono...
Name: description, dtype: object

In [130]:
#Tokenize 
df['description'].str.split()

0      [mobile, robots, manipulate, environments, req...
1      [multitask, understood, nowadays, consists, on...
2      [well, established, libraries, typically, api,...
3      [equiangular, tight, frame, etf, sequence, uni...
4      [present, coarsetofine, approach, based, semia...
5      [image, understanding, relies, heavily, accura...
6      [consuming, news, social, media, becoming, inc...
7      [new, deep, based, dense, monocular, slam, met...
8      [polar, codes, gained, extensive, attention, p...
9      [propose, use, agent, based, models, abms, ins...
10     [structure, time, series, particular, cyclosta...
11     [propose, unified, framework, multiperson, pos...
12     [well, estimate, probability, classification, ...
13     [introduce, novel, observability, continuum, e...
14     [giving, provable, guarantees, neural, network...
15     [present, skelneton, 2019, challenge, deep, ge...
16     [consider, em, mixed, linear, regression, mlr,...
17     [work, tackles, fuzzy, j