In [1]:
%run -i 'setup.py'

In [38]:
import sqlite3
import pandas as pd
# Create your connection.
cnx = sqlite3.connect('test.db')
#load database to Pandas Data frame 'df'
df = pd.read_sql_query("SELECT * FROM arxiv_cs_publications", cnx)

In [28]:
#Database Schema
#+---------------------+       
#|arxiv_cs_publications|       
#-----------------------       
#|publication_id(pk)   |       
#|title                |       
#|description          |       
#|subject              |       
#|version              |       
#|created_date         |       
#+---------------------+

#Article Link = "https://arxiv.org/abs/" + version
#Article PDF  = "https://arxiv.org/pdf/" + version

In [31]:
df.count()

publication_id    425
title             425
description       425
subject           425
version           425
created_date      425
dtype: int64

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425 entries, 0 to 424
Data columns (total 6 columns):
publication_id    425 non-null int64
title             425 non-null object
description       425 non-null object
subject           425 non-null object
version           425 non-null object
created_date      425 non-null object
dtypes: int64(1), object(5)
memory usage: 20.0+ KB


In [30]:
df.head()

Unnamed: 0,publication_id,title,description,subject,version,created_date
0,1,"Efficient energy, cost reduction, and QoS base...",Recent developments and widespread in wireless...,cs.NI,1903.09636v1,2019-03-26 18:32:52.456428
1,2,Understanding Childhood Vulnerability in The C...,Understanding the community conditions that be...,cs.CY,1903.09639v1,2019-03-26 18:32:52.480675
2,3,Jet grooming through reinforcement learning.,We introduce a novel implementation of a reinf...,hep-ph,1903.09644v1,2019-03-26 18:32:52.505521
3,4,Anti-Turing Machine.,The invention of CPU-centric computing paradig...,cs.OH,1903.09653v1,2019-03-26 18:32:52.526601
4,5,A Hybrid Approach to Persistent Coverage in St...,This paper considers the persistent coverage o...,cs.MA,1903.09658v1,2019-03-26 18:32:52.545617


In [122]:
#count the number of words in each publication description 
df['total_words'] = df['description'].str.split().str.len()

In [123]:
#Get the number of stop words per description 
#In natural language processing, useless words (data), are referred to as stop words. 
#Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”)
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stop = stopwords.words('english')

df['stopwords'] = df['description'].apply(lambda x: len([x for x in x.split() if x in stop]))

df['words_minus_stopwords'] = df["total_words"] - df["stopwords"]

df[['description','stopwords','total_words','words_minus_stopwords']].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,description,stopwords,total_words,words_minus_stopwords
0,Mobile robots that manipulate their environmen...,44,136,92
1,"Multi-task learning, as it is understood nowad...",65,180,115
2,Well established libraries typically have API ...,106,288,182
3,An equiangular tight frame (ETF) is a sequence...,83,213,130
4,We present a coarse-to-fine approach based sem...,45,158,113


In [124]:
#All words to lower case
df['description'] = df['description'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['description'].head()

0    mobile robots that manipulate their environmen...
1    multi-task learning, as it is understood nowad...
2    well established libraries typically have api ...
3    an equiangular tight frame (etf) is a sequence...
4    we present a coarse-to-fine approach based sem...
Name: description, dtype: object

In [125]:
#Remove punctuation
df['description'] = df['description'].str.replace('[^\w\s]','')
df['description'].head()

0    mobile robots that manipulate their environmen...
1    multitask learning as it is understood nowaday...
2    well established libraries typically have api ...
3    an equiangular tight frame etf is a sequence o...
4    we present a coarsetofine approach based semia...
Name: description, dtype: object

In [126]:
#Remove Stop words
df['description'] = df['description'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df['description'].head()


0    mobile robots manipulate environments require ...
1    multitask learning understood nowadays consist...
2    well established libraries typically api docum...
3    equiangular tight frame etf sequence unitnorm ...
4    present coarsetofine approach based semiautono...
Name: description, dtype: object

In [127]:
#Top Ten most Frequent Words
freq = pd.Series(' '.join(df['description']).split()).value_counts()[:10]
print(freq)
#Remove top ten most frequent words
freq = list(freq.index)
df['description'] = df['description'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
df['description'].head()

data           176
learning       160
model          144
paper          128
performance    125
proposed       112
using          110
show           107
network        106
problem        102
dtype: int64


0    mobile robots manipulate environments require ...
1    multitask understood nowadays consists one sin...
2    well established libraries typically api docum...
3    equiangular tight frame etf sequence unitnorm ...
4    present coarsetofine approach based semiautono...
Name: description, dtype: object

In [130]:
#Tokenize 
df['description'].str.split()

0      [mobile, robots, manipulate, environments, req...
1      [multitask, understood, nowadays, consists, on...
2      [well, established, libraries, typically, api,...
3      [equiangular, tight, frame, etf, sequence, uni...
4      [present, coarsetofine, approach, based, semia...
5      [image, understanding, relies, heavily, accura...
6      [consuming, news, social, media, becoming, inc...
7      [new, deep, based, dense, monocular, slam, met...
8      [polar, codes, gained, extensive, attention, p...
9      [propose, use, agent, based, models, abms, ins...
10     [structure, time, series, particular, cyclosta...
11     [propose, unified, framework, multiperson, pos...
12     [well, estimate, probability, classification, ...
13     [introduce, novel, observability, continuum, e...
14     [giving, provable, guarantees, neural, network...
15     [present, skelneton, 2019, challenge, deep, ge...
16     [consider, em, mixed, linear, regression, mlr,...
17     [work, tackles, fuzzy, j