## Get abstract

In [1]:
#import packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import nltk
from sklearn.feature_extraction.text import CountVectorizer

# Clean up for LDA
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel

# Import project utils
import utils

In [2]:
# Load the papers table from disk, report its dimensions, and preview the first rows
papers_csv_path = '../data/papers.csv'
papers = pd.read_csv(papers_csv_path)
print('Size of data: ', papers.shape)
papers.head()

Size of data:  (7241, 7)


Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [10]:
# Count rows whose abstract is the "Abstract Missing" placeholder.
# Uses the vectorised Series.sum() rather than the Python builtin sum(),
# which would iterate the boolean Series element by element.
# NOTE(review): run this before the cleaning cell rebinds `papers`; afterwards
# the placeholder text has been replaced and the count is no longer the raw one.
(papers['abstract'] == "Abstract Missing").sum()

3317

## Data Cleaning
- TODO: consider removing LaTeX symbols and numbers from the text

In [3]:
# Replace the raw frame with the output of the project's preprocessing helper.
# NOTE(review): utils.preprocessing is defined outside this notebook; judging by
# the rows displayed before and after, it fills in the abstract column and
# flattens newlines in paper_text — confirm against utils.py.
# NOTE(review): rebinding `papers` makes this cell non-idempotent — re-running it
# feeds already-processed data back into the helper.
papers = utils.preprocessing(papers)
papers.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,An efficient method of self-organizing associa...,767 SELF-ORGANIZATION OF ASSOCIATIVE DATABASE...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,A single cell theory for the development of se...,683 A MEAN FIELD THEORY OF LAYER IV OF VISUAL...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,In modeling studies or memory based on neural...,394 STORING COVARIANCE BY THE ASSOCIATIVE LON...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,"If data collection is costly, there is much to...",Bayesian Query Construction for Neural Network...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Learning of continuous valued functions using ...,"Neural Network Ensembles, Cross Validation, an..."


In [6]:
# Positional (0-based) row numbers of papers whose abstract is still NaN
abstract_is_missing = papers['abstract'].isna()
missing_inx = np.flatnonzero(abstract_is_missing)
missing_inx

array([  80,   96,  102,  133,  163,  191,  220,  253,  284,  383,  644,
        673,  732,  765,  786,  808,  917,  946,  972, 1164, 1251, 1384,
       1428, 1443, 1448, 1528, 1537, 1644, 1649, 1683, 1760, 1915, 2202,
       2206, 2612, 2689, 3067, 3422, 3457, 3754, 4401, 4474, 4640, 5058,
       5325, 5412, 5722, 5814, 6013, 6107, 6499, 6763, 6853, 6952, 6953,
       6964, 6974, 6982, 7021, 7023, 7024, 7025, 7033, 7040, 7056, 7057,
       7066, 7074, 7102, 7105, 7123, 7141, 7146, 7178])

In [7]:
# Publication-year distribution of the papers that still lack an abstract.
# `missing_inx` holds positional row numbers (from np.where), so select with
# .iloc; plain [] indexing would treat them as index labels and is only
# correct while the frame keeps its default RangeIndex.
papers['year'].iloc[missing_inx].value_counts()

1993    15
1987    13
1988    12
1989     5
1995     5
2016     3
2002     2
1994     2
2017     2
2003     2
2013     2
2001     1
2006     1
1998     1
2007     1
2008     1
2010     1
1992     1
1991     1
1990     1
2015     1
2000     1
Name: year, dtype: int64

In [25]:
# Summarise how many abstracts the cleaning step recovered.
# Both the share still missing and the recovered count are derived from
# `missing_inx`, so they stay in sync if the cleaning step changes.
n_papers = papers.shape[0]
n_still_missing = len(missing_inx)

# Number of "Abstract Missing" placeholders counted on the raw data earlier in
# this notebook.
n_missing_raw = 3317
# NOTE(review): assumes every remaining NaN abstract was one of those
# placeholders — confirm against utils.preprocessing.
n_recovered = n_missing_raw - n_still_missing

print("After cleaning, only", round(n_still_missing/n_papers * 100, 2), "% of the data do not have abstract and from above, we can most of them are older NeuroIPS papers where the formatting of the paper does not contain keywords for us to identify the location of the abstract.")
print("We recovered", n_recovered, "abstracts out of", n_missing_raw, "articles.")

After cleaning, only 1.02 % of the data do not have abstract and from above, we can most of them are older NeuroIPS papers where the formatting of the paper does not contain keywords for us to identify the location of the abstract.
We recovered 3243 abstracts out of 3317 articles.
