In [1]:
import os
import json
import numpy as np
from datetime import datetime
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk import pos_tag
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [2]:
# Part-of-speech tags that `is_noun` accepts as nouns.
noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']
def is_noun(pos):
    """Return True if the tag `pos` appears in `noun_tags`, otherwise False."""
    return pos in noun_tags

In [3]:
def extract_nouns(text):
    """Return the lower-cased nouns found in `text` as one space-separated string.

    The text is split into sentences with `sent_tokenize`; each sentence is
    tokenized with `word_tokenize` and tagged with `pos_tag`. Tokens whose tag
    is accepted by `is_noun` are kept, in their original order.
    """
    kept = []
    for sent in sent_tokenize(text):
        tagged = pos_tag(word_tokenize(sent))
        kept.extend(token.lower() for (token, tag) in tagged if is_noun(tag))
    return ' '.join(kept)

In [4]:
def get_top_level_categories(categories):
    """Return the set of top-level prefixes of a whitespace-separated category string.

    Each whitespace-delimited entry is cut at its first '.', and the part
    before it is kept (the whole entry if it contains no '.').
    """
    return {entry.partition('.')[0] for entry in categories.split()}

In [5]:
# Training set: 2016
# Test set: 2017

def _load_year(filename):
    """Load one file from the 'data-by-year' directory.

    Each line of the file is parsed as a JSON object and its 'arXiv' entry is
    used. For every entry the title and abstract are concatenated and reduced
    to their nouns via `extract_nouns`, and the 'categories' string is reduced
    to its top-level categories via `get_top_level_categories`.

    Parameters:
        filename: name of the file inside 'data-by-year' (e.g. '2016.json').

    Returns:
        (texts, labels, seen_categories) where `texts` is a list of noun
        strings, `labels` is the parallel list of top-level category sets, and
        `seen_categories` is the union of all those sets.

    Raises:
        Whatever `open`, `json.loads` or the key lookups raise on a missing
        file, a malformed line, or an entry lacking an expected key.
    """
    texts = []
    labels = []
    seen_categories = set()
    # The encoding is explicit so non-ASCII titles/abstracts are decoded the
    # same way regardless of the platform's locale default.
    with open(os.path.join('data-by-year', filename), 'r', encoding='utf-8') as f:
        for line in f:
            row = json.loads(line)['arXiv']

            row_categories = get_top_level_categories(row['categories'])
            seen_categories.update(row_categories)

            texts.append(extract_nouns(row['title'] + '\n ' + row['abstract']))
            labels.append(row_categories)
    return texts, labels, seen_categories


train_X, train_Y, train_categories = _load_year('2016.json')
test_X, test_Y, test_categories = _load_year('2017.json')

# Every top-level category observed in either split.
categories = train_categories | test_categories

print(len(train_X))
print(len(test_X))
print(len(train_Y))
print(len(test_Y))

113436
123781
113436
123781


In [16]:
from collections import defaultdict

# category -> word -> total number of occurrences in that category's training texts
word_counts_per_category = {}
# category -> word -> number of training texts in that category containing the word
word_doc_counts_per_category = {}

for doc_text, doc_categories in zip(train_X, train_Y):
    tokens = doc_text.split()
    distinct_tokens = set(tokens)
    for category in doc_categories:
        # Create the per-category counters on first sight of the category.
        occurrence_counts = word_counts_per_category.setdefault(category, defaultdict(int))
        document_counts = word_doc_counts_per_category.setdefault(category, defaultdict(int))
        for token in tokens:
            occurrence_counts[token] += 1
        for token in distinct_tokens:
            document_counts[token] += 1

In [33]:
# Words that occur in the target category's training texts but in no other category.
target_category = 'physics'
# Reset both sets on every run: if the target category is absent, `pos` stays
# empty instead of raising NameError or silently reusing a value left in the
# kernel by an earlier execution.
pos = set()
neg = set()
for category, word_freq in word_doc_counts_per_category.items():
    if category == target_category:
        pos = set(word_freq.keys())
    else:
        # In-place update avoids rebuilding the growing set for every category.
        neg.update(word_freq.keys())
print(pos - neg)
# Raises KeyError if no training text carries the 'econ' category.
print(word_doc_counts_per_category['econ'])


{'time-interleaved', 'k^+\\rightarrow', 'db/m', 'high-harmonic', 'rekarks', 'non-air', 'eyring', 'servomotor', 'nonclustered', 'pafe', 'solvionic', '\\alpha_l\\', 'ze-rey', 'ta_s^', 'tremor-location', 'hir', 'spta', 'arxiv:1508.03219', 'microtomograph', 'factin', 'extensions/corrections', '1-hexene', 'meccanica', '-7p~^', 'croissant', 'nicr', 'oncological', '-88', 'm_0g', 'fene-cr', 'ci\\^ensa\\c', 'polyolefin', 'hadisst', 'catsw', 'kawata', 'cole\\c', 'experiments5,6', 'sub-critical', 'soundboard', 'photopolymerization', 'r-asp', 'm3.0', 'fast-burning', 'peak/s', 'contribucion', 'double-distribution-function', 'sphps', 'apple-ii', 'osh-pt', 'td-casscf', 'beron-vera', 'analizamos', 'esrrs', 's/\\rd', 'conceptos', 'q=7500', 's_z=\\pm', 'teslatransformatoren', 'beale', 'dielectric-like', 'edel', 'astrologer', '/ingaas', 'beading', 'multi-penning-trap', 'verdet', 'circular-plane-jet', 't-506', 'pi-pi', 'spectral-domain', 'vpt2', 'c\\hbar', 'rayleigh-scattering', 'micro-mesh', 'gluon-', 'c

In [19]:
# Display the per-category document-frequency table
# (category -> word -> number of training texts containing the word).
# The full repr is very large.
word_doc_counts_per_category

{'cs': defaultdict(int,
             {'people': 510,
              'average': 307,
              'digital': 122,
              'size': 1622,
              'quality': 1312,
              'creativity': 24,
              'seed': 64,
              'respect': 866,
              'matters': 30,
              'images': 1601,
              'approach': 5470,
              'reasons': 175,
              'approaches': 2005,
              'things': 218,
              'problems': 2804,
              'ones': 524,
              'substrate': 23,
              'puzzles': 16,
              'results': 7826,
              'games': 439,
              'work': 3815,
              'clearer': 9,
              'synaptic': 5,
              'findings': 438,
              'process': 2076,
              'output': 882,
              'dsns': 2,
              'chess': 7,
              'neural': 1182,
              'photographs': 28,
              'sequences': 575,
              'players': 245,
              'barter': 5,

In [20]:
# Display the current contents of the `pos` word set.
# NOTE(review): this cell's execution count (20) is lower than that of the cell
# that assigns `pos` from the 'physics' category (33), so the saved output may
# reflect an earlier value of `pos` — confirm by re-running the notebook in order.
pos

{'%',
 '\\cin',
 'abf',
 'ability',
 'accuracy',
 'achievability',
 'actions',
 'adaptation',
 'addition',
 'adversary',
 'aid',
 'alerts',
 'algebraic',
 'algorithm',
 'alice',
 'am',
 'analysis',
 'aperiodicity',
 'application',
 'applications',
 'approximate',
 'archiving',
 'array',
 'artificial',
 'atoms',
 'avoidance',
 'awgn',
 'beamformer',
 'beamformers',
 'beampattern',
 'behavior',
 'bit-error',
 'bits',
 'bob',
 'border',
 'bounds',
 'camera',
 'cameras',
 'cars',
 'case',
 'channel',
 'channels',
 'circle',
 'classes',
 'classification',
 'classifications',
 'closest',
 'codes',
 'collaborate',
 'communication',
 'component',
 'computation',
 'concept',
 'consideration',
 'constraint',
 'constructions',
 'convnets',
 'convolutional',
 'coordination',
 'correlates',
 'costs',
 'covariance',
 'covert',
 'cumbersome',
 'data',
 'dataset',
 'deep',
 'density',
 'detecting',
 'detection',
 'detector',
 'deviation',
 'diagonal',
 'distributed',
 'distribution',
 'dnns',
 'ecm',


In [35]:
# Training set: 2016
# Write one training text (its space-joined nouns) per line. The encoding is
# explicit so non-ASCII tokens are written the same way regardless of the
# platform's locale default.
with open('2016.txt', 'w', encoding='utf-8') as o:
    for text in train_X:
        o.write(text + '\n')