In [1]:
import pandas as pd
import pickle
import numpy as np
import string
import operator
import re
import os
import sys
import codecs
from collections import Counter
import csv
import matplotlib.pylab as plt
from matplotlib.pyplot import figure
plt.rcParams.update({'font.size': 7})

import wordcloud
from wordcloud import WordCloud

import nltk
from nltk import ngrams
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import *
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import FreqDist
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

import gensim
from gensim.models import Word2Vec
from gensim import models, corpora
from gensim.models import TfidfModel

import sklearn
from sklearn.decomposition import PCA
from sklearn import feature_extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

In [2]:
def clean_up_text(text):
    tokens = split_text_to_tokens(text)
    tokens = clean_up_tokens(tokens)
    processed_text = " ".join(tokens)
    return processed_text

def split_text_to_tokens(text):
    return nltk.word_tokenize(text)

def clean_up_tokens(tokens):
    tokens = remove_punctuation_from_tokens(tokens)
    tokens = remove_non_alphabetic_tokens(tokens)
    tokens = set_tokens_to_lowercase(tokens)
    tokens = remove_stopwords_from_tokens(tokens)
    tokens = remove_small_words_from_tokens(tokens)
    tokens = lemmatize_tokens(tokens)
    tokens = remove_unimportant_words_from_tokens(tokens)
    return tokens

def remove_punctuation_from_tokens(tokens):
    translation_table = str.maketrans({key: None for key in string.punctuation})
    text_without_punctuations = []
    for each_token in tokens:
        text_without_punctuations.append(each_token.translate(translation_table))
    return text_without_punctuations

def remove_non_alphabetic_tokens(tokens):
    alphabetic_tokens = []
    for token in tokens:
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens

def set_tokens_to_lowercase(tokens):
    lowercase_tokens = []
    return [each_token.lower() for each_token in tokens]

def remove_stopwords_from_tokens(tokens):
    stop_words = set(stopwords.words("english"))
    return [each_token for each_token in tokens if each_token not in stop_words]

def remove_small_words_from_tokens(tokens):
    return [each_token for each_token in tokens if len(each_token) > 2]

def remove_unimportant_words_from_tokens(tokens):
    lemmatized_tokens = lemmatize_tokens(tokens)
    tokens_with_part_of_speech_tags = nltk.pos_tag(lemmatized_tokens)
    cleared_token_list = [each_token[0] for each_token in tokens_with_part_of_speech_tags if each_token[1] in ["JJ", "JJR", "JJS", "NN", "NNS", "NNP", "NNPS", "RB", "RBR", "RBS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]]
    # JJ (adjective), NN (noun), NNP (proper noun), RB (adverb), VB (verb) 
    return cleared_token_list

def lemmatize_tokens(tokens):
    wordnet_lemmatizer = WordNetLemmatizer()
    parts_of_speech = [wordnet.ADJ, wordnet.ADJ_SAT, wordnet.ADV, wordnet.NOUN, wordnet.VERB]
    lemmatized_tokens = tokens
    for each_part_of_speech in parts_of_speech:
        lemmatized_tokens = [wordnet_lemmatizer.lemmatize(each_token, pos=each_part_of_speech) for each_token in lemmatized_tokens]
    return lemmatized_tokens

def preprocess(pstr1):
     s=split_text_to_tokens(pstr1)
     s=remove_non_alphabetic_tokens(s)
     s=remove_punctuation_from_tokens(s)
     s=set_tokens_to_lowercase(s)
     return s

In [3]:
hep_df = pd.read_pickle("./Hep_Dataset.pkl")
hep_df.head()

Unnamed: 0,Text,Title,Abstract,Astrophysics,Experiment-HEP,Gravitation and Cosmology,Phenomenology-HEP,Theory-HEP
0,[Dark Matter and Gauge Coupling Unification in...,Dark Matter and Gauge Coupling Unification in ...,WIMP dark matter and gauge coupling unificatio...,0,0,0,1,0
1,"[(B-L) symmetry vs. neutrino seesaw, We comput...",(B-L) symmetry vs. neutrino seesaw,We compute the effective coupling of the Major...,0,0,0,1,0
2,[Effective matrix models for deconfinement in ...,Effective matrix models for deconfinement in S...,I present a simple matrix model for the deconf...,0,0,0,1,0
3,"[Electroweak Results from HERA, Neutral and ch...",Electroweak Results from HERA,Neutral and charged current deep inelastic ep ...,0,1,0,0,0
4,[Strange hadrons at intermediate and high tran...,Strange hadrons at intermediate and high trans...,The PHENIX experiment provides excellent capab...,0,1,0,0,0


In [4]:
hep_df.Abstract = hep_df.Abstract.apply(clean_up_text)
hep_df['Theory'] = hep_df['Gravitation and Cosmology'] | hep_df['Theory-HEP']
hep_df

Unnamed: 0,Text,Title,Abstract,Astrophysics,Experiment-HEP,Gravitation and Cosmology,Phenomenology-HEP,Theory-HEP,Theory
0,[Dark Matter and Gauge Coupling Unification in...,Dark Matter and Gauge Coupling Unification in ...,wimp dark matter gauge couple unification cons...,0,0,0,1,0,0
1,"[(B-L) symmetry vs. neutrino seesaw, We comput...",(B-L) symmetry vs. neutrino seesaw,compute effective couple majoron boson evaluat...,0,0,0,1,0,0
2,[Effective matrix models for deconfinement in ...,Effective matrix models for deconfinement in S...,present simple matrix model deconfined phase t...,0,0,0,1,0,0
3,"[Electroweak Results from HERA, Neutral and ch...",Electroweak Results from HERA,neutral charge current deep inelastic scatter ...,0,1,0,0,0,0
4,[Strange hadrons at intermediate and high tran...,Strange hadrons at intermediate and high trans...,phenix experiment provide excellent capability...,0,1,0,0,0,0
5,[Exploring cold nuclear matter effects in d + ...,Exploring cold nuclear matter effects in d + A...,protonnucleus collision use investigate cold n...,0,1,0,0,0,0
6,"[Gluonic Effects on g-2: Holographic View, We ...",Gluonic Effects on g-2: Holographic View,study gluonic effect gluon condensation effect...,0,0,0,1,1,1
7,[Perturbative QCD correlations in multi-parton...,Perturbative QCD correlations in multi-parton ...,examine role play double parton interaction pa...,0,0,0,1,0,0
8,"[Holographic entanglement plateaux, We conside...",Holographic entanglement plateaux,consider entanglement entropy holographic fiel...,0,0,1,0,1,1
9,[Rotating Away Proton Decay in Flipped Unifica...,Rotating Away Proton Decay in Flipped Unification,show simple extension fermion sector flip mode...,0,0,0,1,0,0


In [5]:
vectorizer = TfidfVectorizer()
vectorised = vectorizer.fit_transform(hep_df['Abstract'])
tdm = pd.DataFrame(vectorised.toarray(), columns = vectorizer.get_feature_names())

In [6]:
tdm.shape

(1000, 6288)

In [9]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(tdm[:990], hep_df.Theory[:990])



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [17]:
y_pred = list(clf.predict(tdm[990:]))
y_pred

[0, 1, 0, 1, 0, 1, 1, 0, 0, 0]

In [18]:
y_true = list(hep_df.Theory[990:])
y_true

[1, 1, 0, 1, 0, 1, 1, 0, 0, 0]

False