In [13]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

#visualization packages
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib
%matplotlib inline
import seaborn as sns


# Data From NIPS importation

In [23]:
from pathlib import Path 

df = pd.read_json("nips/papers.json")
df.head()


Unnamed: 0,abstract,event_type,id,paper_text,pdf_name,title,year
0,Abstract Missing,,1,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...,1-self-organization-of-associative-database-an...,Self-Organization of Associative Database and ...,1987
1,Abstract Missing,,10,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...,10-a-mean-field-theory-of-layer-iv-of-visual-c...,A Mean Field Theory of Layer IV of Visual Cort...,1987
10,Abstract Missing,,1007,Learning To Play the Game of Chess\n\nSebastia...,1007-learning-to-play-the-game-of-chess.pdf,Learning to Play the Game of Chess,1994
100,Abstract Missing,,1089,Beating a Defender in Robotic Soccer:\nMemory-...,1089-beating-a-defender-in-robotic-soccer-memo...,Beating a Defender in Robotic Soccer: Memory-B...,1995
1000,Abstract Missing,,1914,A tighter bound for graphical models\n\nM.A.R....,1914-a-tighter-bound-for-graphical-models.pdf,A Tighter Bound for Graphical Models,2000


# Data From ACL importation

In [24]:
import pandas as pd

data = pd.read_csv('2014/paper_ids.txt', sep="\t", header=None)
data.columns = ['id','paper','year']
ids = data.id
from pathlib import Path
papers_ACL = []
for el in ids:
    name = Path('2014/papers_text/'+el+'.txt')
    if name.is_file():   
        with open(name, 'r') as myfile:
            data = myfile.read().replace('\n', '')
            papers_ACL.append([el,data])

# Cleaning function

In [12]:
def cleaning(paper):
    # split into words
    from nltk.tokenize import word_tokenize
    tokens = word_tokenize(paper)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    # stemming of words
    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    lematized = [wordnet_lemmatizer.lemmatize(word) for word in words]
    lematized2 = [wordnet_lemmatizer.lemmatize(word,'v') for word in words]
    # remove words of less than 3 characters
    for word in lematized2:
        if(len(word)<3):
            lematized2.remove(word)  
    return(lematized2)

# Papers from NIPS ready for TFIDF

In [15]:
papers_NIPS_cleaned=[]
papers_NIPS = df.paper_text
for paper in papers_NIPS:
    papers_NIPS_cleaned.append(cleaning(paper))

In [19]:
papers_cleaned=[]
for i in range(0,len(papers_NIPS_cleaned)):
    papers_cleaned.append(' '.join(papers_NIPS_cleaned[i]))

# Papers from ACL ready for TFIDF after joining papers_NIPS

In [22]:
papers_ACL_cleaned=[]
for i in range(0,len(papers_ACL)):
    papers_ACL_cleaned.append(cleaning(papers_ACL[i][1]))

In [23]:
len(papers_ACL_cleaned)

22460

In [25]:
len(papers_cleaned)

6560

In [26]:
for i in range(0,len(papers_ACL_cleaned)):
    papers_cleaned.append(' '.join(papers_ACL_cleaned[i]))

In [29]:
len(papers_cleaned)

29020

# Export and import cleaned papers

In [30]:
import pickle

with open('papers_cleaned.txt', 'wb') as fp:
    pickle.dump(papers_cleaned, fp)

In [15]:
with open ('papers_cleaned.txt', 'rb') as fp:
    papers_cleaned_load = pickle.load(fp)

# TFIDF and NMF

In [16]:
#set parameters 
n_features = 100 #features to use in the tifidf transformation 
n_topics = 8  #number of topics that will be displayed  ###RELEVANT PARAMETER 
n_top_words = 10 #number of words composing each topic 

In [17]:
#tfidf with n_features, removing stop words for English 
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=0.2,max_features=n_features,stop_words='english')
#fit tdidf with the paper_text
tfidf = tfidf_vectorizer.fit_transform(papers_cleaned_load)

In [18]:
#transforming the tfidf representation into NMF matrix descomposition with n_topics as parameter
nmf = NMF(n_components=n_topics, random_state=0,alpha=.1, l1_ratio=.5).fit(tfidf)
##result in a 8*100 matrix

In [19]:
##Funtion to retrieve the n_top_words from the transformation 
##(taken from https://www.kaggle.com/dschniertshauer)
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

# Topics
Topic 0: Information Extraction

Topic 1: Stochastic Methods

Topic 2: Parsing Techniques

Topic 3: Probabilistic Methods

Topic 4: Reinforcement Learning

Topic 5: Translation Techniques

Topic 6: Words Segmentation

Topic 7: Training Neural Networks

In [20]:
##Printing the topics 
print("Topics found via NMF:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Topics found via NMF:

Topic #0:
text document information language sentence semantic type corpus task al

Topic #1:
algorithm function learn set sample data problem value result point

Topic #2:
parse rule tree grammar sentence structure syntactic phrase semantic form

Topic #3:
model data train probability language figure state process al time

Topic #4:
feature train label data learn set task al tag test

Topic #5:
translation sentence machine language english phrase score target pair train

Topic #6:
word sense tag corpus lexical english cluster language table context

Topic #7:
network input learn weight train pattern figure time function state



In [21]:
#Finding relation between topics and papers
nmf_embedding = nmf.transform(tfidf)
nmf_embedding = (nmf_embedding - nmf_embedding.mean(axis=0))/nmf_embedding.std(axis=0)


In [25]:
##Getting the titles of the most related papers to the topics, 
##so the decision of the topic name, could be less subjective
top_idx = np.argsort(nmf_embedding,axis=0)[-5:]
##8 topics
count = 0
for idxs in top_idx.T: 
    print("\nTopic {}:".format(count))
    for idx in idxs:
        if(idx>6559):
            print(papers_ACL[idx-6560][0])
        else:
            print(df.iloc[idx]['title'])
    count += 1


Topic 0:
A00-1025
P95-1042
Q14-1012
J02-4005
W14-5210

Topic 1:
Stochastic Variance Reduction Methods for Saddle-Point Problems
Random Walk Approach to Regret Minimization
Stochastic Online AUC Maximization
Bayesian Optimization with Exponential Convergence
Fitted Q-iteration in continuous action-space MDPs

Topic 2:
P83-1018
N03-2011
I05-1013
J83-2002
E89-1003

Topic 3:
J88-3012
Pairwise Choice Markov Chains
Comparing Bayesian models for multisensory cue combination without mandatory integration
W02-0505
N03-2038

Topic 4:
E06-3008
Do Convnets Learn Correspondence?
A00-2028
W11-0313
N09-3010

Topic 5:
W02-0701
P09-4005
N04-4003
C14-1192
N10-1078

Topic 6:
C94-1049
W10-3606
W97-0120
W10-3212
C96-1035

Topic 7:
Learning in Networks of Nondeterministic Adaptive Logic Elements
Dynamically-Adaptive Winner-Take-All Networks
Skeletonization: A Technique for Trimming the Fat from a Network via Relevance Assessment
Learning long-term dependencies is not as difficult with NARX networks
Backpro

# Saving classification of papers

In [212]:
text_file=open("paperSource_paperID_paperTopic.csv","w")
text_file.write("%s,%s,%s\n"%("source","paper_id","topic_id"))
j=0
for i in range(len(papers_cleaned_load)):
    if(i<6560):
        text_file.write("NIPS,%s,%s \n"%(df.iloc[i].id,nmf_embedding.argmax(axis=1)[i]))
        #print("NIPS",",",df.iloc[i].id, ",",nmf_embedding.argmax(axis=1)[i])
    else:
        text_file.write("ALC,%s,%s \n"%(papers_ACL[j][0],nmf_embedding.argmax(axis=1)[i]))
        #print("ALC",",",papers_ACL[j][0],",",nmf_embedding.argmax(axis=1)[i])
        j+=1
text_file.close()    

In [213]:
papers_topics = pd.read_csv('paperSource_paperID_paperTopic.csv')
papers_authors_NIPS = pd.read_csv('Nips/paper_authors.csv')
papers_authors_ACL = pd.read_csv('2014/paper_author_affiliations.txt',sep='\t')
del papers_authors_ACL['affiliation id']

# Creation of a dictionary of authors and their topics

In [214]:
#paper_authors into a dict
papers_authors_NIPS_transform = papers_authors_NIPS.groupby('paper_id')['author_id'].apply(list)
papers_authors_ACL_transform = papers_authors_ACL.groupby('paper id')['author id'].apply(list)

In [246]:
authors_topics_NIPS_todos = {}
authors_topics_ACL_todos = {}
keyCountError = 0
for i in range(0,len(papers_topics)):
        if(papers_topics.iloc[i]['source']=="NIPS"):
            try:
                authors_paper = papers_authors_NIPS_transform[int(papers_topics.iloc[i]['paper_id'])]
                for el in authors_paper:
                    if(el in authors_topics_NIPS_todos):
                        authors_topics_NIPS_todos[el]+=[papers_topics.iloc[i]['topic_id']]
                    else:
                        authors_topics_NIPS_todos[el] = [papers_topics.iloc[i]['topic_id']]
            except KeyError:
                keyCountError+=1
        else:
            try:
                authors_paper = papers_authors_ACL_transform[papers_topics.iloc[i]['paper_id']]
                for el in authors_paper:
                    if(el in authors_topics_ACL_todos):
                        authors_topics_ACL_todos[el]+=[papers_topics.iloc[i]['topic_id']]
                    else:
                        authors_topics_ACL_todos[el] = [papers_topics.iloc[i]['topic_id']]
            except KeyError:
                keyCountError+=1

# Keep most common topic for each author

In [260]:
def most_common(lst):
    return max(set(lst), key=lst.count)

In [264]:
authors_topics_NIPS_mc = {}
for el in authors_topics_NIPS_todos:
    authors_topics_NIPS_mc[el] = most_common(authors_topics_NIPS_todos[el])
authors_topics_ACL_mc = {}
for el in authors_topics_ACL_todos:
    authors_topics_ACL_mc[el] = most_common(authors_topics_ACL_todos[el])

# Authors per topic per source

In [303]:
topics_authors_NIPS = {}
for i in range(0,n_topics):
    topics_authors_NIPS[i] = []
for el in authors_topics_NIPS_mc:
    topics_authors_NIPS[authors_topics_NIPS_mc[el]]+=[el]

In [305]:
topics_authors_ACL = {}
for i in range(0,n_topics):
    topics_authors_ACL[i] = []
for el in authors_topics_ACL_mc:
    topics_authors_ACL[authors_topics_ACL_mc[el]]+=[el]

# Export and import authors per topic per source

In [365]:
import pickle

with open('topics_authors_NIPS.txt', 'wb') as fp:
    pickle.dump(topics_authors_NIPS, fp)
    
with open('topics_authors_ACL.txt', 'wb') as fp:
    pickle.dump(topics_authors_ACL, fp)

In [3]:
import pickle
with open ('topics_authors_NIPS.txt', 'rb') as fp:
    topics_authors_NIPS = pickle.load(fp)
    
with open ('topics_authors_ACL.txt', 'rb') as fp:
    topics_authors_ACL = pickle.load(fp)

# Load names

In [4]:
authors_ACL = pd.read_csv('2014/author_ids.txt',sep='\t',header=None)
authors_ACL.columns = ['author_id','author_name']
authors_NIPS = pd.read_csv('Nips/authors.csv')
authors_NIPS.columns = ['author_id','author_name']

In [5]:
#authors_id into a dict
authors_ACL_dict = authors_ACL.groupby('author_id')['author_name'].apply(list)
authors_NIPS_dict = authors_NIPS.groupby('author_id')['author_name'].apply(list)

 # Given an author name from NIPS, return similar authors from ACL

In [6]:
def NIPS_to_ACL(author_name):
    for i in range(0,len(authors_NIPS)):
        if(authors_NIPS.iloc[i]['author_name']==author_name):
            author_id = authors_NIPS.iloc[i]['author_id']
    for i in range(0,n_topics):
        if(author_id in topics_authors_NIPS[i]):
            topic_id = i
            break
    ret = []
    for el in topics_authors_ACL[topic_id]:
        ret.append(authors_ACL_dict[el])
    return ret

In [9]:
NIPS_to_ACL('M. J. Anderson')[0]

['Volk,Martin']

 # Given an author name from ACL, return similar authors from NIPS

In [10]:
def ACL_to_NIPS(author_name):
    for i in range(0,len(authors_ACL)):
        if(authors_ACL.iloc[i]['author_name']==author_name):
            author_id = authors_ACL.iloc[i]['author_id']
    for i in range(0,n_topics):
        if(author_id in topics_authors_ACL[i]):
            topic_id = i
            break
    ret = []
    for el in topics_authors_NIPS[topic_id]:
        ret.append(authors_NIPS_dict[el])
    return ret

In [11]:
ACL_to_NIPS('Szpakowicz,Stan')[0]

['M. J. Anderson']