In [4]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

#visualization packages
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib
%matplotlib inline
import seaborn as sns


# Data From NIPS importation

In [11]:
from pathlib import Path 

df = pd.read_json("nips/papers.json")
df.head()


Unnamed: 0,abstract,event_type,id,paper_text,pdf_name,title,year
0,Abstract Missing,,1,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...,1-self-organization-of-associative-database-an...,Self-Organization of Associative Database and ...,1987
1,Abstract Missing,,10,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...,10-a-mean-field-theory-of-layer-iv-of-visual-c...,A Mean Field Theory of Layer IV of Visual Cort...,1987
10,Abstract Missing,,1007,Learning To Play the Game of Chess\n\nSebastia...,1007-learning-to-play-the-game-of-chess.pdf,Learning to Play the Game of Chess,1994
100,Abstract Missing,,1089,Beating a Defender in Robotic Soccer:\nMemory-...,1089-beating-a-defender-in-robotic-soccer-memo...,Beating a Defender in Robotic Soccer: Memory-B...,1995
1000,Abstract Missing,,1914,A tighter bound for graphical models\n\nM.A.R....,1914-a-tighter-bound-for-graphical-models.pdf,A Tighter Bound for Graphical Models,2000


# Data From ACL importation

In [6]:
import pandas as pd

data = pd.read_csv('2014/paper_ids.txt', sep="\t", header=None)
data.columns = ['id','paper','year']
ids = data.id
from pathlib import Path
papers_ACL = []
for el in ids:
    name = Path('2014/papers_text/'+el+'.txt')
    if name.is_file():   
        with open(name, 'r') as myfile:
            data = myfile.read().replace('\n', '')
            papers_ACL.append([el,data])
papers_ACL[0]

['W09-2307',
 'Proceedings of SSST-3, Third Workshop on Syntax and Structure in Statistical Translation, pages 51?59,Boulder, Colorado, June 2009. c?2009 Association for Computational LinguisticsDiscriminative Reordering with Chinese Grammatical Relations FeaturesPi-Chuan Changa, Huihsin Tsengb, Dan Jurafskya, and Christopher D. ManningaaComputer Science Department, Stanford University, Stanford, CA 94305bYahoo! Inc., Santa Clara, CA 95054{pichuan,jurafsky,manning}@stanford.edu, huihui@yahoo-inc.comAbstractThe prevalence in Chinese of grammaticalstructures that translate into English in dif-ferent word orders is an important cause oftranslation difficulty. While previous work hasused phrase-structure parses to deal with suchordering problems, we introduce a richer set ofChinese grammatical relations that describesmore semantically abstract relations betweenwords. Using these Chinese grammatical re-lations, we improve a phrase orientation clas-sifier (introduced by Zens and Ney (2006))t

# Cleaning function

In [12]:
def cleaning(paper):
    # split into words
    from nltk.tokenize import word_tokenize
    tokens = word_tokenize(paper)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    # stemming of words
    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    lematized = [wordnet_lemmatizer.lemmatize(word) for word in words]
    lematized2 = [wordnet_lemmatizer.lemmatize(word,'v') for word in words]
    # remove words of less than 3 characters
    for word in lematized2:
        if(len(word)<3):
            lematized2.remove(word)  
    return(lematized2)

# Papers from NIPS ready for TFIDF

In [15]:
papers_NIPS_cleaned=[]
papers_NIPS = df.paper_text
for paper in papers_NIPS:
    papers_NIPS_cleaned.append(cleaning(paper))

In [19]:
papers_cleaned=[]
for i in range(0,len(papers_NIPS_cleaned)):
    papers_cleaned.append(' '.join(papers_NIPS_cleaned[i]))

In [21]:
papers_ACL[0]

['W09-2307',
 'Proceedings of SSST-3, Third Workshop on Syntax and Structure in Statistical Translation, pages 51?59,Boulder, Colorado, June 2009. c?2009 Association for Computational LinguisticsDiscriminative Reordering with Chinese Grammatical Relations FeaturesPi-Chuan Changa, Huihsin Tsengb, Dan Jurafskya, and Christopher D. ManningaaComputer Science Department, Stanford University, Stanford, CA 94305bYahoo! Inc., Santa Clara, CA 95054{pichuan,jurafsky,manning}@stanford.edu, huihui@yahoo-inc.comAbstractThe prevalence in Chinese of grammaticalstructures that translate into English in dif-ferent word orders is an important cause oftranslation difficulty. While previous work hasused phrase-structure parses to deal with suchordering problems, we introduce a richer set ofChinese grammatical relations that describesmore semantically abstract relations betweenwords. Using these Chinese grammatical re-lations, we improve a phrase orientation clas-sifier (introduced by Zens and Ney (2006))t

# Papers from ACL ready for TFIDF after joining papers_NIPS

In [22]:
papers_ACL_cleaned=[]
for i in range(0,len(papers_ACL)):
    papers_ACL_cleaned.append(cleaning(papers_ACL[i][1]))

In [23]:
len(papers_ACL_cleaned)

22460

In [25]:
len(papers_cleaned)

6560

In [26]:
for i in range(0,len(papers_ACL_cleaned)):
    papers_cleaned.append(' '.join(papers_ACL_cleaned[i]))

In [29]:
len(papers_cleaned)

29020

# Export and import cleaned papers

In [30]:
import pickle

with open('papers_cleaned.txt', 'wb') as fp:
    pickle.dump(papers_cleaned, fp)

In [31]:
with open ('papers_cleaned.txt', 'rb') as fp:
    papers_cleaned_load = pickle.load(fp)

# TFIDF and NMF

In [35]:
#set parameters 
n_features = 100 #features to use in the tifidf transformation 
n_topics = 8  #number of topics that will be displayed  ###RELEVANT PARAMETER 
n_top_words = 10 #number of words composing each topic 

In [36]:
#tfidf with n_features, removing stop words for English 
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=0.2,max_features=n_features,stop_words='english')
#fit tdidf with the paper_text
tfidf = tfidf_vectorizer.fit_transform(papers_cleaned_load)

In [37]:
#transforming the tfidf representation into NMF matrix descomposition with n_topics as parameter
nmf = NMF(n_components=n_topics, random_state=0,alpha=.1, l1_ratio=.5).fit(tfidf)
##result in a 8*100 matrix

In [38]:
##Funtion to retrieve the n_top_words from the transformation 
##(taken from https://www.kaggle.com/dschniertshauer)
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

# Topics
Topic 0: Information Extraction

Topic 1: Stochastic Methods

Topic 2: Parsing Techniques

Topic 3: Probabilistic Methods

Topic 4: Reinforcement Learning

Topic 5: Translation Techniques

Topic 6: Words Segmentation

Topic 7: Training Neural Networks

In [39]:
##Printing the topics 
print("Topics found via NMF:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Topics found via NMF:

Topic #0:
text document information language sentence semantic type corpus task al

Topic #1:
algorithm function learn set sample data problem value result point

Topic #2:
parse rule tree grammar sentence structure syntactic phrase semantic form

Topic #3:
model data train probability language figure state process al time

Topic #4:
feature train label data learn set task al tag test

Topic #5:
translation sentence machine language english phrase score target pair train

Topic #6:
word sense tag corpus lexical english cluster language table context

Topic #7:
network input learn weight train pattern figure time function state



In [42]:
#Finding relation between topics and papers
nmf_embedding = nmf.transform(tfidf)
nmf_embedding = (nmf_embedding - nmf_embedding.mean(axis=0))/nmf_embedding.std(axis=0)


In [52]:
##Getting the titles of the most related papers to the topics, 
##so the decision of the topic name, could be less subjective
top_idx = np.argsort(nmf_embedding,axis=0)[-10:]
##8 topics
count = 0
for idxs in top_idx.T: 
    print("\nTopic {}:".format(count))
    for idx in idxs:
        if(idx>6559):
            print(papers_ACL[idx-6560][0])
        else:
            print(df.iloc[idx]['title'])
    count += 1


Topic 0:
W14-4504
J98-3005
W13-2110
J02-4001
P12-1077
A00-1025
P95-1042
Q14-1012
J02-4005
W14-5210

Topic 1:
High-Dimensional Gaussian Process Bandits
Stochastic Gradient Methods for Distributionally Robust Optimization with f-divergences
NESTT: A Nonconvex Primal-Dual Splitting Method for Distributed and Stochastic Optimization
Stochastic Three-Composite Convex Minimization
Stochastic optimization and sparse statistical recovery: Optimal algorithms for high dimensions
Stochastic Variance Reduction Methods for Saddle-Point Problems
Random Walk Approach to Regret Minimization
Stochastic Online AUC Maximization
Bayesian Optimization with Exponential Convergence
Fitted Q-iteration in continuous action-space MDPs

Topic 2:
W12-6218
E85-1019
P96-1011
H94-1051
W04-1503
P83-1018
N03-2011
I05-1013
J83-2002
E89-1003

Topic 3:
P97-1049
P10-1102
J88-3014
P12-4003
Evidence for a Forward Dynamics Model in Human Adaptive Motor Control
J88-3012
Pairwise Choice Markov Chains
Comparing Bayesian models

# Saving classification of papers

In [212]:
text_file=open("paperSource_paperID_paperTopic.csv","w")
text_file.write("%s,%s,%s\n"%("source","paper_id","topic_id"))
j=0
for i in range(len(papers_cleaned_load)):
    if(i<6560):
        text_file.write("NIPS,%s,%s \n"%(df.iloc[i].id,nmf_embedding.argmax(axis=1)[i]))
        #print("NIPS",",",df.iloc[i].id, ",",nmf_embedding.argmax(axis=1)[i])
    else:
        text_file.write("ALC,%s,%s \n"%(papers_ACL[j][0],nmf_embedding.argmax(axis=1)[i]))
        #print("ALC",",",papers_ACL[j][0],",",nmf_embedding.argmax(axis=1)[i])
        j+=1
text_file.close()    

In [213]:
papers_topics = pd.read_csv('paperSource_paperID_paperTopic.csv')
papers_authors_NIPS = pd.read_csv('Nips/paper_authors.csv')
papers_authors_ACL = pd.read_csv('2014/paper_author_affiliations.txt',sep='\t')
del papers_authors_ACL['affiliation id']

# Creation of a dictionary of authors and their topics

In [214]:
#paper_authors into a dict
papers_authors_NIPS_transform = papers_authors_NIPS.groupby('paper_id')['author_id'].apply(list)
papers_authors_ACL_transform = papers_authors_ACL.groupby('paper id')['author id'].apply(list)

In [239]:
authors_topics_NIPS = {}
authors_topics_ACL = {}
keyCountError = 0
for i in range(0,len(papers_topics)):
        if(papers_topics.iloc[i]['source']=="NIPS"):
            try:
                authors_paper = papers_authors_NIPS_transform[int(papers_topics.iloc[i]['paper_id'])]
                for el in authors_paper:
                    if(el in authors_topics_NIPS):
                        if(papers_topics.iloc[i]['topic_id'] not in authors_topics_NIPS[el]):
                            authors_topics_NIPS[el]+=[papers_topics.iloc[i]['topic_id']]
                    else:
                        authors_topics_NIPS[el] = [papers_topics.iloc[i]['topic_id']]
            except KeyError:
                keyCountError+=1
        else:
            try:
                authors_paper = papers_authors_ACL_transform[papers_topics.iloc[i]['paper_id']]
                for el in authors_paper:
                    if(el in authors_topics_ACL):
                        if(papers_topics.iloc[i]['topic_id'] not in authors_topics_ACL[el]):
                            authors_topics_ACL[el]+=[papers_topics.iloc[i]['topic_id']]
                    else:
                        authors_topics_ACL[el] = [papers_topics.iloc[i]['topic_id']]
            except KeyError:
                keyCountError+=1

In [246]:
authors_topics_NIPS_todos = {}
authors_topics_ACL_todos = {}
keyCountError = 0
for i in range(0,len(papers_topics)):
        if(papers_topics.iloc[i]['source']=="NIPS"):
            try:
                authors_paper = papers_authors_NIPS_transform[int(papers_topics.iloc[i]['paper_id'])]
                for el in authors_paper:
                    if(el in authors_topics_NIPS_todos):
                        authors_topics_NIPS_todos[el]+=[papers_topics.iloc[i]['topic_id']]
                    else:
                        authors_topics_NIPS_todos[el] = [papers_topics.iloc[i]['topic_id']]
            except KeyError:
                keyCountError+=1
        else:
            try:
                authors_paper = papers_authors_ACL_transform[papers_topics.iloc[i]['paper_id']]
                for el in authors_paper:
                    if(el in authors_topics_ACL_todos):
                        authors_topics_ACL_todos[el]+=[papers_topics.iloc[i]['topic_id']]
                    else:
                        authors_topics_ACL_todos[el] = [papers_topics.iloc[i]['topic_id']]
            except KeyError:
                keyCountError+=1

In [240]:
authors_topics_NIPS

{1: [1],
 2: [1],
 14: [7],
 539: [7, 1, 3],
 1312: [1],
 1313: [1],
 2106: [7],
 1632: [7, 1],
 2107: [7],
 757: [3, 7],
 890: [3],
 265: [4, 7, 3],
 350: [4, 7, 3, 1, 2],
 6950: [4],
 2108: [7, 1, 6],
 1969: [7, 1],
 1484: [7, 1],
 45: [7],
 46: [7, 4],
 2109: [7, 3],
 178: [7, 1, 3, 4, 6],
 2110: [7],
 1874: [7, 1],
 2111: [7],
 1215: [1, 3],
 2112: [1],
 1796: [1, 3, 4],
 1666: [1, 3, 4],
 175: [7],
 7108: [7],
 916: [7, 3, 1],
 1147: [7],
 1064: [1],
 2113: [1],
 1254: [3, 1, 7],
 1020: [3, 1, 4, 7],
 1302: [3, 7, 4, 1],
 283: [3, 7, 1],
 1004: [3, 4, 7, 1],
 1828: [1, 7],
 2114: [1],
 1019: [1, 2, 7, 3],
 1945: [3, 1],
 324: [7, 3, 1],
 202: [7, 3],
 1720: [1, 4],
 1722: [1, 4],
 1282: [1, 4, 7, 3],
 2115: [3],
 998: [3, 1, 4, 7, 0],
 1047: [7, 1, 4],
 1006: [7],
 988: [7],
 987: [7],
 2116: [1],
 2117: [1],
 2118: [1],
 1816: [7, 1],
 1946: [7, 1, 3, 4],
 1474: [7, 1, 3],
 2119: [1],
 2120: [1],
 1629: [1],
 1603: [1],
 2121: [1],
 2122: [1],
 2123: [1],
 1263: [3, 1, 7],
 2124:

In [241]:
authors_topics_ACL

{3835: [4, 3, 5, 6, 2, 0, 1],
 5147: [4, 6, 5, 0, 2, 3, 7, 1],
 11482: [4, 5, 6, 3, 2],
 8298: [4, 6, 0],
 3329: [6, 0, 3, 4, 5],
 5669: [6],
 8515: [7],
 7136: [2, 5, 6, 4],
 8604: [2, 5, 0, 1, 7],
 1610: [2, 7, 0, 4, 1, 6],
 3393: [2, 4, 0],
 12073: [2, 0, 4],
 7391: [5, 0, 2, 7],
 8775: [5, 0, 2, 7],
 1273: [6],
 1680: [6, 4, 0, 5],
 1235: [6, 4, 5, 0, 3, 7, 2, 1],
 3424: [6, 4, 0],
 4609: [6, 4, 0],
 13039: [5],
 13885: [5, 6, 2],
 15487: [5],
 15586: [5, 2],
 5343: [6, 5, 4, 7, 0, 3, 1, 2],
 5968: [6, 4, 1, 7, 2],
 9791: [6, 4, 0, 7, 1, 2],
 8024: [6, 4, 2, 1, 3, 7, 0, 5],
 13399: [5, 0, 4],
 13842: [5, 6, 4, 0, 7, 3, 2],
 3628: [2, 1, 0],
 8367: [2],
 4004: [5, 6, 4],
 6051: [5, 4, 7, 3, 0],
 8009: [5, 3],
 14998: [5, 2],
 15517: [5, 2],
 15601: [5],
 5634: [3, 5, 6, 4, 7, 2],
 3276: [0],
 5083: [0],
 284: [0, 6, 4],
 4084: [0, 2],
 4799: [0],
 5665: [0, 4],
 6869: [0],
 4831: [3, 5, 6, 0, 2, 4, 1],
 7318: [3, 2, 7, 6, 4],
 8683: [3, 2, 4],
 644: [0, 4, 2, 7, 3, 5],
 4008: [1],
 

In [247]:
authors_topics_NIPS_todos

{1: [1],
 2: [1],
 14: [7, 7],
 539: [7, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, 7, 7, 1, 7, 1, 1, 1, 1],
 1312: [1],
 1313: [1],
 2106: [7],
 1632: [7, 1, 7, 7, 1, 1, 7, 7],
 2107: [7],
 757: [3, 3, 7, 7, 3, 7, 7],
 890: [3, 3, 3, 3],
 265: [4, 7, 3, 7, 7, 3],
 350: [4, 7, 7, 3, 1, 3, 3, 1, 1, 2, 3, 3, 3, 3, 3, 7],
 6950: [4],
 2108: [7, 1, 1, 1, 1, 1, 6, 1, 1, 1],
 1969: [7, 1],
 1484: [7, 7, 1, 7, 1],
 45: [7, 7],
 46: [7, 7, 7, 7, 4, 7, 7, 7, 7, 7, 7, 7],
 2109: [7, 3],
 178: [7,
  7,
  7,
  1,
  3,
  7,
  1,
  7,
  7,
  1,
  1,
  1,
  7,
  3,
  7,
  1,
  1,
  1,
  7,
  1,
  7,
  1,
  3,
  3,
  4,
  7,
  7,
  1,
  7,
  1,
  3,
  4,
  1,
  3,
  7,
  3,
  7,
  7,
  7,
  7,
  7,
  3,
  7,
  7,
  7,
  3,
  7,
  7,
  7,
  7,
  7,
  7,
  1,
  1,
  7,
  6],
 2110: [7],
 1874: [7, 1],
 2111: [7],
 1215: [1, 3],
 2112: [1],
 1796: [1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 4, 1],
 1666: [1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 4, 1],
 175: [7, 7],
 7108: [7, 7],
 916: [7, 3, 7, 7, 1, 7, 7, 7],
 1147: [7, 7, 7, 7],
 

In [248]:
authors_topics_ACL_todos

{3835: [4,
  3,
  5,
  5,
  4,
  4,
  4,
  6,
  4,
  6,
  2,
  6,
  4,
  3,
  0,
  1,
  4,
  2,
  6,
  6,
  3,
  3,
  0,
  4,
  6,
  0,
  2,
  5,
  4,
  3,
  6,
  4,
  4,
  6,
  2,
  4,
  1,
  5,
  4,
  6,
  4,
  4,
  4,
  4,
  4,
  6,
  4,
  0,
  0,
  6,
  2,
  5,
  4,
  2,
  2,
  2,
  6,
  5,
  1,
  3,
  0,
  3,
  5,
  0,
  4,
  6,
  3,
  0,
  2,
  0],
 5147: [4,
  6,
  5,
  5,
  0,
  0,
  4,
  4,
  0,
  6,
  4,
  4,
  0,
  2,
  2,
  0,
  3,
  6,
  4,
  2,
  0,
  2,
  4,
  4,
  7,
  3,
  3,
  6,
  2,
  2,
  3,
  2,
  4,
  3,
  2,
  4,
  0,
  3,
  2,
  0,
  4,
  2,
  2,
  2,
  3,
  4,
  6,
  6,
  0,
  3,
  3,
  2,
  3,
  3,
  4,
  5,
  3,
  4,
  2,
  3,
  5,
  5,
  3,
  5,
  0,
  4,
  4,
  4,
  2,
  4,
  5,
  5,
  2,
  2,
  4,
  3,
  3,
  2,
  2,
  0,
  4,
  5,
  5,
  3,
  0,
  3,
  6,
  0,
  3,
  2,
  6,
  4,
  4,
  5,
  5,
  6,
  2,
  7,
  0,
  5,
  5,
  6,
  0,
  4,
  2,
  3,
  4,
  6,
  7,
  0,
  3,
  5,
  3,
  4,
  0,
  4,
  7,
  4,
  0,
  0,
  7,
  5,
  5,
  4,
  1,
  4,
  5,
  

In [242]:
keyCountError

5886

# Keep most common topic for each author

In [260]:
def most_common(lst):
    return max(set(lst), key=lst.count)

In [264]:
authors_topics_NIPS_mc = {}
for el in authors_topics_NIPS_todos:
    authors_topics_NIPS_mc[el] = most_common(authors_topics_NIPS_todos[el])
authors_topics_ACL_mc = {}
for el in authors_topics_ACL_todos:
    authors_topics_ACL_mc[el] = most_common(authors_topics_ACL_todos[el])

In [263]:
authors_topics_NIPS_mc

{1: 1,
 2: 1,
 14: 7,
 539: 1,
 1312: 1,
 1313: 1,
 2106: 7,
 1632: 7,
 2107: 7,
 757: 7,
 890: 3,
 265: 7,
 350: 3,
 6950: 4,
 2108: 1,
 1969: 1,
 1484: 7,
 45: 7,
 46: 7,
 2109: 3,
 178: 7,
 2110: 7,
 1874: 1,
 2111: 7,
 1215: 1,
 2112: 1,
 1796: 1,
 1666: 1,
 175: 7,
 7108: 7,
 916: 7,
 1147: 7,
 1064: 1,
 2113: 1,
 1254: 3,
 1020: 3,
 1302: 1,
 283: 3,
 1004: 3,
 1828: 1,
 2114: 1,
 1019: 1,
 1945: 1,
 324: 7,
 202: 7,
 1720: 1,
 1722: 1,
 1282: 1,
 2115: 3,
 998: 7,
 1047: 1,
 1006: 7,
 988: 7,
 987: 7,
 2116: 1,
 2117: 1,
 2118: 1,
 1816: 1,
 1946: 1,
 1474: 1,
 2119: 1,
 2120: 1,
 1629: 1,
 1603: 1,
 2121: 1,
 2122: 1,
 2123: 1,
 1263: 1,
 2124: 3,
 2125: 3,
 1173: 1,
 1201: 3,
 2126: 1,
 7067: 1,
 7068: 1,
 7069: 1,
 2127: 3,
 2128: 7,
 1630: 1,
 1814: 1,
 1310: 1,
 770: 1,
 1311: 1,
 54: 7,
 2592: 1,
 1602: 1,
 1635: 7,
 2129: 7,
 2130: 3,
 2131: 3,
 3022: 1,
 1278: 1,
 2132: 3,
 2133: 1,
 1582: 1,
 736: 1,
 1518: 1,
 2134: 1,
 2135: 1,
 2136: 1,
 2137: 1,
 2138: 1,
 2139: 1,


In [265]:
authors_topics_ACL_mc

{3835: 4,
 5147: 4,
 11482: 4,
 8298: 4,
 3329: 6,
 5669: 6,
 8515: 7,
 7136: 2,
 8604: 0,
 1610: 2,
 3393: 2,
 12073: 0,
 7391: 5,
 8775: 2,
 1273: 6,
 1680: 6,
 1235: 0,
 3424: 0,
 4609: 0,
 13039: 5,
 13885: 5,
 15487: 5,
 15586: 5,
 5343: 6,
 5968: 4,
 9791: 4,
 8024: 4,
 13399: 5,
 13842: 4,
 3628: 0,
 8367: 2,
 4004: 5,
 6051: 5,
 8009: 5,
 14998: 5,
 15517: 5,
 15601: 5,
 5634: 3,
 3276: 0,
 5083: 0,
 284: 0,
 4084: 0,
 4799: 0,
 5665: 0,
 6869: 0,
 4831: 4,
 7318: 3,
 8683: 2,
 644: 0,
 4008: 1,
 4875: 4,
 7577: 1,
 8643: 4,
 710: 6,
 3382: 6,
 6114: 0,
 3662: 0,
 5243: 0,
 7492: 0,
 3594: 6,
 11048: 0,
 13390: 4,
 13848: 3,
 4197: 0,
 4837: 0,
 7533: 6,
 7176: 6,
 11439: 6,
 2909: 0,
 3734: 0,
 7015: 4,
 10518: 5,
 10833: 5,
 12259: 6,
 12323: 6,
 3615: 6,
 1198: 5,
 7692: 5,
 592: 5,
 9558: 0,
 11890: 5,
 11574: 5,
 9636: 5,
 3545: 5,
 4636: 6,
 6862: 4,
 7626: 4,
 9925: 4,
 1972: 0,
 8010: 0,
 8815: 0,
 13323: 6,
 14428: 0,
 9591: 0,
 14298: 0,
 372: 0,
 2113: 0,
 2164: 6,
 

In [253]:
len(authors_topics_ACL)

14616

In [254]:
len(authors_topics_NIPS)

8653

# Authors per topic per source

In [303]:
topics_authors_NIPS = {}
for i in range(0,n_topics):
    topics_authors_NIPS[i] = []
for el in authors_topics_NIPS_mc:
    topics_authors_NIPS[authors_topics_NIPS_mc[el]]+=[el]

In [305]:
topics_authors_ACL = {}
for i in range(0,n_topics):
    topics_authors_ACL[i] = []
for el in authors_topics_ACL_mc:
    topics_authors_ACL[authors_topics_ACL_mc[el]]+=[el]

# Export and import authors per topic per source

In [365]:
import pickle

with open('topics_authors_NIPS.txt', 'wb') as fp:
    pickle.dump(topics_authors_NIPS, fp)
    
with open('topics_authors_ACL.txt', 'wb') as fp:
    pickle.dump(topics_authors_ACL, fp)

In [366]:
with open ('topics_authors_NIPS.txt', 'rb') as fp:
    topics_authors_NIPS = pickle.load(fp)
    
with open ('topics_authors_ACL.txt', 'rb') as fp:
    topics_authors_ACL = pickle.load(fp)

# Load names

In [367]:
authors_ACL = pd.read_csv('2014/author_ids.txt',sep='\t',header=None)
authors_ACL.columns = ['author_id','author_name']
authors_NIPS = pd.read_csv('Nips/authors.csv')
authors_NIPS.columns = ['author_id','author_name']

In [368]:
#authors_id into a dict
authors_ACL_dict = authors_ACL.groupby('author_id')['author_name'].apply(list)
authors_NIPS_dict = authors_NIPS.groupby('author_id')['author_name'].apply(list)

 # Given an author name from NIPS, return similar authors from ACL

In [369]:
def NIPS_to_ACL(author_name):
    for i in range(0,len(authors_NIPS)):
        if(authors_NIPS.iloc[i]['author_name']==author_name):
            author_id = authors_NIPS.iloc[i]['author_id']
    for i in range(0,n_topics):
        if(author_id in topics_authors_NIPS[i]):
            topic_id = i
            break
    ret = []
    for el in topics_authors_ACL[topic_id]:
        ret.append(authors_ACL_dict[el])
    return ret

In [370]:
NIPS_to_ACL('M. J. Anderson')

[['Volk,Martin'],
 ['Nothman,Joel'],
 ['Chen,Hsin-Hsi'],
 ['Hou,Wen-Juan'],
 ['Lee,Chih'],
 ['Ishizaki,Shun'],
 ['Hess,Michael'],
 ['Mahlow,Cerstin'],
 ['Aronson,Alan R.'],
 ['Kilicoglu,Halil'],
 ['Libbus,Bisharah'],
 ['Mork,James G.'],
 ['Rindflesch,Thomas C.'],
 ['Bird,Steven'],
 ['Okumura,Manabu'],
 ['Iwayama,Makoto'],
 ['Marukawa,Yuzo'],
 ['Shinmori,Akihiro'],
 ['Szpakowicz,Stan'],
 ['Klavans,Judith L.'],
 ['Lin,Jimmy'],
 ['Grishman,Ralph'],
 ['Ji,Heng'],
 ['Bigi,Brigitte'],
 ['Doyon,Jennifer B.'],
 ['Talbott,Susan W.'],
 ['White,John S.'],
 ['Syed,Zareen'],
 ['Braffort,Annelies'],
 ['Segouat,J&eacute;r&eacute;mie'],
 ['Baker,Collin F.'],
 ['Ellsworth,Michael'],
 ['Hasida,Koiti'],
 ['Itahashi,Shuichi'],
 ['Gaizauskas,Robert J.'],
 ['Setzer,Andrea'],
 ['Bos,Johan'],
 ['Rigau,German'],
 ['Magnini,Bernardo'],
 ['Vossen,Piek'],
 ['Weerkamp,Wouter'],
 ['de Rijke,Maarten'],
 ['Cieri,Christopher'],
 ['Avanzi,Mathieu'],
 ['Lacheret-Dujour,Anne'],
 ['Obin,Nicolas'],
 ['Lee,Lillian'],
 ['Gia

 # Given an author name from ACL, return similar authors from NIPS

In [371]:
def ACL_to_NIPS(author_name):
    for i in range(0,len(authors_ACL)):
        if(authors_ACL.iloc[i]['author_name']==author_name):
            author_id = authors_ACL.iloc[i]['author_id']
    for i in range(0,n_topics):
        if(author_id in topics_authors_ACL[i]):
            topic_id = i
            break
    ret = []
    for el in topics_authors_NIPS[topic_id]:
        ret.append(authors_NIPS_dict[el])
    return ret

In [372]:
ACL_to_NIPS('Szpakowicz,Stan')

[['M. J. Anderson'],
 ['E. D. Young'],
 ['Israel Nelken'],
 ['Jorg Ontrup'],
 ['Matthew Richardson'],
 ['B. T. Backus'],
 ['Charles Lee Isbell Jr.'],
 ['David Cohn'],
 ['John P. Miller'],
 ['Bin Wu'],
 ['David Bodoff'],
 ['Brochu Eric'],
 ['Alexei Vinokourov'],
 ['Tai Sing Lee'],
 ['Jun Suzuki'],
 ['Yutaka Sasaki'],
 ['Marcelo A. Montemurro'],
 ['Stefano Panzeri'],
 ['Tamara L. Berg'],
 ['Ron Papka'],
 ['James P. Callan'],
 ['Deepak Verma'],
 ['Karl Pfleger'],
 ['Moritz Grosse-wentrup'],
 ['Erik Linstead'],
 ['Paul Rigor'],
 ['Sushil Bajracharya'],
 ['Cristina Lopes'],
 ['Dominik Endres'],
 ['Peter Foldiak'],
 ['Tao Qin'],
 ['Xu-dong Zhang'],
 ['De-sheng Wang'],
 ['Yi Zhang'],
 ['Artur Dubrawski'],
 ['Victor Zue'],
 ['James Glass'],
 ['David Goodine'],
 ['Lynette Hirschman'],
 ['Hong Leung'],
 ['Michael Phillips'],
 ['Joseph Polifroni'],
 ['Stephanie Seneff'],
 ['Stella X. Yu'],
 ['Akaysha C. Tang'],
 ['Vicente Ordonez'],
 ['Girish Kulkarni'],
 ['Min Xiao'],
 ['Mateusz Malinowski'],
 [