In [1]:
from lxml import etree as ElementTree

In [3]:
def readXML(xmlPath):
        
        categoryList=[  'BIBM',
                        'RECOMB',
                        'INFOCOM',
                        'SIGCOMM',
                        'SC',
                        'ISCA',
                        'EUROCRYPT',
                        'CRYPTO',
                        'DCC',
                        'CVPR',
                        'ICCV',
                        'ACL',
                        'COLING'
                        ]
        parser=ElementTree.XMLParser(dtd_validation=True)
        #root = xml.etree.ElementTree.parse(xmlPath).getroot()
        root = ElementTree.parse(xmlPath, parser).getroot()
        articles = []
        for category in categoryList:
            cnt = 0
            for paper in root.iter('inproceedings'):
                booktitle = next(paper.iter('booktitle')).text
                
                year = int(next(paper.iter('year')).text)

                # TODO -- the year & book category can be made parameterized
                
                if booktitle == category and year<=1980:   
                    articles.append(paper)
                    cnt += 1
        return articles

In [8]:
topics = {}
for paper in readXML('./dblp.xml'):
                tl = next(paper.iter('title')).text
                key = next(paper.iter('booktitle')).text
                if key not in topics:
                    topics[key] = []
                    topics[key].append(tl)
                    
with open('./dblp.txt','w') as fin:
                   for key,val in topics:
                       for title in val:
                           fin.write(title.strip('\n'))
                           fin.write(' ')
                       fin.write('\n')

ValueError: too many values to unpack (expected 2)

In [9]:
print(topics)

{'ISCA': ['A Bit-Slice Cache Controller.'], 'ACL': ['Paralanguage in Computer Mediated Communication.'], 'COLING': ['An Experimental Applicative Programming Language For Linguistics And String Processing.']}


In [11]:
for key,val in topics.items():
    for title in val:
        print(title)

A Bit-Slice Cache Controller.
Paralanguage in Computer Mediated Communication.
An Experimental Applicative Programming Language For Linguistics And String Processing.


In [36]:
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import sys
import os

import networkx as nx
import nltk
import numpy as np
from nltk.corpus import stopwords
sw = stopwords.words("english")

data_type={}

def filter_word(wrd_freq,topics,vocab):
    #Filter imporatnt words
    for v in topics:
        tmp = []
        for tl in topics[v]:
            tl = [w for w in tl if w in vocab]
            if len(tl) > 0:
                tmp.append(tl)
        topics[v] = tmp
    return topics
    
def compute_tfidf(topics,vocab):
    # Compute tf-idf.
    tf = dict((v, {}) for v in topics)
    for v in topics:
        for tl in topics[v]:
            for w in tl:
                if w in tf[v]:
                    tf[v][w] += 1
                else:
                    tf[v][w] = 1
    idf = dict((w, 0) for w in vocab)
    for w in vocab:
        cnt = 0
        for v in topics:
            if w in tf[v]:
                assert tf[v][w] > 0
                cnt += 1
        idf[w] = np.log(len(topics) / cnt)
    tf_idf = dict((v, {}) for v in topics)
    for v in topics:
        for tl in topics[v]:
            for w in tl:
                tf_idf[v][w] = tf[v][w] * idf[w]
    return tf_idf

def create_network(dir_path,tf_idf,topics):
    '''
        Algorithm to 
    
    '''
    g = nx.Graph()
    for v in topics:
        scores = sorted(tf_idf[v].values())
        #print(scores,v)
        thresh = scores[-int(0.05 * len(scores))] # This is for top 5% from tfidf score
        for tl in topics[v]:
            for x in tl:
                if tf_idf[v][x] >= thresh:
                    for y in tl:
                        if x != y and tf_idf[v][y] >= thresh:
                            g.add_edge(x, y)
                            
    g = next(nx.connected_component_subgraphs(g))
    
    mapping = dict(zip(g, range(g.number_of_nodes())))
    
    with open(os.path.join(dir_path,'db.voc'), 'w',encoding='iso-8859-1') as fout:
        for w, i in mapping.items():
            fout.write('%d %s\n' % (i, w))
            
    g = nx.relabel_nodes(g, mapping)
    
    elist_path=os.path.join(dir_path,'network.txt')
    
    nx.write_edgelist(g, elist_path, data=False)
    
    
def main_title(inputData,dirPath):
    wrd_freq = {}
    topics = {}
    corpus=inputData
    for key,paper in corpus.items():
        for tl in paper:
            tl = nltk.word_tokenize(tl)
            tl = [w.lower() for w in tl]
            for w in tl:
                if w in wrd_freq:
                    wrd_freq[w] += 1
                else:
                    wrd_freq[w] = 0
            if key not in topics:
                topics[key] = []
            topics[key].append(tl)
    
    vocab = set([w for w in wrd_freq])
    topics=filter_word(wrd_freq,topics,vocab)
    tf_idf=compute_tfidf(topics,vocab)
    create_network(dirPath,tf_idf,topics)

In [44]:
corpus={}
cnt=0
with open('./papers.txt','r') as fin:
    for line in fin:
        if line.strip('\n'):
            if cnt not in corpus:
                corpus[cnt]=[]
            corpus[cnt].append(line.strip('\n'))
            cnt = cnt +1
        

In [45]:
main_title(corpus,'./')
#print(corpus[963])

['start research political game university five_year_ago would_take decade happen_say_gonzalo frasca computer_game specialist information technology university copenhagen admit first_person surprise fast evolve add uruguayan-born researcher create game political campaign artist designer experiment form game agenda project newsgaming aim comment international news event game']
