In [1]:
import csv
import os
import pickle
import re  # string processing


In [2]:
# csv files
# Folder holding the CSV files. os.path.join with an empty last component
# keeps the trailing separator (the folder is later concatenated with the
# file name) while using the separator of the current platform instead of a
# hard-coded backslash.
csv_path = os.path.join('csv', '')

# Base names (without extension) of the CSV files to process, in this order
csv_files = ['2012EPCG','2011SIACG','2010Interacao','2009EPCG','2008Interacao']
# alternative: process a single file
#csv_files = ['2008Interacao']
# Extension appended to every base name
csv_extension = '.csv'



In [3]:
# pkl files and data structures
# Folder holding the pickled data structures. Built with os.path.join so the
# trailing separator matches the current platform instead of a hard-coded
# backslash.
pkl_path = os.path.join('pkl', '')


def _load_pickled_list(filename, label):
    """
    Load one pickled data structure from pkl_path and report its size.

    filename -- name of the pickle file inside pkl_path
    label    -- plural noun used in the 'There are N <label>!' message

    Returns the unpickled object, or a new empty list when the file does
    not exist.
    """
    try:
        # NOTE: pickle.load can execute arbitrary code while loading;
        # only point this at files written by this notebook.
        with open(pkl_path+filename,"rb") as pkl_in:
            loaded = pickle.load(pkl_in)
    except FileNotFoundError:
        loaded = []
    print ('There are {0} {1}!'.format(len(loaded), label))
    return loaded


# Each data structure gets: its file name, a flag recording whether the
# in-memory copy was changed after loading, and the loaded content.

# load the events data structure (if it exists)
events_fn = 'events.pkl'
events_modified = False
events = _load_pickled_list(events_fn, 'events')

# load the articles data structure (if it exists)
articles_fn = 'articles.pkl'
articles_modified = False
articles = _load_pickled_list(articles_fn, 'articles')

# load the authors data structure (if it exists)
authors_fn = 'authors.pkl'
authors_modified = False
authors = _load_pickled_list(authors_fn, 'authors')

# load the articles/authors data structure (if it exists)
art_aut_fn = 'art_aut.pkl'
art_aut_modified = False
art_aut = _load_pickled_list(art_aut_fn, 'articles-authors relationships')

# load the forms data structure (if it exists)
forms_fn = 'forms.pkl'
forms_modified = False
forms = _load_pickled_list(forms_fn, 'forms')


There are 0 events!
There are 0 articles!
There are 0 authors!
There are 0 articles-authors relationships!
There are 0 forms!


In [4]:
import time
import random
import hashlib
import unicodedata

# Use this function to generate a unique ID for each author.
# Pass the author's name as the argument to add more randomness.

def guid( arg ):
    """
    Build a 32-character hexadecimal identifier.

    The identifier is the MD5 digest of the current time in milliseconds,
    a random integer and arg (a string), joined by single spaces and
    reduced to plain ASCII before hashing.
    """
    stamp_ms = int( time.time() * 1000 )
    noise = int( random.random()*10000000 )
    seed_text = ' '.join((str(stamp_ms), str(noise), arg))
    # NFKD decomposes accented letters into base letter + combining mark;
    # encoding to ASCII with 'ignore' then drops whatever is not ASCII,
    # which yields the bytes that md5 needs
    ascii_seed = unicodedata.normalize('NFKD', seed_text).encode('ascii', 'ignore')
    return hashlib.md5(ascii_seed).hexdigest()

In [5]:
# main loop
# For each configured CSV file: locate the columns from the header row, register
# the event found on the first data row, then register every 'Article' row with
# its authors, the article/author pairs and one form per author with an e-mail.
# New entries are appended to events, articles, authors, art_aut and forms, and
# the matching *_modified flag is set whenever one of those lists changes.
for csv_file in csv_files:
    print ('*** Processing file {0} ***'.format(csv_file))

    csv_fileN = csv_path + csv_file + csv_extension
    # read the csv file
    try:
        # newline='' leaves line-ending handling to the csv module, as its
        # documentation asks for when a file object is passed to csv.reader
        with open(csv_fileN,'rt',newline='') as csv_in:

            # Loop through every row; row 0 is the header
            for rownum, row in enumerate(csv.reader(csv_in, delimiter=";")):

                # for the first row get indexes of the required fields
                # (a missing required column raises ValueError)
                if rownum==0:
                    # event
                    Acronym_col = row.index ('Acronym')
                    Year_col = row.index ('Year')
                    Event_col = row.index ('Event')
                    # article
                    Type_col = row.index ('Type')
                    Title_col = row.index ('Title')
                    File_col = row.index ('File')
                    ArticleKey_col = row.index ('ArticleKey')
                    # authors col indexes: one (last name, first name, contact)
                    # triple per AuthorN... header group; stop at the first N
                    # for which any of the three headers is missing
                    authors_col = []
                    au_n = 1
                    finish = False
                    while not finish:
                        LN = 'Author' + str(au_n) + 'LastName'
                        FN = 'Author' + str(au_n) + 'FirstName'
                        Contact = 'Author' + str(au_n) + 'Contact'
                        try:
                            LN_col = row.index (LN)
                            FN_col = row.index (FN)
                            contact_col = row.index (Contact)
                            authors_col.append((LN_col, FN_col, contact_col))
                            au_n += 1
                        except ValueError:
                            finish = True
                    continue

                # for the first data row verify the event
                if rownum==1:
                    Acronym = row[Acronym_col]
                    # verify if this event already exists in the events data structure
                    event_exists = False
                    for event in events:
                        if event['acr']==Acronym:
                            event_exists = True
                            break
                    if not event_exists:  # append event to events
                        event_dict = {'acr':Acronym, 'name':row[Event_col],
                                      'year':row[Year_col], 'state': 'loaded'}
                        events.append(event_dict)
                        events_modified = True

                # for the current row (article); rows of any other type are skipped
                if row[Type_col]!='Article':
                    continue
                # article data
                ArticleKey = row[ArticleKey_col]
                print ("Processing article ... ", ArticleKey, end = ' ')
                # verify if this article already exists in the articles data structure
                article_exists = False
                for article in articles:
                    if article['ID']==ArticleKey:
                        article_exists = True
                        break
                if not article_exists:  # append article to articles
                    Acronym = row[Acronym_col]
                    Title = row[Title_col]
                    File = row[File_col]
                    art_dict = {'ID':ArticleKey, 'event':Acronym,
                           'title':Title, 'file':File, 'state':'loaded'}
                    articles.append(art_dict)
                    articles_modified = True
                    print ("added to articles")
                else:
                    print ("already in articles")

                # authors data: collect (full name, contact) for this row
                this_authors = []
                for LN_col, FN_col, contact_col in authors_col:
                    if row[LN_col]=="": break  # last author was previous
                    # trim and collapse runs of whitespace; raw strings avoid
                    # the invalid "\s" escape sequence in a normal literal
                    LN = re.sub(r"\s\s+", " ", row[LN_col].strip())
                    FN = re.sub(r"\s\s+", " ", row[FN_col].strip())
                    Name = FN+' '+LN
                    # strip the contact so stray spaces around the e-mail
                    # are not stored
                    this_authors.append((Name,row[contact_col].strip()))

                for AuthorName, AuthorContact in this_authors:
                    print ("\tProcessing author ... ", AuthorName, '(', AuthorContact,')', end = ' ')
                    # verify if this author already exists in the authors data structure
                    # (authors are matched by exact name)
                    author_exists = False
                    for author in authors:
                        if author['name']==AuthorName:
                            author_exists = True
                            AuthorKey = author['ID']
                            print ("already in authors")
                            break
                    if not author_exists:  # append author to authors
                        AuthorKey = guid (AuthorName)
                        aut_dict = {'ID':AuthorKey, 'name':AuthorName,
                                   'email':AuthorContact, 'state':'loaded'}
                        authors.append(aut_dict)
                        authors_modified = True
                        print ("added to authors")

                    # add to the art_aut table
                    # firstly check whether it already exists
                    art_aut_exists = False
                    for this_art_aut in art_aut:
                        if this_art_aut[0]==ArticleKey and this_art_aut[1]==AuthorKey:
                            art_aut_exists = True
                            break
                    if not art_aut_exists:  # add
                        art_aut.append((ArticleKey,AuthorKey))
                        art_aut_modified = True

                    # Forms
                    # there will only be a form for the current author
                    # if its email exists
                    if '@' in AuthorContact:
                        # verify whether there is already a form for this author
                        form_exists = False
                        print ("\tProcessing form for author ", AuthorName, end = ' ...')
                        for form in forms:
                            if form['ID']==AuthorKey:
                                form_exists = True
                                # only add the article once, so that processing
                                # the same file again does not duplicate it
                                if ArticleKey not in form['articles']:
                                    form['articles'].append(ArticleKey)
                                    forms_modified = True
                                    print ("already in forms. Add article ", ArticleKey)
                                else:
                                    print ("already in forms. Article already listed ", ArticleKey)
                                break
                        if not form_exists:
                            form_dict = {'ID':AuthorKey, 'articles':[ArticleKey],
                                        'file':AuthorName+'-form.pdf',
                                        'state':'loaded'}
                            forms.append(form_dict)
                            forms_modified = True
                            print ("added to forms. Add article ", ArticleKey)
                # end authors loop
                print ()

            # end loop through csv rows
        # end opening csv_in
    except FileNotFoundError:
        # a missing CSV file is reported and the remaining files are still processed
        print ('File {0} not found!'.format(csv_fileN))
        continue

    print ('*** Finished file {0} ***\n'.format(csv_file))
    # end loop through csv files


*** Processing file 2012EPCG ***
Processing article ...  2012_0006 added to articles
	Processing author ...  Abílio Costa ( amfcalt@gmail.com ) added to authors
	Processing form for author  Abílio Costa ...added to forms. Add article  2012_0006
	Processing author ...  João P. Pereira ( jjp@isep.ipp.pt  ) added to authors
	Processing form for author  João P. Pereira ...added to forms. Add article  2012_0006

Processing article ...  2012_0007 added to articles
	Processing author ...  Pedro Leitão ( pedromiguel_rs70@hotmail.com  ) added to authors
	Processing form for author  Pedro Leitão ...added to forms. Add article  2012_0007
	Processing author ...  António Castro ( avc@isep.ipp.pt  ) added to authors
	Processing form for author  António Castro ...added to forms. Add article  2012_0007
	Processing author ...  João J. Pereira ( jjp@isep.ipp.pt ) added to authors
	Processing form for author  João J. Pereira ...added to forms. Add article  2012_0007

Processing article ...  2012_0008 add

	Processing form for author  R. Bidarra ...added to forms. Add article  2011_0034

Processing article ...  2011_0035 added to articles
	Processing author ...  Eduardo Roa ( eduroam@gmail.com ) added to authors
	Processing form for author  Eduardo Roa ...added to forms. Add article  2011_0035
	Processing author ...  Víctor Theoktisto (  ) added to authors
	Processing author ...  Marta Fairén ( mfairen@lsi.upc.edu ) added to authors
	Processing form for author  Marta Fairén ...added to forms. Add article  2011_0035
	Processing author ...  Isabel Navazo ( isabel@lsi.upc.edu ) added to authors
	Processing form for author  Isabel Navazo ...added to forms. Add article  2011_0035

Processing article ...  2011_0036 added to articles
	Processing author ...  Marcos Novalbos (  ) added to authors
	Processing author ...  Alberto Sánchez ( alberto.sanchez@urjc.es ) added to authors
	Processing form for author  Alberto Sánchez ...added to forms. Add article  2011_0036

Processing article ...  2011_0

	Processing form for author  Miguel Sales Dias ...already in forms. Add article  2010_0061

Processing article ...  2010_0062 added to articles
	Processing author ...  Luís Miguel S Ponciano ( lmspo@iscte.pt  ) added to authors
	Processing form for author  Luís Miguel S Ponciano ...added to forms. Add article  2010_0062
	Processing author ...  Miguel Sales Dias ( miguel.dias@microsoft.com ) already in authors
	Processing form for author  Miguel Sales Dias ...already in forms. Add article  2010_0062

Processing article ...  2010_0063 added to articles
	Processing author ...  João Benedito ( jpmlb@ist.utl.pt.com ) added to authors
	Processing form for author  João Benedito ...added to forms. Add article  2010_0063
	Processing author ...  Tiago Guerreiro ( tjvg@vimmi.inesc-id.pt ) already in authors
	Processing form for author  Tiago Guerreiro ...already in forms. Add article  2010_0063
	Processing author ...  Hugo Nicolau ( hman@vimmi.inesc-id.pt ) already in authors
	Processing form for

	Processing author ...  Mauro Figueiredo ( mfiguei@ualg.pt ) added to authors
	Processing form for author  Mauro Figueiredo ...added to forms. Add article  2009_0038

Processing article ...  2009_0039 added to articles
	Processing author ...  Hugo Aguiar ( hugo.aguiar@gmail.com ) added to authors
	Processing form for author  Hugo Aguiar ...added to forms. Add article  2009_0039
	Processing author ...  Paulo Pombinho ( ppombinho@lasige.di.fc.ul.pt ) already in authors
	Processing form for author  Paulo Pombinho ...already in forms. Add article  2009_0039
	Processing author ...  Ana Paula Afonso ( apa@di.fc.ul.pt ) already in authors
	Processing form for author  Ana Paula Afonso ...already in forms. Add article  2009_0039

Processing article ...  2009_0040 added to articles
	Processing author ...  Carlos Urbano ( curbano@estg.ipleiria.pt  ) added to authors
	Processing form for author  Carlos Urbano ...added to forms. Add article  2009_0040
	Processing author ...  Luís Magalhães ( lmagal

	Processing author ...  Ricardo Abreu Lopes ( rval@ist.utl.pt ) added to authors
	Processing form for author  Ricardo Abreu Lopes ...added to forms. Add article  2008_0039
	Processing author ...  Manuel J. Fonseca ( mjf@inesc-id.pt ) already in authors
	Processing form for author  Manuel J. Fonseca ...already in forms. Add article  2008_0039

Processing article ...  2008_0040 added to articles
	Processing author ...  Duarte Gonçalves ( dnjg@di.fct.unl.pt ) added to authors
	Processing form for author  Duarte Gonçalves ...added to forms. Add article  2008_0040
	Processing author ...  Rui Jesus ( rjesus@deetc.isel.ipl.pt ) added to authors
	Processing form for author  Rui Jesus ...added to forms. Add article  2008_0040
	Processing author ...  Filipe Grangeiro ( isel.ipl.pt ) added to authors
	Processing author ...  Nuno Correia ( nmc@di.fct.unl.pt ) already in authors
	Processing form for author  Nuno Correia ...already in forms. Add article  2008_0040

Processing article ...  2008_0041 

In [6]:
# pickle dump the data structures
# Only the structures whose *_modified flag is set are written back.

pending_dumps = [
    (events_modified, events_fn, events),
    (articles_modified, articles_fn, articles),
    (authors_modified, authors_fn, authors),
    (art_aut_modified, art_aut_fn, art_aut),
    (forms_modified, forms_fn, forms),
]

# Loading tolerates missing pickle files, so the folder itself may not exist
# yet; create it before the first write instead of failing with
# FileNotFoundError after all the processing is done.
if any(was_modified for was_modified, _, _ in pending_dumps):
    pkl_dir = os.path.dirname(pkl_path)
    if pkl_dir:
        os.makedirs(pkl_dir, exist_ok=True)

for was_modified, pkl_fn, structure in pending_dumps:
    if was_modified:
        with open(pkl_path+pkl_fn,"wb") as pkl_out:
            pickle.dump(structure, pkl_out)

        

In [7]:
# closing message: the notebook ran to the end
print ("That's all, folks")

That's all, folks
