In [1]:
import csv
import os
import pickle
import re  # string processing


In [2]:
# csv files
# Directory holding the input CSV files. os.path.join(..., '') yields the
# directory name followed by the platform's separator ('csv\\' on Windows,
# 'csv/' elsewhere), so the plain string concatenation used to build file
# names keeps working on every OS.
csv_path = os.path.join('csv', '')

# Base names (without extension) of the CSV files to import, in processing order.
# To import a single additional file, replace the list, e.g. ['2008Interacao'].
csv_files = ['2003EPCG','2004Interacao','2005EPCG','2006Interacao','2007EPCG']
csv_extension = '.csv'



In [3]:
# pkl files and data structures
# Directory holding the pickled tables; built with os.path.join so the trailing
# separator is correct on every OS ('pkl\\' on Windows, 'pkl/' elsewhere).
pkl_path = os.path.join('pkl', '')


def load_pickle_or_empty(path):
    """
    Load a previously pickled table from ``path``.

    Returns the unpickled object, or a new empty list when the file does not
    exist yet (first run). Any other error (e.g. a corrupted file) propagates.

    SECURITY: pickle.load can execute arbitrary code, so only files written
    by this notebook (or another trusted source) must be placed in pkl_path.
    """
    try:
        with open(path, "rb") as pkl_in:
            return pickle.load(pkl_in)
    except FileNotFoundError:
        return []


# Each table below has a file name, a "modified" flag that the import loop
# sets when it changes the table (so only changed tables are re-written), and
# the table itself.

# events: list of dicts, one per event
events_fn = 'events.pkl'
events_modified = False
events = load_pickle_or_empty(pkl_path+events_fn)
print ('There are {0} events!'.format(len(events)))

# articles: list of dicts, one per article
articles_fn = 'articles.pkl'
articles_modified = False
articles = load_pickle_or_empty(pkl_path+articles_fn)
print ('There are {0} articles!'.format(len(articles)))

# authors: list of dicts, one per author
authors_fn = 'authors.pkl'
authors_modified = False
authors = load_pickle_or_empty(pkl_path+authors_fn)
print ('There are {0} authors!'.format(len(authors)))

# articles/authors relationships: list of (article ID, author ID) tuples
art_aut_fn = 'art_aut.pkl'
art_aut_modified = False
art_aut = load_pickle_or_empty(pkl_path+art_aut_fn)
print ('There are {0} articles-authors relationships!'.format(len(art_aut)))

# forms: list of dicts, one per author that has a contact e-mail
forms_fn = 'forms.pkl'
forms_modified = False
forms = load_pickle_or_empty(pkl_path+forms_fn)
print ('There are {0} forms!'.format(len(forms)))


There are 0 events!
There are 0 articles!
There are 0 authors!
There are 0 articles-authors relationships!
There are 0 forms!


In [4]:
import time
import random
import hashlib
import unicodedata

# Use this function to generate an ID for each author
# use name as arg to generate more randomness

def guid( arg='' ):
    """
    Generate a 32-character hexadecimal ID.

    The ID is the MD5 hex digest of the current time (in milliseconds), a
    random integer and ``arg``. It is very unlikely, but not guaranteed, to
    be unique; MD5 is used only to mix the inputs, not for security.

    arg -- optional extra entropy (e.g. the author's name). It is converted
           with str(), so non-string values are accepted. Characters that
           have no ASCII form after NFKD normalisation are dropped before
           hashing.

    Returns the ID as a str of 32 lowercase hexadecimal digits.
    """
    t = int( time.time() * 1000 )
    r = int( random.random()*10000000 )
    # str(arg) keeps the call from failing on non-string arguments
    data = str(t)+' '+str(r)+' '+str(arg)
    # hashlib needs bytes: decompose accented characters, then keep ASCII only
    data = unicodedata.normalize('NFKD', data).encode('ascii', 'ignore')
    data = hashlib.md5(data).hexdigest()

    return data

In [5]:
# main loop
# Imports every configured CSV file into the in-memory tables (events,
# articles, authors, art_aut, forms). Entries that are already present are
# left untouched, and a table's *_modified flag is set only when the table
# actually changes, so the cell can be re-run on already-imported files.
for csv_file in csv_files:
    print ('*** Processing file {0} ***'.format(csv_file))

    csv_fileN = csv_path + csv_file + csv_extension
    # read the csv file
    try:
        # newline='' is what the csv module requires of the file object, so
        # that line endings inside quoted fields are interpreted correctly.
        # NOTE(review): no encoding is passed, so the platform default is used
        # to decode the file -- confirm it matches how the CSVs were exported.
        with open(csv_fileN,'rt', newline='') as csv_in:

            for rownum, row in enumerate(csv.reader(csv_in, delimiter=";")):

                # the header row only provides the indexes of the required fields
                if rownum==0:
                    # event
                    Acronym_col = row.index ('Acronym')
                    Year_col = row.index ('Year')
                    Event_col = row.index ('Event')
                    # article
                    Type_col = row.index ('Type')
                    Title_col = row.index ('Title')
                    File_col = row.index ('File')
                    ArticleKey_col = row.index ('ArticleKey')
                    # authors col indexes: Author1..., Author2..., and so on,
                    # until one of the three columns of an author is missing
                    authors_col = []
                    au_n = 1
                    while True:
                        LN = 'Author' + str(au_n) + 'LastName'
                        FN = 'Author' + str(au_n) + 'FirstName'
                        Contact = 'Author' + str(au_n) + 'Contact'
                        try:
                            LN_col = row.index (LN)
                            FN_col = row.index (FN)
                            contact_col = row.index (Contact)
                        except ValueError:
                            break
                        authors_col.append((LN_col, FN_col, contact_col))
                        au_n += 1
                    continue

                # for the first data row verify the event
                if rownum==1:
                    Acronym = row[Acronym_col]
                    # verify if this event already exists in the events data structure
                    event_exists = any(event['acr']==Acronym for event in events)
                    if not event_exists:  # append event to events
                        event_dict = {'acr':Acronym, 'name':row[Event_col],
                                      'year':row[Year_col], 'state': 'loaded'}
                        events.append(event_dict)
                        events_modified = True

                # only rows describing an article are imported
                if row[Type_col]!='Article':
                    continue
                # article data
                ArticleKey = row[ArticleKey_col]
                print ("Processing article ... ", ArticleKey, end = ' ')
                # verify if this article already exists in the articles data structure
                article_exists = any(article['ID']==ArticleKey for article in articles)
                if not article_exists:  # append article to articles
                    Acronym = row[Acronym_col]
                    Title = row[Title_col]
                    File = row[File_col]
                    art_dict = {'ID':ArticleKey, 'event':Acronym,
                           'title':Title, 'file':File, 'state':'loaded'}
                    articles.append(art_dict)
                    articles_modified = True
                    print ("added to articles")
                else:
                    print ("already in articles")
                # authors data: collect (name, contact) for every author of this row
                this_authors = []
                for LN_col, FN_col, contact_col in authors_col:
                    if row[LN_col]=="": break  # last author was previous
                    # trim the name parts and collapse runs of whitespace
                    # (raw string: "\s" is not a valid str escape sequence)
                    LN = re.sub(r"\s\s+", " ", row[LN_col].strip())
                    FN = re.sub(r"\s\s+", " ", row[FN_col].strip())
                    Name = FN+' '+LN
                    # the contact is stripped so that the stored e-mail carries
                    # no leading/trailing blanks
                    this_authors.append((Name,row[contact_col].strip()))

                for AuthorName, AuthorContact in this_authors:
                    print ("\tProcessing author ... ", AuthorName, '(', AuthorContact,')', end = ' ')
                    # verify if this author already exists in the authors data structure
                    # (authors are matched by their normalised full name)
                    author_exists = False
                    for author in authors:
                        if author['name']==AuthorName:
                            author_exists = True
                            AuthorKey = author['ID']
                            print ("already in authors")
                            break
                    if not author_exists:  # append author to authors
                        AuthorKey = guid (AuthorName)
                        aut_dict = {'ID':AuthorKey, 'name':AuthorName,
                                   'email':AuthorContact, 'state':'loaded'}
                        authors.append(aut_dict)
                        authors_modified = True
                        print ("added to authors")

                    # add to the art_aut table, unless the pair is already there
                    if (ArticleKey,AuthorKey) not in art_aut:
                        art_aut.append((ArticleKey,AuthorKey))
                        art_aut_modified = True

                    # Forms
                    # there will only be a form for the current author
                    # if its email exists
                    if '@' in AuthorContact:
                        # verify whether there is already a form for this author
                        form_exists = False
                        print ("\tProcessing form for author ", AuthorName, end = ' ...')
                        for form in forms:
                            if form['ID']==AuthorKey:
                                form_exists=True
                                # record the article only once, so that re-importing
                                # a file neither duplicates it nor marks forms as
                                # modified when nothing changed
                                if ArticleKey not in form['articles']:
                                    form['articles'].append(ArticleKey)
                                    forms_modified = True
                                print ("already in forms. Add article ", ArticleKey)
                                break
                        if not form_exists:
                            form_dict = {'ID':AuthorKey, 'articles':[ArticleKey],
                                        'file':AuthorName+'-form.pdf',
                                        'state':'loaded'}
                            forms.append(form_dict)
                            forms_modified = True
                            print ("added to forms. Add article ", ArticleKey)
                # end authors loop
                print ()

            # end loop through csv rows
        # end opening csv_in
    except FileNotFoundError:
        print ('File {0} not found!'.format(csv_fileN))
        continue

    print ('*** Finished file {0} ***\n'.format(csv_file))
    # end loop through csv files


*** Processing file 2003EPCG ***
Processing article ...  2003_0006 added to articles
	Processing author ...  Frutuoso G. M. Silva ( fsilva@di.ubi.pt  ) added to authors
	Processing form for author  Frutuoso G. M. Silva ...added to forms. Add article  2003_0006
	Processing author ...  Abel J. P. Gomes ( agomes@di.ubi.pt ) added to authors
	Processing form for author  Abel J. P. Gomes ...added to forms. Add article  2003_0006

Processing article ...  2003_0007 added to articles
	Processing author ...  Francisco Morgado ( fmorgado@di.estv.ipv.pt ) added to authors
	Processing form for author  Francisco Morgado ...added to forms. Add article  2003_0007
	Processing author ...  Abel Gomes ( agomes@di.ubi.pt ) added to authors
	Processing form for author  Abel Gomes ...added to forms. Add article  2003_0007

Processing article ...  2003_0008 added to articles
	Processing author ...  Bruno Rodrigues Araújo ( brar@immi.inesc.pt ) added to authors
	Processing form for author  Bruno Rodrigues Ara

	Processing form for author  Anabela Simões ...added to forms. Add article  2004_0016
	Processing author ...  José Carvalhais ( jcarvalhais@fmh.utl.pt  ) added to authors
	Processing form for author  José Carvalhais ...added to forms. Add article  2004_0016
	Processing author ...  Ana Gomes ( anagomes37@hotmail.com  ) added to authors
	Processing form for author  Ana Gomes ...added to forms. Add article  2004_0016

Processing article ...  2004_0017 added to articles
	Processing author ...  Francisco dos S. Rebelo ( frebelo@frnh.utl.pt  ) added to authors
	Processing form for author  Francisco dos S. Rebelo ...added to forms. Add article  2004_0017
	Processing author ...  Ernesto Vilar Filgueiras ( ernestovilar@uol.corn.br ) added to authors
	Processing form for author  Ernesto Vilar Filgueiras ...added to forms. Add article  2004_0017
	Processing author ...  Raquel J. H. R Santos ( rsantos@frnh.utl.pt  ) added to authors
	Processing form for author  Raquel J. H. R Santos ...added to fo

	Processing form for author  Joana Pereira ...added to forms. Add article  2004_0034
	Processing author ...  Carriço Luís (   lmc@di.fo.ui.pt ) added to authors
	Processing form for author  Carriço Luís ...added to forms. Add article  2004_0034

Processing article ...  2004_0036 added to articles
	Processing author ...  Daniel Gonçalves ( djvg@gia.ist.utl.pt ) already in authors
	Processing form for author  Daniel Gonçalves ...already in forms. Add article  2004_0036
	Processing author ...  Joaquim A. Jorge ( jorgej@acm.org  ) added to authors
	Processing form for author  Joaquim A. Jorge ...added to forms. Add article  2004_0036

Processing article ...  2004_0037 added to articles
	Processing author ...  Cristina Gouveia ( gouveia@alum.mit.edu ) added to authors
	Processing form for author  Cristina Gouveia ...added to forms. Add article  2004_0037
	Processing author ...  Maria João Silva ( mjosilva@e se. i pp.pt  ) added to authors
	Processing form for author  Maria João Silva ...add

	Processing form for author  José Nunes ...added to forms. Add article  2004_0051
	Processing author ...  Leonor Teixeira (  lteixeira@egi.ua.pt  ) added to authors
	Processing form for author  Leonor Teixeira ...added to forms. Add article  2004_0051
	Processing author ...  Óscar Mealha ( oern@ca.ua.pt  ) added to authors
	Processing form for author  Óscar Mealha ...added to forms. Add article  2004_0051
	Processing author ...  Beatriz Sousa Santos ( bss@ieeta.pt  ) already in authors
	Processing form for author  Beatriz Sousa Santos ...already in forms. Add article  2004_0051

Processing article ...  2004_0052 added to articles
	Processing author ...  Hugo Miguel Gonçalves Rego ( hugo_rego@esav.ipv.pt ) added to authors
	Processing form for author  Hugo Miguel Gonçalves Rego ...added to forms. Add article  2004_0052
	Processing author ...  Tiago Henrique R.S. L. Moreira (  tiagorn@di.estv.ipv.pt ) added to authors
	Processing form for author  Tiago Henrique R.S. L. Moreira ...added t

	Processing author ...  Próspero Santos ( ps@di.fct.unl.pt ) added to authors
	Processing form for author  Próspero Santos ...added to forms. Add article  2005_0016
	Processing author ...  Gil Gonçalves ( gil@mat.uc.pt ) added to authors
	Processing form for author  Gil Gonçalves ...added to forms. Add article  2005_0016

Processing article ...  2005_0017 added to articles
	Processing author ...  Samuel Silva ( sss@ieeta.pt ) already in authors
	Processing form for author  Samuel Silva ...already in forms. Add article  2005_0017
	Processing author ...  Joaquim Madeira ( jmadeira@det.ua.pt ) already in authors
	Processing form for author  Joaquim Madeira ...already in forms. Add article  2005_0017
	Processing author ...  Carlos Ferreira ( carlosf@egi.ua.pt ) added to authors
	Processing form for author  Carlos Ferreira ...added to forms. Add article  2005_0017
	Processing author ...  Beatriz Sousa Santos ( bss@det.ua.pt ) already in authors
	Processing form for author  Beatriz Sousa San

	Processing author ...  Tiago Guerreiro ( tjvg@immi.inesc-id.pt ) added to authors
	Processing form for author  Tiago Guerreiro ...added to forms. Add article  2005_0031
	Processing author ...  Ricardo Jota ( rjc@immi.inesc-id.pt ) added to authors
	Processing form for author  Ricardo Jota ...added to forms. Add article  2005_0031
	Processing author ...  Joaquim A. Jorge ( jorgej@acm.org ) already in authors
	Processing form for author  Joaquim A. Jorge ...already in forms. Add article  2005_0031
	Processing author ...  João M. Pereira (  ) added to authors

Processing article ...  2005_0032 added to articles
	Processing author ...  Joaquim A. Jorge ( jaj@inesc-id.pt ) already in authors
	Processing form for author  Joaquim A. Jorge ...already in forms. Add article  2005_0032
	Processing author ...  Manuel J. Fonseca ( mjf@inesc-id.pt  ) already in authors
	Processing form for author  Manuel J. Fonseca ...already in forms. Add article  2005_0032
	Processing author ...  Filipe M Garcia 

	Processing author ...  Luís Carriço ( lmc@di.fc.ul.pt ) already in authors
	Processing form for author  Luís Carriço ...already in forms. Add article  2006_0018
	Processing author ...  Luís Filipe Duarte ( lduarte@lasige.di.fc.ul.pt ) added to authors
	Processing form for author  Luís Filipe Duarte ...added to forms. Add article  2006_0018
	Processing author ...  David Cruz ( dcruz@lasige.di.fc.ul.pt ) added to authors
	Processing form for author  David Cruz ...added to forms. Add article  2006_0018
	Processing author ...  Cátia Torres ( ctorres@lasige.di.fc.ul.pt ) added to authors
	Processing form for author  Cátia Torres ...added to forms. Add article  2006_0018

Processing article ...  2006_0019 added to articles
	Processing author ...  Luís Carriço ( lmc@di.fc.ul.pt ) already in authors
	Processing form for author  Luís Carriço ...already in forms. Add article  2006_0019
	Processing author ...  Marco Sá (  marcosa@di.fc.ul.pt ) already in authors
	Processing form for author  Marc

	Processing author ...  Pedro Semião ( i21197@alunos.di.fc.ul.pt ) added to authors
	Processing form for author  Pedro Semião ...added to forms. Add article  2006_0032
	Processing author ...  Maria Beatriz ( bc@di.fc.ul.pt  ) added to authors
	Processing form for author  Maria Beatriz ...added to forms. Add article  2006_0032
	Processing author ...  Ana Paula Cláudio (  apc@di.fc.ul.pt  ) added to authors
	Processing form for author  Ana Paula Cláudio ...added to forms. Add article  2006_0032

Processing article ...  2006_0033 added to articles
	Processing author ...  Janete Faustino ( e25526@alunos.di.fc.ul.pt ) added to authors
	Processing form for author  Janete Faustino ...added to forms. Add article  2006_0033
	Processing author ...  Ana Paula Cláudio (  apc@di.fc.ul.pt  ) already in authors
	Processing form for author  Ana Paula Cláudio ...already in forms. Add article  2006_0033
	Processing author ...  Maria Beatriz ( bc@di.fc.ul.pt  ) already in authors
	Processing form for aut

	Processing form for author  Adérito F. Marcos ...added to forms. Add article  2006_0046

Processing article ...  2006_0047 added to articles
	Processing author ...  Samuel Silva (  sss@ieeta.pt ) already in authors
	Processing form for author  Samuel Silva ...already in forms. Add article  2006_0047
	Processing author ...  Carlos Ferreira ( cferreira@egi.ua.pt ) already in authors
	Processing form for author  Carlos Ferreira ...already in forms. Add article  2006_0047
	Processing author ...  Joaquim Madeira ( jmadeira@det.ua.pt ) already in authors
	Processing form for author  Joaquim Madeira ...already in forms. Add article  2006_0047
	Processing author ...  Beatriz Sousa Santos ( bss@det.ua.pt ) already in authors
	Processing form for author  Beatriz Sousa Santos ...already in forms. Add article  2006_0047

Processing article ...  2006_0048 added to articles
	Processing author ...  Helder Pinto ( helder@dsi.uminho.pt ) added to authors
	Processing form for author  Helder Pinto ...ad

	Processing author ...  João Luís Sobral ( jls@di .uminho .pt  ) added to authors
	Processing form for author  João Luís Sobral ...added to forms. Add article  2007_0013

Processing article ...  2007_0014 added to articles
	Processing author ...  R. Bastos (  ) added to authors
	Processing author ...  M.S. Dias (  ) added to authors

Processing article ...  2007_0015 added to articles
	Processing author ...  Vasco Gervásio ( vmrg@immi.inesc-id.pt ) added to authors
	Processing form for author  Vasco Gervásio ...added to forms. Add article  2007_0015
	Processing author ...  Joaquim A. Jorge ( jorgej@acm.org  ) already in authors
	Processing form for author  Joaquim A. Jorge ...already in forms. Add article  2007_0015

Processing article ...  2007_0017 added to articles
	Processing author ...  Samuel Silva ( sss@ua.pt  ) already in authors
	Processing form for author  Samuel Silva ...already in forms. Add article  2007_0017
	Processing author ...  Joaquim Madeira ( jmadeira@ua.pt  ) alre

In [6]:
# pickle dump the data structures
# Only the tables flagged as modified by the import loop are written back.
for table_modified, table_fn, table in (
        (events_modified, events_fn, events),
        (articles_modified, articles_fn, articles),
        (authors_modified, authors_fn, authors),
        (art_aut_modified, art_aut_fn, art_aut),
        (forms_modified, forms_fn, forms)):
    if not table_modified:
        continue
    table_path = pkl_path+table_fn
    # On a first run the output directory may not exist yet (the loader
    # tolerates missing files), so create it instead of failing on open().
    table_dir = os.path.dirname(table_path)
    if table_dir:
        os.makedirs(table_dir, exist_ok=True)
    with open(table_path,"wb") as table_out:
        pickle.dump(table, table_out)

        

In [None]:
# closing message: signals that the whole notebook ran to the end
print ("That's all, folks")