In [1]:
import pandas as pd
from os import getcwd
from os.path import join, abspath, pardir
import re
from random import choice

##### Configs

In [2]:
# Project root: two directory levels above the notebook's working directory.
parent_dir = abspath(join(getcwd(), pardir, pardir))
# Folder holding the raw data, and the publications CSV inside it.
data_dir = join(parent_dir, "data", "raw")
data_file = join(parent_dir, "data", "raw", "publications.csv")

##### Load data

In [3]:
# Read the publications CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=data_file)
# Preview the first three rows as a sanity check of the parsed columns.
df.head(n=3)

Unnamed: 0,Authors,Author(s) ID,Title,Year,Source title,Volume,Issue,Art. No.,Page start,Page end,...,Affiliations,Authors with affiliations,Abstract,Author Keywords,Index Keywords,Document Type,Publication Stage,Access Type,Source,EID
0,"Gautam A., Crandall J.W., Goodrich M.A.",57218202833;7004904337;7005513246;,Self-assessment of Proficiency of Intelligent ...,2021,Advances in Intelligent Systems and Computing,1210 AISC,,,108.0,113.0,...,"Computer Science Department, Brigham Young Uni...","Gautam, A., Computer Science Department, Brigh...","Autonomous systems, although capable of perfor...",Environment; Goal(s); Intelligent agents; Prof...,Drones; Human engineering; Intelligent systems...,Conference Paper,Final,,Scopus,2-s2.0-85088238482
1,"de Moura Oliveira P.B., Hedengren J.D., Boaven...",6508306234;9277159100;6507358470;,Bridging theory to practice: Feedforward and c...,2021,Lecture Notes in Electrical Engineering,695 LNEE,,,23.0,32.0,...,"INESC-TEC Technology and Science, Campus da FE...","de Moura Oliveira, P.B., INESC-TEC Technology ...",Practice is of the essence in Engineering cour...,,Automation; Cascade control systems; Computati...,Conference Paper,Final,,Scopus,2-s2.0-85091306533
2,"Hajimirzaie S.M., Hotchkiss R.H.",53879700900;26642910500;,Development of sediment management guidelines ...,2020,Journal of Hydraulic Engineering,146,12.0,2520004.0,,,...,"Task Committee Secretary and Lead Engineer, Op...","Hajimirzaie, S.M., Task Committee Secretary an...",Forum papers are thought-provoking opinion pie...,,Hydraulics; Editorial board; Sediment manageme...,Review,Final,,Scopus,2-s2.0-85091917202


##### Define all the columns needed

In [4]:
# Columns retained from the loaded frame; every other column is dropped.
__cols__ = [
    'Title',
    'Source title',
    'Authors',
    'Index Keywords',
    'Document Type',
    'Volume',
    'Year',
    'Abstract',
]
df = df.loc[:, __cols__]

##### Helper methods

In [5]:
def _split_authors(authors):
    """Return the individual names in one 'Authors' cell, or [] when the cell is missing."""
    # pandas yields NaN (a float) / None for empty cells; those have no .split().
    if not isinstance(authors, str):
        return []
    return authors.split(", ")


def _pick_committee(rng, persons, paper_authors, size=3):
    """Draw `size` people from `persons` to serve as reviewers/handler of one paper.

    People who authored the paper are excluded and nobody is picked twice.
    When fewer than `size` non-authors exist, falls back to independent draws
    from the whole pool (duplicates and authors are then possible).
    """
    conflicted = set(paper_authors)
    eligible = [person for person in persons if person not in conflicted]
    if len(eligible) >= size:
        return [str(person) for person in rng.sample(eligible, size)]
    return [str(rng.choice(persons)) for _ in range(size)]


def get_data(df, seed=None):
    """Expand a publications frame into one record per (paper, author) pair.

    Values read from the frame: 'Authors', 'Title', 'Index Keywords',
    'Source title', 'Volume', 'Year' and 'Abstract'. Everything else in a
    record (document type, paper type, conference type, reviewers, handler,
    decision) is drawn at random, once per paper, and repeated on each of
    that paper's author records.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns listed above. 'Authors' is expected to be a
        ", "-separated string; rows where it is missing are skipped.
    seed : int, optional
        When given, a private random generator seeded with this value is used,
        so the same frame always yields the same records. When omitted, the
        global ``random`` module state is used.

    Returns
    -------
    list of dict
        One dict per author of each paper; every value is a string. The 'no'
        key is "<row index>.<author position>".
    """
    import random  # function-scope import: this block needs Random/sample, not just `choice`

    rng = random.Random(seed) if seed is not None else random

    paper_types = ['Full Paper', 'Short Paper', 'Demo Paper']
    conference_types = ['Workshop', 'Symposium', 'Expert Group', 'Regular Conference']
    document_types = ['Conference', 'Journal']
    decision_types = ['Accepted', 'Rejected']

    # Pool of candidate reviewers/handlers: every distinct author name, in
    # order of first appearance.
    persons = list(dict.fromkeys(
        name
        for authors in df['Authors']
        for name in _split_authors(authors)
    ))

    records = []

    for index, row in df.iterrows():

        paper_authors = _split_authors(row['Authors'])
        if not paper_authors:
            # No author information: there is nobody to emit a record for.
            continue

        document_type = rng.choice(document_types)
        is_conference = document_type == 'Conference'

        # 'Poster' is only a possible paper type for conference documents.
        paper_type = rng.choice(paper_types + ['Poster']) if is_conference else rng.choice(paper_types)
        conference_type = rng.choice(conference_types) if is_conference else None

        reviewer_1, reviewer_2, handler = _pick_committee(rng, persons, paper_authors)

        areas = row['Index Keywords']
        # Normalise the keyword separator from "; " to ";"; missing keywords become None.
        areas = areas.replace("; ", ";") if isinstance(areas, str) else None
        decision = rng.choice(decision_types)

        source = row['Source title']
        publication = "{} {}".format(str(source), str(row['Volume']))

        for position, author in enumerate(paper_authors):
            records.append({
                'no': "{}.{}".format(index, position),
                'Author': str(author),
                'Paper': str(row['Title']),
                'Paper_Type': str(paper_type),
                # str(None) -> 'None' for journal documents, as in the original output.
                'Conference_Type': str(conference_type),
                'Year': str(row['Year']),
                'Source': str(source),
                'Publication': publication,
                'Document_Type': str(document_type),
                'Reviewer_1': reviewer_1,
                'Reviewer_2': reviewer_2,
                'Handler': handler,
                'Areas': str(areas),
                'Reviewer_Decision': str(decision),
                # The paper's abstract is used as the review text.
                'Reviewer_Text': str(row['Abstract']),
            })

    return records

In [6]:
# Build the per-author records, then tabulate them (one row per record).
data = get_data(df)
data_df = pd.DataFrame(data=data)

##### Save the result

In [7]:
# Write the generated records into the data directory, without the DataFrame index column.
data_df.to_csv(
    path_or_buf=join(data_dir, "instances_data.csv"),
    index=False,
)