In [1]:
import pandas as pd
from os import getcwd
from os.path import join, abspath, pardir
import re
from random import choice

##### Configs

In [2]:
# Project root: two directory levels above the current working directory.
parent_dir = abspath(join(getcwd(), pardir, pardir))
# The raw input lives under <project root>/data/raw.
data_dir = join(parent_dir, "data", "raw")
# CSV file read by the "Load data" cell.
data_file = join(data_dir, "publications.csv")

##### Load data

In [3]:
# Read the publications CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=data_file)
# Preview the first three rows as a quick sanity check of the load.
df.head(n=3)

Unnamed: 0,Authors,Author(s) ID,Title,Year,Source title,Volume,Issue,Art. No.,Page start,Page end,...,Affiliations,Authors with affiliations,Abstract,Author Keywords,Index Keywords,Document Type,Publication Stage,Access Type,Source,EID
0,"Gautam A., Crandall J.W., Goodrich M.A.",57218202833;7004904337;7005513246;,Self-assessment of Proficiency of Intelligent ...,2021,Advances in Intelligent Systems and Computing,1210 AISC,,,108.0,113.0,...,"Computer Science Department, Brigham Young Uni...","Gautam, A., Computer Science Department, Brigh...","Autonomous systems, although capable of perfor...",Environment; Goal(s); Intelligent agents; Prof...,Drones; Human engineering; Intelligent systems...,Conference Paper,Final,,Scopus,2-s2.0-85088238482
1,"de Moura Oliveira P.B., Hedengren J.D., Boaven...",6508306234;9277159100;6507358470;,Bridging theory to practice: Feedforward and c...,2021,Lecture Notes in Electrical Engineering,695 LNEE,,,23.0,32.0,...,"INESC-TEC Technology and Science, Campus da FE...","de Moura Oliveira, P.B., INESC-TEC Technology ...",Practice is of the essence in Engineering cour...,,Automation; Cascade control systems; Computati...,Conference Paper,Final,,Scopus,2-s2.0-85091306533
2,"Hajimirzaie S.M., Hotchkiss R.H.",53879700900;26642910500;,Development of sediment management guidelines ...,2020,Journal of Hydraulic Engineering,146,12.0,2520004.0,,,...,"Task Committee Secretary and Lead Engineer, Op...","Hajimirzaie, S.M., Task Committee Secretary an...",Forum papers are thought-provoking opinion pie...,,Hydraulics; Editorial board; Sediment manageme...,Review,Final,,Scopus,2-s2.0-85091917202


In [4]:
# Show every column name of the loaded frame as a plain Python list.
list(df.columns)

['Authors',
 'Author(s) ID',
 'Title',
 'Year',
 'Source title',
 'Volume',
 'Issue',
 'Art. No.',
 'Page start',
 'Page end',
 'Page count',
 'Cited by',
 'DOI',
 'Link',
 'Affiliations',
 'Authors with affiliations',
 'Abstract',
 'Author Keywords',
 'Index Keywords',
 'Document Type',
 'Publication Stage',
 'Access Type',
 'Source',
 'EID']

##### Define all the columns needed

In [5]:
# Columns kept for the rest of the notebook; all other columns of the loaded frame are dropped.
__cols__ = ['Title', 'Source title', 'Authors', 'Index Keywords', 'Document Type', 'Volume']
# Take an explicit copy: this frame is modified in place further down (the in-place
# rename, and helpers such as cast_columns that assign columns), and pandas can emit
# SettingWithCopyWarning when a frame produced by selection is assigned into.
df = df[__cols__].copy()

##### Helper methods

In [6]:
def cast_columns(df, cols, __type):
    """
    Convert the selected column(s) of ``df`` to ``__type``.

    The converted values are written back into ``df`` itself (the caller's
    frame is modified), and that same frame object is returned.
    """
    converted = df[cols].astype(__type)
    df[cols] = converted
    return df
def flatten(t):
    """Concatenate the items of every sub-iterable in ``t`` into one flat list (one level deep)."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

def split_str_and_concat(df, col_name, sep="; "):
    """
    Split a delimited string column and stack the pieces into one long Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to split.
    col_name : str
        Name of the string column whose values are split.
    sep : str, default "; "
        Delimiter handed to ``Series.str.split``.

    Returns
    -------
    pandas.Series
        One entry per piece, indexed by (row label of ``df``, position of the
        piece within that row). Rows whose value is missing contribute no entries.
    """
    # The result is indexed by the frame that was passed in. The previous version
    # used a global named `keywords_df` here, which fails with NameError unless
    # such a variable happens to exist in the kernel.
    # expand=True pads short rows and leaves missing rows empty instead of handing
    # a bare NaN to the DataFrame constructor.
    pieces = df[col_name].str.split(sep, expand=True)
    # dropna() removes the padding regardless of how stack() treats missing values.
    return pieces.stack().dropna()

def filter_countries(df):
    """
    Blank out entries of ``df['country']`` that do not look like a country name.

    An entry is replaced by ``None`` when it is missing, or when its string form
    contains a parenthesis, an ``@`` sign, or a digit. Every other entry is
    returned as its string form. This is a heuristic: per the original author's
    note it is not perfect but handles almost all (~99.9%) entries of this dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``country`` column.

    Returns
    -------
    pandas.Series
        New Series with one element per row of ``df``, in row order. It carries a
        fresh default index rather than ``df``'s index.
    """
    # One character class covers everything the filter rejects: "(", ")", "@", digits.
    regex = re.compile(r"[()@\d]")
    column = df['country']
    # Missing values are detected before str() is applied. Otherwise they would
    # become the literal text "nan" / "None" and pass the filter as if they were
    # a country name.
    return pd.Series([
        None if missing or regex.search(str(value)) else str(value)
        for value, missing in zip(column, column.isna())
    ])

def filter_department(df):
    """
    Pick the department-like part out of each entry of ``df['name']``.

    Each entry is converted to a string and split on commas; the first part
    containing "Dep" or "School" is returned unchanged (surrounding whitespace
    included), or ``None`` when no part qualifies. Heuristic only: per the
    original author's note it works for roughly 85%-90% of cases.

    Returns a new Series with one element per row, on a fresh default index.
    """
    def _first_match(affiliation):
        for part in affiliation.split(","):
            if "Dep" in part or "School" in part:
                return part
        return None

    departments = []
    for name in df['name']:
        departments.append(_first_match(str(name)))
    return pd.Series(departments)

In [7]:
# Switch to short snake_case column names; the frame is renamed in place.
df.rename(
    columns={
        'Title': 'title',
        'Source title': 'venue',
        'Authors': 'author',
        'Index Keywords': 'areas',
        'Document Type': 'document_type',
        # The raw "Volume" field is exposed under the name "publication".
        'Volume': 'publication',
    },
    inplace=True,
)
# Preview the renamed frame.
df.head()

Unnamed: 0,title,venue,author,areas,document_type,publication
0,Self-assessment of Proficiency of Intelligent ...,Advances in Intelligent Systems and Computing,"Gautam A., Crandall J.W., Goodrich M.A.",Drones; Human engineering; Intelligent systems...,Conference Paper,1210 AISC
1,Bridging theory to practice: Feedforward and c...,Lecture Notes in Electrical Engineering,"de Moura Oliveira P.B., Hedengren J.D., Boaven...",Automation; Cascade control systems; Computati...,Conference Paper,695 LNEE
2,Development of sediment management guidelines ...,Journal of Hydraulic Engineering,"Hajimirzaie S.M., Hotchkiss R.H.",Hydraulics; Editorial board; Sediment manageme...,Review,146
3,Structural design space exploration using prin...,Journal of Computing and Information Science i...,"Bunnell S., Gorrell S., Salmon J., Thelin C., ...",Compressors; Data handling; Structural design;...,Article,20
4,Religion-focused dating apps: A Q methodology ...,Telematics and Informatics,"Richardson M., Cannon S., Teichert L., Vance A...",Entertainment; A-RINGS; External pressures; Ro...,Article,55


In [69]:
# One row per (publication, author): split each ", "-separated author string,
# stack the pieces into long form, and keep the original row label as the index.
author_df = (
    pd.DataFrame(df['author'].str.split(', ').tolist(), index=df.index)
    .stack()
    .reset_index()
    .set_index('level_0')
    # 'level_1' is the author's position within the row; it is not needed here.
    .drop(columns='level_1')
)
# Distinct author names, in order of first appearance.
authors = author_df[0].drop_duplicates().tolist()

In [70]:
# Spot-check the extracted names by displaying one author picked at random.
# No random seed is set in the visible cells, so the name shown can differ between runs.
choice(authors)

'Pack A.T.'

In [82]:
# Disabled draft: an attempt to build a de-duplicated `areas` list the same way
# the `authors` list is built.
# NOTE(review): as written it assigns nothing to `area_df` and never calls
# reset_index(), so it would need rework before being re-enabled.
# area_df =
# pd.DataFrame(df['areas'].str.split('; ').tolist(), index=df.index).stack()
# area_df.drop('level_1', axis=1, inplace=True)
# areas = list(area_df[0].drop_duplicates())

# Show the per-row keyword lists obtained by splitting `areas` on "; ".
df['areas'].str.split('; ').tolist()

[['Drones',
  'Human engineering',
  'Intelligent systems',
  'Real time systems',
  'Security of data',
  'Autonomous systems',
  'Causal relationships',
  'Decision mechanism',
  'Dependency graphs',
  'Real-time application',
  'Self assessment',
  'Social robots'],
 ['Automation',
  'Cascade control systems',
  'Computation theory',
  'Process control',
  'Soft computing',
  'Students',
  'Cascade control',
  'Cascade control structure',
  'Control concept',
  'Control techniques',
  'Engineering course',
  'Proportional , integral and derivative controllers',
  'Teaching/learning',
  'Theory and practice',
  'Control theory'],
 ['Hydraulics',
  'Editorial board',
  'Sediment management',
  'Reservoir management',
  'guideline',
  'reservoir',
  'sediment'],
 ['Compressors',
  'Data handling',
  'Structural design',
  'Compressor blades',
  'Computational costs',
  'Design space exploration',
  'Design variations',
  'Geometric variations',
  'Principal Components',
  'Stress varia