In [1]:
import pandas as pd
from os import getcwd
from os.path import join, abspath, pardir
import re

##### Configs

In [2]:
parent_dir = abspath(join(join(getcwd(), pardir), pardir))
data_dir = join(parent_dir, "data")
scripts_dir = join(parent_dir, "src", "scripts")
data_file = join(data_dir, "publications.csv")

##### Load data

In [3]:
df = pd.read_csv(data_file)
df.head(3)

Unnamed: 0,Authors,Author(s) ID,Title,Year,Source title,Volume,Issue,Art. No.,Page start,Page end,...,Affiliations,Authors with affiliations,Abstract,Author Keywords,Index Keywords,Document Type,Publication Stage,Access Type,Source,EID
0,"Gautam A., Crandall J.W., Goodrich M.A.",57218202833;7004904337;7005513246;,Self-assessment of Proficiency of Intelligent ...,2021,Advances in Intelligent Systems and Computing,1210 AISC,,,108.0,113.0,...,"Computer Science Department, Brigham Young Uni...","Gautam, A., Computer Science Department, Brigh...","Autonomous systems, although capable of perfor...",Environment; Goal(s); Intelligent agents; Prof...,Drones; Human engineering; Intelligent systems...,Conference Paper,Final,,Scopus,2-s2.0-85088238482
1,"de Moura Oliveira P.B., Hedengren J.D., Boaven...",6508306234;9277159100;6507358470;,Bridging theory to practice: Feedforward and c...,2021,Lecture Notes in Electrical Engineering,695 LNEE,,,23.0,32.0,...,"INESC-TEC Technology and Science, Campus da FE...","de Moura Oliveira, P.B., INESC-TEC Technology ...",Practice is of the essence in Engineering cour...,,Automation; Cascade control systems; Computati...,Conference Paper,Final,,Scopus,2-s2.0-85091306533
2,"Hajimirzaie S.M., Hotchkiss R.H.",53879700900;26642910500;,Development of sediment management guidelines ...,2020,Journal of Hydraulic Engineering,146,12.0,2520004.0,,,...,"Task Committee Secretary and Lead Engineer, Op...","Hajimirzaie, S.M., Task Committee Secretary an...",Forum papers are thought-provoking opinion pie...,,Hydraulics; Editorial board; Sediment manageme...,Review,Final,,Scopus,2-s2.0-85091917202


##### Define all the columns needed

In [4]:
journals_cols = ['Year', 'Source title', 'Volume']
keyword_cols = ['Author Keywords', 'Index Keywords']
affiliation_cols = ['Affiliations']
authors_cols = ['Authors', 'Author(s) ID', 'Affiliations', 'Authors with affiliations', 'Author Keywords']
document_cols = ['Author(s) ID', 'Title', 'Source title', 'Art. No.', 'Cited by', 'DOI', 'Abstract', 'Author Keywords', 'Index Keywords', 'Document Type']

##### Helper methods

In [180]:
def cast_columns(df, cols, __type):
    df[cols] = df[cols].astype(__type)
    return df
def flatten(t):
    return [item for sublist in t for item in sublist]

def split_str_and_concat(df, col_name, sep="; "):
    return pd.DataFrame(df[col_name].str.split(sep).tolist(), index=keywords_df.index).stack()

def filter_countries(df):
    """
    Not perfect but filter out almost 99.9% countries for our dataset
    """
    # Find "(" or ")" or any digit
    regex = re.compile(r"(\(|\))|(\@)|(\d+)+", re.S)
    def _filter(regex, x):
        return None if regex.search(x) else x
    return pd.Series([_filter(regex, str(x)) for x in df['country']])

def filter_department(df):
    """
    Not perfect solution but good for more than 85%-90% cases. 
    """
    def _filter(a):
        aa = [x for x in a.split(",") if str(x).find("Dep") != -1 or str(x).find("School") != -1]
        return aa[0] if len(aa) else None
    return pd.Series([_filter(str(x)) for x in df['name']])

`Journals`

In [6]:
journals_df = df[journals_cols]
journals_df.dtypes

Year             int64
Source title    object
Volume          object
dtype: object

In [7]:
journals_df.head()

Unnamed: 0,Year,Source title,Volume
0,2021,Advances in Intelligent Systems and Computing,1210 AISC
1,2021,Lecture Notes in Electrical Engineering,695 LNEE
2,2020,Journal of Hydraulic Engineering,146
3,2020,Journal of Computing and Information Science i...,20
4,2020,Telematics and Informatics,55


In [8]:
journals_df = cast_columns(journals_df, ['Source title', 'Volume'], pd.StringDtype())
journals_df.rename(columns={'Year': 'year', 'Source title': 'name', 'Volume': 'volume'}, inplace=True)
journals_df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[cols] = df[cols].astype(__type)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  journals_df.rename(columns={'Year': 'year', 'Source title': 'name', 'Volume': 'volume'}, inplace=True)


year       int64
name      string
volume    string
dtype: object

In [9]:
journals_df.head()

Unnamed: 0,year,name,volume
0,2021,Advances in Intelligent Systems and Computing,1210 AISC
1,2021,Lecture Notes in Electrical Engineering,695 LNEE
2,2020,Journal of Hydraulic Engineering,146
3,2020,Journal of Computing and Information Science i...,20
4,2020,Telematics and Informatics,55


Which column has null values ??

In [10]:
journals_df.isnull().any()

year      False
name      False
volume     True
dtype: bool

See some samples of `volume` being null

In [13]:
journals_df[journals_df['volume'].isnull()].head()

Unnamed: 0,year,name,volume
22,2020,Proceedings - 2020 IEEE 21st International Con...,
50,2020,Proceedings - 2020 IEEE 34th International Par...,
95,2020,Journal of Composite Materials,
96,2020,Computing in Science and Engineering,
101,2020,International Journal of Robotics Research,


Save as `journals.csv` file

In [14]:
journals_df.to_csv(join(data_dir, "journals.csv"), index=False)

`Keywords`

In [15]:
keywords_df = df[keyword_cols]
keywords_df.dtypes

Author Keywords    object
Index Keywords     object
dtype: object

In [16]:
keywords_df = cast_columns(keywords_df, keyword_cols, pd.StringDtype())
keywords_df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[cols] = df[cols].astype(__type)


Author Keywords    string
Index Keywords     string
dtype: object

In [17]:
keywords_df.head()

Unnamed: 0,Author Keywords,Index Keywords
0,Environment; Goal(s); Intelligent agents; Prof...,Drones; Human engineering; Intelligent systems...
1,,Automation; Cascade control systems; Computati...
2,,Hydraulics; Editorial board; Sediment manageme...
3,Computer-aided engineering; Data-driven engine...,Compressors; Data handling; Structural design;...
4,Dating; Emerging adults; Mobile phones; Uses a...,Entertainment; A-RINGS; External pressures; Ro...


In [19]:
keywords_df.dropna(inplace=True)
author_keywords = split_str_and_concat(keywords_df, 'Author Keywords')
index_keywords = split_str_and_concat(keywords_df, 'Index Keywords')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  keywords_df.dropna(inplace=True)


In [20]:
all_keywords = author_keywords.append(index_keywords)
all_keywords.drop_duplicates(keep='first', inplace=True)

  all_keywords = author_keywords.append(index_keywords)


In [21]:
keywords_df = pd.DataFrame({ 'name': all_keywords })
keywords_df.head()

Unnamed: 0,Unnamed: 1,name
0,0,Environment
0,1,Goal(s)
0,2,Intelligent agents
0,3,Proficiency
0,4,Self-assessment


Save as `keywords.csv` file

In [22]:
keywords_df.to_csv(join(data_dir, "keywords.csv"), index=False)

`Affiliations`

In [57]:
affiliations_df = df[affiliation_cols]
affiliations_df.dtypes

Affiliations    object
dtype: object

In [58]:
affiliations_df = cast_columns(affiliations_df, affiliation_cols, pd.StringDtype())
affiliations_df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[cols] = df[cols].astype(__type)


Affiliations    string
dtype: object

In [59]:
all_affiliations = pd.DataFrame(affiliations_df['Affiliations'].str.split("; ").tolist()).stack()
all_affiliations.drop_duplicates(keep='first', inplace=True)
affiliations_df = all_affiliations.to_frame(name='name').reset_index(drop=True)
affiliations_df['country'] = affiliations_df['name'].str.rsplit(',', n=2, expand=True)[2]
affiliations_df['country'] = filter_countries(affiliations_df)

In [60]:
affiliations_df['dept_name'] = filter_department(affiliations_df)

In [61]:
affiliations_df.head()

Unnamed: 0,name,country,dept_name
0,"Computer Science Department, Brigham Young Uni...",United States,Computer Science Department
1,"INESC-TEC Technology and Science, Campus da FE...",Portugal,
2,"Department of Engineering, University of Trás-...",Portugal,Department of Engineering
3,"Department of Chemical Engineering, Brigham Yo...",United States,Department of Chemical Engineering
4,"Task Committee Secretary and Lead Engineer, Op...",United States,


Save as `affiliations.csv` file

In [62]:
affiliations_df.to_csv(join(data_dir, "affiliations.csv"), index=False)

`Documents`

In [37]:
document_df = df[document_cols]

In [38]:
document_df.rename(columns=
              {'DOI':'doi', 
               'Author(s) ID': 'author_ids', 
               'Art. No.':'article_no', 
               'Title':'title',
               'Abstract':'abstract',
               'Author Keywords':'author_keywords',
               'Index Keywords':'index_keywords',
               'Document Type':'document_type',
               'Cited by':'cited_count',
               'Source title':'source_title'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  document_df.rename(columns=


In [39]:
document_df.head(2)

Unnamed: 0,author_ids,title,source_title,article_no,cited_count,doi,abstract,author_keywords,index_keywords,document_type
0,57218202833;7004904337;7005513246;,Self-assessment of Proficiency of Intelligent ...,Advances in Intelligent Systems and Computing,,,10.1007/978-3-030-51758-8_15,"Autonomous systems, although capable of perfor...",Environment; Goal(s); Intelligent agents; Prof...,Drones; Human engineering; Intelligent systems...,Conference Paper
1,6508306234;9277159100;6507358470;,Bridging theory to practice: Feedforward and c...,Lecture Notes in Electrical Engineering,,,10.1007/978-3-030-58653-9_3,Practice is of the essence in Engineering cour...,,Automation; Cascade control systems; Computati...,Conference Paper


In [40]:
document_df['author_ids'] = document_df['author_ids'].str.split(";").apply(lambda x: list(x[0: len(x) - 1]))
document_df.columns.to_list()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  document_df['author_ids'] = document_df['author_ids'].str.split(";").apply(lambda x: list(x[0: len(x) - 1]))


['author_ids',
 'title',
 'source_title',
 'article_no',
 'cited_count',
 'doi',
 'abstract',
 'author_keywords',
 'index_keywords',
 'document_type']

In [41]:
document_df['keywords'] = document_df['author_keywords'] + document_df['index_keywords']
document_df['keywords'] = document_df['keywords'].str.split(";")
document_df.drop(columns=['author_keywords', 'index_keywords'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  document_df['keywords'] = document_df['author_keywords'] + document_df['index_keywords']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  document_df['keywords'] = document_df['keywords'].str.split(";")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  document_df.drop(columns=['author_keywords', 'index_keywords'], inplace=True)


In [77]:
document_df.head(2)

Unnamed: 0,author_ids,title,source_title,article_no,cited_count,doi,abstract,document_type,keywords
0,"[57218202833, 7004904337, 7005513246]",Self-assessment of Proficiency of Intelligent ...,Advances in Intelligent Systems and Computing,,,10.1007/978-3-030-51758-8_15,"Autonomous systems, although capable of perfor...",Conference Paper,"[Environment, Goal(s), Intelligent agents, ..."
1,"[6508306234, 9277159100, 6507358470]",Bridging theory to practice: Feedforward and c...,Lecture Notes in Electrical Engineering,,,10.1007/978-3-030-58653-9_3,Practice is of the essence in Engineering cour...,Conference Paper,


Since, we have NaN values for `doi`, so we are using index as an identifier

In [87]:
document_df.reset_index(inplace=True)
document_df.rename(columns={'index':'document_id'}, inplace=True)
document_df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  document_df.rename(columns={'index':'document_id'}, inplace=True)


Unnamed: 0,document_id,author_ids,title,source_title,article_no,cited_count,doi,abstract,document_type,keywords
0,0,"[57218202833, 7004904337, 7005513246]",Self-assessment of Proficiency of Intelligent ...,Advances in Intelligent Systems and Computing,,,10.1007/978-3-030-51758-8_15,"Autonomous systems, although capable of perfor...",Conference Paper,"[Environment, Goal(s), Intelligent agents, ..."
1,1,"[6508306234, 9277159100, 6507358470]",Bridging theory to practice: Feedforward and c...,Lecture Notes in Electrical Engineering,,,10.1007/978-3-030-58653-9_3,Practice is of the essence in Engineering cour...,Conference Paper,


Save as `documents.csv` file

In [89]:
document_df.to_csv(join(data_dir, "documents.csv"), index=False)

`Authors`

In [45]:
authors_df = df[authors_cols]
authors_df.head()

Unnamed: 0,Authors,Author(s) ID,Affiliations,Authors with affiliations,Author Keywords
0,"Gautam A., Crandall J.W., Goodrich M.A.",57218202833;7004904337;7005513246;,"Computer Science Department, Brigham Young Uni...","Gautam, A., Computer Science Department, Brigh...",Environment; Goal(s); Intelligent agents; Prof...
1,"de Moura Oliveira P.B., Hedengren J.D., Boaven...",6508306234;9277159100;6507358470;,"INESC-TEC Technology and Science, Campus da FE...","de Moura Oliveira, P.B., INESC-TEC Technology ...",
2,"Hajimirzaie S.M., Hotchkiss R.H.",53879700900;26642910500;,"Task Committee Secretary and Lead Engineer, Op...","Hajimirzaie, S.M., Task Committee Secretary an...",
3,"Bunnell S., Gorrell S., Salmon J., Thelin C., ...",57204031131;6602649851;56830076500;57204029674...,"Department of Mechanical Engineering, Brigham ...","Bunnell, S., Department of Mechanical Engineer...",Computer-aided engineering; Data-driven engine...
4,"Richardson M., Cannon S., Teichert L., Vance A...",57217149830;57217149403;57217145377;5721714653...,"School of Communications, Brigham Young Univer...","Richardson, M., School of Communications, Brig...",Dating; Emerging adults; Mobile phones; Uses a...


In [46]:
authors_df.rename(columns=
              {'Authors':'name', 
               'Author(s) ID': 'author_ids',
               'Affiliations':'affiliation',
               'Author Keywords':'author_keywords'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  authors_df.rename(columns=


In [50]:
authors = list()
def filter_authors(x):
    def filter_affiliations(y):
        try:            
            aff = y.split("., ")
            return "".join(aff[1:])
        except ValueError as e:
            print(y)
            raise e
    names = x['name'].split(",") if x['name'] else None
    author_ids = x['author_ids'].split(";") if x['author_ids'] else None
    author_keywords = x['author_keywords'].split(";") if x['author_keywords'] and isinstance(x['author_keywords'], str) else None
    auth_affiliations = x['Authors with affiliations'].split(";") if x['Authors with affiliations'] else None
    if not len(author_ids[len(author_ids) - 1]): del author_ids[len(author_ids) - 1]
    
    if len(names) == len(author_ids) == len(auth_affiliations):
        for index, name in enumerate(names):
            author = dict() 
            author['author_id'] = author_ids[index]
            author['name'] = name
            author['affiliations'] = filter_affiliations(auth_affiliations[index])
            author['keywords'] = author_keywords
            authors.append(author)

In [51]:
_ = authors_df.apply(lambda x: filter_authors(x), axis=1)

In [52]:
len(authors)

3974

In [53]:
authors_df = pd.DataFrame(authors)

In [196]:
authors_df.head(2)

Unnamed: 0,author_id,name,affiliations,keywords
0,57218202833,Gautam A.,"Computer Science Department, Brigham Young Uni...","[Environment, Goal(s), Intelligent agents, ..."
1,7004904337,Crandall J.W.,"Computer Science Department, Brigham Young Uni...","[Environment, Goal(s), Intelligent agents, ..."


In [201]:
_authors = dict()
def filter_authors_affiliation(x):
    if x['author_id'] not in _authors.keys(): _authors[x['author_id']] = dict()

    _authors[x['author_id']]['name'] = x['name']
    
    if x['affiliations']:        
        if 'affiliations' not in _authors[x['author_id']].keys():
            _authors[x['author_id']]['affiliations'] = list()
        _authors[x['author_id']]['affiliations'].append(x['affiliations'])
    
    if x['keywords']:        
        if 'keywords' not in _authors[x['author_id']].keys():
            _authors[x['author_id']]['keywords'] = list()
        _authors[x['author_id']]['keywords'].extend(x['keywords'])

In [202]:
_ = authors_df.apply(lambda x: filter_authors_affiliation(x), axis=1)

In [204]:
# authors_df.groupby('author_id').apply(lambda x: filter_authors_affiliation(x))
auth = list()
for key, value in _authors.items():
    a = dict()
    a['author_id'] = key
    a.update(value)
    auth.append(a)

In [208]:
authors_df = pd.DataFrame(auth)

Save as `authors.csv` file

In [209]:
authors_df.to_csv(join(data_dir, "authors.csv"), index=False)

##### For relations (bridge tables)

`Document && Author`

In [90]:
cols = ['author_ids', 'document_id']
doc_auth_df = document_df[cols]
doc_auth_df.head(2)

Unnamed: 0,author_ids,document_id
0,"[57218202833, 7004904337, 7005513246]",0
1,"[6508306234, 9277159100, 6507358470]",1


In [94]:
doc_authors = list()
def filter_doc_authors(x):
    author_ids = x['author_ids'] if x['author_ids'] and isinstance(x['author_ids'], list) else None
    document_id = x['document_id']
    
    for index, author_id in enumerate(author_ids):
        doc_auth = dict()
        doc_auth['author_id'] = author_id
        doc_auth['document_id'] = document_id
        doc_authors.append(doc_auth)

In [95]:
_ = doc_auth_df.apply(lambda x: filter_doc_authors(x), axis=1)

Save as `document_author.csv` file

In [118]:
doc_auth_df = pd.DataFrame(doc_authors)
doc_auth_df.to_csv(join(data_dir, "document_author.csv"), index=False)

`Document && Keywords`

In [107]:
cols = ['keywords', 'document_id']
doc_keywords_df = document_df[cols]
doc_keywords_df.head(2)

Unnamed: 0,keywords,document_id
0,"[Environment, Goal(s), Intelligent agents, ...",0
1,,1


Remove rows where there are no keywords

In [108]:
doc_keywords_df.drop(doc_keywords_df[doc_keywords_df['keywords'].isnull()].index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  doc_keywords_df.drop(doc_keywords_df[doc_keywords_df['keywords'].isnull()].index, inplace=True)


In [111]:
doc_keywords_df.head(2)

Unnamed: 0,keywords,document_id
0,"[Environment, Goal(s), Intelligent agents, ...",0
3,"[Computer-aided engineering, Data-driven engi...",3


In [112]:
doc_keywords = list()
def filter_doc_keywords(x):
    keywords = x['keywords'] if x['keywords'] and isinstance(x['keywords'], list) else None
    document_id = x['document_id']
    
    for index, keyword in enumerate(keywords):
        doc_keyword = dict()
        doc_keyword['keyword'] = keyword
        doc_keyword['document_id'] = document_id
        doc_keywords.append(doc_keyword)

In [113]:
_ = doc_keywords_df.apply(lambda x: filter_doc_keywords(x), axis=1)

Save as `document_keyword.csv` file

In [122]:
doc_keywords_df = pd.DataFrame(doc_keywords)
doc_keywords_df.to_csv(join(data_dir, "document_keyword.csv"), index=False)

`Author && Keywords`

In [123]:
cols = ['author_id', 'keywords']
auth_keywords_df = authors_df[cols]
auth_keywords_df.head(2)

Unnamed: 0,author_id,keywords
0,57218202833,"[Environment, Goal(s), Intelligent agents, ..."
1,7004904337,"[Environment, Goal(s), Intelligent agents, ..."


In [125]:
auth_keywords_df.drop(auth_keywords_df[auth_keywords_df['keywords'].isnull()].index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  auth_keywords_df.drop(auth_keywords_df[auth_keywords_df['keywords'].isnull()].index, inplace=True)


In [126]:
auth_keywords_df.head()

Unnamed: 0,author_id,keywords
0,57218202833,"[Environment, Goal(s), Intelligent agents, ..."
1,7004904337,"[Environment, Goal(s), Intelligent agents, ..."
2,7005513246,"[Environment, Goal(s), Intelligent agents, ..."
8,57204031131,"[Computer-aided engineering, Data-driven engi..."
9,6602649851,"[Computer-aided engineering, Data-driven engi..."


In [127]:
auth_keywords = list()
def filter_auth_keywords(x):
    keywords = x['keywords'] if x['keywords'] and isinstance(x['keywords'], list) else None
    author_id = x['author_id']
    
    for index, keyword in enumerate(keywords):
        auth_keyword = dict()
        auth_keyword['keyword'] = keyword
        auth_keyword['author_id'] = author_id
        auth_keywords.append(auth_keyword)

In [128]:
_ = auth_keywords_df.apply(lambda x: filter_auth_keywords(x), axis=1)

Save as `author_keyword.csv` file

In [130]:
auth_keywords_df = pd.DataFrame(auth_keywords)
auth_keywords_df.to_csv(join(data_dir, "author_keyword.csv"), index=False)

`Author && Affiliation`

In [221]:
cols = ['author_id', 'affiliations']
auth_affiliations_df = authors_df[cols]

In [224]:
auth_affiliations_df.head(2)

Unnamed: 0,author_id,affiliations
0,57218202833,"[Computer Science Department, Brigham Young Un..."
1,7004904337,"[Computer Science Department, Brigham Young Un..."


In [223]:
auth_affiliations_df.drop(auth_affiliations_df[auth_affiliations_df['affiliations'].isnull()].index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  auth_affiliations_df.drop(auth_affiliations_df[auth_affiliations_df['affiliations'].isnull()].index, inplace=True)


In [228]:
auth_affiliations = list()
def filter_auth_affiliations(x):
    affiliations = x['affiliations'] if x['affiliations'] and isinstance(x['affiliations'], list) else None
    author_id = x['author_id']
    
    for index, affiliation in enumerate(affiliations):
        auth_affiliation = dict()
        auth_affiliation['affiliation'] = affiliation
        auth_affiliation['author_id'] = author_id
        auth_affiliations.append(auth_affiliation)

In [229]:
_ = auth_affiliations_df.apply(lambda x: filter_auth_affiliations(x), axis=1)

Save as `author_keyword.csv` file

In [231]:
auth_affiliations_df = pd.DataFrame(auth_affiliations)
auth_affiliations_df.to_csv(join(data_dir, "author_affiliation.csv"), index=False)