In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the five cleaned tables; each CSV's first column becomes the row index
aff, coauth, jour, scopus, ninds = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'data/cleaned_data/clean_affiliations.csv',
        'data/cleaned_data/clean_coauthors.csv',
        'data/cleaned_data/clean_journals.csv',
        'data/cleaned_data/clean_scopus.csv',
        'data/cleaned_data/clean_ninds.csv',
    )
)

  mask |= (ar1 == a)
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# U.S. regions, as lists of lower-case two-letter state codes.
# Any code not listed here is later treated as 'international' by the region-encoding cells.
northeast = ['ct', 'me', 'ma', 'nh', 'ri', 'vt', 'nj', 'ny', 'pa']
midwest = ['in', 'il', 'mi', 'oh', 'wi', 'ia', 'ks', 'mn', 'mo', 'ne', 'nd', 'sd']
# 'dc' is included so District of Columbia affiliations are not labelled 'international'
# (the U.S. Census Bureau places DC in the South region).
south = ['de', 'dc', 'fl', 'ga', 'md', 'nc', 'sc', 'va', 'wv', 'al', 'ky', 'ms', 'tn', 'ar', 'la', 'ok', 'tx']
west = ['mt', 'wy', 'co', 'nm', 'az', 'ut', 'id', 'nv', 'or', 'wa', 'ca', 'ak', 'hi']

In [4]:
# Top 25 global universities for neuroscience and behavior,
# https://www.usnews.com/education/best-global-universities/neuroscience-behavior
# Names are lower-case because they are compared by exact string match against the
# affiliation-name columns; a misspelled entry silently never matches.
# NOTE(review): confirm these spellings agree with how the cleaned data names each institution.
top_neuro = ['harvard university', 'stanford university', 'university college london', 
             'university of california, san francisco', 'massachusetts institute of technology',
             'columbia university', 'university of oxford', 'university of pennsylvania', 
             'washington university in st. louis', 'johns hopkins university', 
             'university of california, los angeles', 'university of california, san diego', 
             'university of cambridge', 'yale university', 'new york university', "king's college london", 
             'university of toronto', 'mcgill university', 'karolinska institute', 
             'university of california, berkeley', 'university of washington', 'mount sinai school of medicine', 
             'imperial college london', 'heidelberg university', 'university of pittsburgh']

In [5]:
# Top 25 publications for biochemistry, genetics and molecular biology based on h-index,
# https://www.scimagojr.com/journalrank.php?order=h&ord=desc&area=1300
# Titles are lower-case and matched by exact string equality against jour_publication_name,
# so spelling must be exact ('american', 'oncogene', 'molecular and cellular biology').
top_biochem_gen_molec_pub = ['cell', 'journal of the american chemical society', 'nature genetics', 
                             'nature medicine', 'journal of clinical oncology', 'journal of biological chemistry', 
                             'nucleic acids research', 'blood', 'cancer research', 'genes and development', 
                             'nature biotechnology', 'nature reviews cancer', 
                             'nature reviews molecular cell biology', 'embo journal', 'journal of cell biology', 
                             'molecular cell', 'nature cell biology', 'journal of the national cancer institute', 
                             'bioinformatics', 'biomaterials', 'journal of clinical endocrinology and metabolism', 
                             'oncogene', 'nature reviews genetics', 'physiological reviews', 
                             'molecular and cellular biology']

In [6]:
# Top 25 publications for neuroscience based on h-index,
# https://www.scimagojr.com/journalrank.php?order=h&ord=desc&area=2800
# Titles are lower-case and matched by exact string equality against jour_publication_name.
top_neuro_pub = ['neuron', 'journal of neuroscience', 'nature neuroscience', 'embo journal', 
                 'nature reviews neuroscience', 'neuroimage', 'stroke', 'biological psychiatry', 
                 'annals of neurology', 'trends in neurosciences', 'trends in cognitive sciences', 
                 'pain', 'journal of neurophysiology', 'annual review of neuroscience', 'cerebral cortex', 
                 'annals of the new york academy of sciences', 'plos biology', 'journal of neurochemistry', 
                 'progress in neurobiology', 'current opinion in neurobiology', 
                 'neuroscience and biobehavioral reviews', 'neuroscience', 
                 'investigative ophthalmology and visual science', 'journal of cognitive neuroscience', 
                 'journal of comparative neurology']

In [7]:
# Top 25 medicine publications ranked by h-index (one lower-case title per line),
# https://www.scimagojr.com/journalrank.php?area=2700&order=h&ord=desc
top_med_pub = [
    'new england journal of medicine',
    'the lancet',
    'jama - journal of the american medical association',
    'circulation',
    'nature medicine',
    'journal of clinical oncology',
    'journal of clinical investigation',
    'blood',
    'cancer research',
    'journal of experimental medicine',
    'journal of the american college of cardiology',
    'british medical journal',
    'nature reviews cancer',
    'embo journal',
    'gastroenterology',
    'journal of physical chemistry b',
    'annals of internal medicine',
    'journal of cell biology',
    'immunity',
    'accounts of chemical research',
    'american journal of respiratory and critical care medicine',
    'nature reviews immunology',
    'jama psychiatry',
    'journal of the national cancer institute',
    'neurology',
]

## feature engineer ninds

In [8]:
# Keep only the grant rows whose (city, state, scopus_idx) triple also occurs in aff
grant_keys = ['ninds_aff_city', 'ninds_aff_state', 'scopus_idx']
aff_keys = ['aff_city', 'aff_state', 'scopus_idx']
ninds = ninds.merge(aff, how='inner', left_on=grant_keys, right_on=aff_keys)

In [9]:
# Discard the aff columns: everything from column position 14 onward
# (positional, so this depends on the column order produced by the merge)
columns_from_aff = ninds.columns[14:]
ninds = ninds.drop(columns=columns_from_aff)

In [10]:
# Remove fully duplicated rows, keeping the first occurrence of each
ninds = ninds.loc[~ninds.duplicated(keep='first')]

In [11]:
# The grant application id column is not used as a feature; remove it
ninds = ninds.drop('ninds_grant_application_id', axis=1)

In [12]:
# One hot encode ninds_grant_type: indicator columns are appended, the raw column removed
grant_type_dummies = pd.get_dummies(ninds['ninds_grant_type'], prefix='ninds_grant_type')
ninds = pd.concat([ninds.drop(columns='ninds_grant_type'), grant_type_dummies], axis=1)

In [13]:
# One hot encode ninds_fiscal_year: indicator columns are appended, the raw column removed
fiscal_year_dummies = pd.get_dummies(ninds['ninds_fiscal_year'], prefix='ninds_grant_fiscal_year')
ninds = pd.concat([ninds.drop(columns='ninds_fiscal_year'), fiscal_year_dummies], axis=1)

In [14]:
# Project length in whole days (end date minus start date); the two date columns are then removed
project_start = pd.to_datetime(ninds['ninds_project_start_date'])
project_end = pd.to_datetime(ninds['ninds_project_end_date'])
ninds['ninds_project_duration_days'] = (project_end - project_start).dt.days
ninds = ninds.drop(columns=['ninds_project_end_date', 'ninds_project_start_date'])

In [15]:
# Flag grants with another project leader: 0 when the field reads 'not applicable', 1 otherwise
no_other_leader = ninds['ninds_other_project_leader'].eq('not applicable')
ninds['ninds_grant_other_project_leader'] = np.where(no_other_leader, 0, 1)
ninds = ninds.drop(columns='ninds_other_project_leader')

In [16]:
# 1 when the grant's institution name appears in top_neuro, else 0; the name column is then removed
ninds['ninds_grant_aff_top_25_neuro'] = ninds['ninds_aff_name'].isin(top_neuro).astype('int64')
ninds = ninds.drop(columns='ninds_aff_name')

In [17]:
# One hot encode ninds_aff_type, then normalise the column labels
aff_type_dummies = pd.get_dummies(ninds['ninds_aff_type'], prefix='ninds_grant_aff_type')
ninds = pd.concat([ninds.drop(columns='ninds_aff_type'), aff_type_dummies], axis=1)
# collapse double spaces first, then turn each remaining space into an underscore
ninds.columns = ninds.columns.str.replace('  ', ' ').str.replace(' ', '_')

In [18]:
# City is not kept as a feature; remove ninds_aff_city
ninds = ninds.drop('ninds_aff_city', axis=1)

In [19]:
# Map each state code to its U.S. region; whatever is left unmatched becomes 'international'
region_states = {'northeast': northeast, 'midwest': midwest, 'south': south, 'west': west}
grant_region = ninds['ninds_aff_state']
for region_name, state_codes in region_states.items():
    grant_region = grant_region.replace(state_codes, region_name)
grant_region = grant_region.where(grant_region.isin(list(region_states)), 'international')
# One hot encode the region and remove the raw ninds_aff_state column
region_dummies = pd.get_dummies(grant_region, prefix='ninds_grant_aff')
ninds = pd.concat([ninds.drop(columns='ninds_aff_state'), region_dummies], axis=1)

In [20]:
# 1 when ninds_aff_country is exactly 'united states', 0 for anything else
grant_in_us = ninds['ninds_aff_country'].eq('united states')
ninds['ninds_grant_aff_united_states'] = np.where(grant_in_us, 1, 0)
ninds = ninds.drop(columns='ninds_aff_country')

In [21]:
# Remove every row that still has at least one missing value
ninds = ninds.dropna(axis=0, how='any')

In [22]:
# Store the support year as float under the ninds_grant_ prefix and remove the original column
ninds = (
    ninds
    .assign(ninds_grant_support_year=ninds['ninds_support_year'].astype(float))
    .drop(columns='ninds_support_year')
)

In [23]:
# Per-author totals: sum every column within each scopus_idx, then relabel ninds_* as ninds_total_*
ninds_total = (
    ninds
    .groupby('scopus_idx')
    .sum()
    .reset_index()
    .rename(columns=lambda label: label.replace('ninds_', 'ninds_total_'))
    # shorten the summed cost column's name
    .rename(columns={'ninds_total_fiscal_year_total_cost': 'ninds_total_cost'})
)

In [24]:
# Per-author averages: mean of every column within each scopus_idx, rounded to whole numbers,
# with ninds_* relabelled as ninds_avg_*
ninds_avg = (
    ninds
    .groupby('scopus_idx')
    .mean()
    .round(0)
    .reset_index()
    .rename(columns=lambda label: label.replace('ninds_', 'ninds_avg_'))
)

In [25]:
# Final ninds table: all totals except the summed support year, plus the averaged support year
total_features = ninds_total.drop(columns='ninds_total_grant_support_year')
avg_support_year = ninds_avg['ninds_avg_grant_support_year']
ninds = pd.concat([total_features, avg_support_year], axis=1, join='inner')

In [26]:
# Check ninds columns: list every column name after the totals/average join
ninds.columns.to_list()

['scopus_idx',
 'ninds_total_cost',
 'ninds_total_grant_type_d43',
 'ninds_total_grant_type_dp1',
 'ninds_total_grant_type_dp2',
 'ninds_total_grant_type_f05',
 'ninds_total_grant_type_f06',
 'ninds_total_grant_type_f30',
 'ninds_total_grant_type_f31',
 'ninds_total_grant_type_f32',
 'ninds_total_grant_type_f33',
 'ninds_total_grant_type_f99',
 'ninds_total_grant_type_k01',
 'ninds_total_grant_type_k02',
 'ninds_total_grant_type_k04',
 'ninds_total_grant_type_k06',
 'ninds_total_grant_type_k08',
 'ninds_total_grant_type_k12',
 'ninds_total_grant_type_k17',
 'ninds_total_grant_type_k22',
 'ninds_total_grant_type_k23',
 'ninds_total_grant_type_k24',
 'ninds_total_grant_type_k25',
 'ninds_total_grant_type_k99',
 'ninds_total_grant_type_n01',
 'ninds_total_grant_type_n02',
 'ninds_total_grant_type_n43',
 'ninds_total_grant_type_n44',
 'ninds_total_grant_type_p01',
 'ninds_total_grant_type_p20',
 'ninds_total_grant_type_p30',
 'ninds_total_grant_type_p50',
 'ninds_total_grant_type_r00',
 'n

In [27]:
# Check ninds: preview the first rows of the per-author grant features
ninds.head()

Unnamed: 0,scopus_idx,ninds_total_cost,ninds_total_grant_type_d43,ninds_total_grant_type_dp1,ninds_total_grant_type_dp2,ninds_total_grant_type_f05,ninds_total_grant_type_f06,ninds_total_grant_type_f30,ninds_total_grant_type_f31,ninds_total_grant_type_f32,...,ninds_total_grant_aff_type_schools_of_public_health,ninds_total_grant_aff_type_schools_of_veterinary_medicine,ninds_total_grant_aff_type_unavailable,ninds_total_grant_aff_international,ninds_total_grant_aff_midwest,ninds_total_grant_aff_northeast,ninds_total_grant_aff_south,ninds_total_grant_aff_west,ninds_total_grant_aff_united_states,ninds_avg_grant_support_year
0,a a tzika,688130.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1.0
1,a d redish,395150.0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,2,1.0
2,a g smith,28000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,2,1.0
3,a k mcallister,2537393.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,7,7,5.0
4,a l betz,1072317.0,0,0,0,0,0,0,0,0,...,0,0,0,0,5,0,0,0,5,9.0


In [28]:
# Get ninds shape as (rows, columns)
ninds.shape

(7335, 135)

## feature engineer affiliations

In [29]:
# 1 when the affiliation name appears in top_neuro, else 0; the name column is then removed
aff['aff_top_25_neuro'] = aff['aff_name'].isin(top_neuro).astype('int64')
aff = aff.drop(columns='aff_name')

In [30]:
# City is not kept as a feature; remove aff_city
aff = aff.drop('aff_city', axis=1)

In [31]:
# Map each state code to its U.S. region; whatever is left unmatched becomes 'international'
region_states = {'northeast': northeast, 'midwest': midwest, 'south': south, 'west': west}
aff_region = aff['aff_state']
for region_name, state_codes in region_states.items():
    aff_region = aff_region.replace(state_codes, region_name)
aff_region = aff_region.where(aff_region.isin(list(region_states)), 'international')
# One hot encode the region and remove the raw aff_state column
region_dummies = pd.get_dummies(aff_region, prefix='aff')
aff = pd.concat([aff.drop(columns='aff_state'), region_dummies], axis=1)

In [32]:
# Encode aff_country: 1 when it is exactly 'united states', 0 for anything else
aff_in_us = aff['aff_country'].eq('united states')
aff['aff_united_states'] = np.where(aff_in_us, 1, 0)
aff = aff.drop(columns='aff_country')

In [33]:
# Per-author totals: sum every column within each scopus_idx, relabelling aff_* as aff_total_*
aff_total = (
    aff
    .groupby('scopus_idx')
    .sum()
    .reset_index()
    .rename(columns=lambda label: label.replace('aff_', 'aff_total_'))
)

In [34]:
# Per-author averages: mean of every column within each scopus_idx, rounded to whole numbers,
# relabelling aff_* as aff_avg_*
aff_avg = (
    aff
    .groupby('scopus_idx')
    .mean()
    .round(0)
    .reset_index()
    .rename(columns=lambda label: label.replace('aff_', 'aff_avg_'))
)

In [35]:
# Final aff table, selected by position: totals from the fourth column onward,
# alongside the first three columns of the averages
total_part = aff_total.iloc[:, 3:]
avg_part = aff_avg.iloc[:, :3]
aff = pd.concat([total_part, avg_part], axis=1, join='inner')

In [36]:
# Check aff: preview the first rows of the per-author affiliation features
aff.head()

Unnamed: 0,aff_total_top_25_neuro,aff_total_international,aff_total_midwest,aff_total_northeast,aff_total_south,aff_total_west,aff_total_united_states,scopus_idx,aff_avg_author_count,aff_avg_document_count
0,0,0,0,1,1,0,2,a dosemeci,2255.0,11868.0
1,1,6,2,0,1,0,3,a kulkarni,8215.0,40727.0
2,0,0,0,0,1,0,1,a lasansky,12448.0,210824.0
3,1,0,1,1,3,5,10,a wynshaw-boris,6755.0,54793.0
4,0,0,0,4,1,1,6,a a tzika,11950.0,81374.0


In [37]:
# Start model_data: authors present in both the ninds and aff feature tables
model_data = ninds.merge(aff, how='inner', left_on=['scopus_idx'], right_on=['scopus_idx'])

In [38]:
# Check model_data columns after adding the aff features
model_data.columns.to_list()

['scopus_idx',
 'ninds_total_cost',
 'ninds_total_grant_type_d43',
 'ninds_total_grant_type_dp1',
 'ninds_total_grant_type_dp2',
 'ninds_total_grant_type_f05',
 'ninds_total_grant_type_f06',
 'ninds_total_grant_type_f30',
 'ninds_total_grant_type_f31',
 'ninds_total_grant_type_f32',
 'ninds_total_grant_type_f33',
 'ninds_total_grant_type_f99',
 'ninds_total_grant_type_k01',
 'ninds_total_grant_type_k02',
 'ninds_total_grant_type_k04',
 'ninds_total_grant_type_k06',
 'ninds_total_grant_type_k08',
 'ninds_total_grant_type_k12',
 'ninds_total_grant_type_k17',
 'ninds_total_grant_type_k22',
 'ninds_total_grant_type_k23',
 'ninds_total_grant_type_k24',
 'ninds_total_grant_type_k25',
 'ninds_total_grant_type_k99',
 'ninds_total_grant_type_n01',
 'ninds_total_grant_type_n02',
 'ninds_total_grant_type_n43',
 'ninds_total_grant_type_n44',
 'ninds_total_grant_type_p01',
 'ninds_total_grant_type_p20',
 'ninds_total_grant_type_p30',
 'ninds_total_grant_type_p50',
 'ninds_total_grant_type_r00',
 'n

In [39]:
# Check model_data: preview the first rows after adding the aff features
model_data.head()

Unnamed: 0,scopus_idx,ninds_total_cost,ninds_total_grant_type_d43,ninds_total_grant_type_dp1,ninds_total_grant_type_dp2,ninds_total_grant_type_f05,ninds_total_grant_type_f06,ninds_total_grant_type_f30,ninds_total_grant_type_f31,ninds_total_grant_type_f32,...,ninds_avg_grant_support_year,aff_total_top_25_neuro,aff_total_international,aff_total_midwest,aff_total_northeast,aff_total_south,aff_total_west,aff_total_united_states,aff_avg_author_count,aff_avg_document_count
0,a a tzika,688130.0,0,0,0,0,0,0,0,0,...,1.0,0,0,0,4,1,1,6,11950.0,81374.0
1,a d redish,395150.0,0,0,0,0,0,0,0,0,...,1.0,0,0,1,0,0,1,2,33555.0,153450.0
2,a g smith,28000.0,0,0,0,0,0,0,0,0,...,1.0,0,17,1,2,3,1,7,944.0,6621.0
3,a k mcallister,2537393.0,0,0,0,0,0,0,0,0,...,5.0,0,0,0,0,2,3,5,11989.0,66075.0
4,a l betz,1072317.0,0,0,0,0,0,0,0,0,...,9.0,0,0,2,0,0,0,2,25424.0,129784.0


In [40]:
# Remove every row that has at least one missing value
model_data = model_data.dropna(axis=0, how='any')

In [41]:
# Get model_data shape as (rows, columns) after the aff join
model_data.shape

(7335, 144)

## feature engineer coauthors

In [42]:
# 1 when the coauthor's affiliation name appears in top_neuro, else 0;
# the name column is then removed
coauth['coauth_aff_top_25_neuro'] = coauth['coauth_aff_name'].isin(top_neuro).astype('int64')
coauth = coauth.drop(columns='coauth_aff_name')

In [43]:
# City is not kept as a feature; remove coauth_aff_city
coauth = coauth.drop('coauth_aff_city', axis=1)

In [44]:
# Two complementary flags from coauth_aff_country: exactly 'united states' versus anything else
coauthor_in_us = coauth['coauth_aff_country'].eq('united states')
coauth['coauth_aff_united_states'] = np.where(coauthor_in_us, 1, 0)
coauth['coauth_aff_international'] = np.where(coauthor_in_us, 0, 1)
coauth = coauth.drop(columns='coauth_aff_country')

In [45]:
# Per-author totals: sum every column within each scopus_idx, relabelling coauth_* as coauth_total_*
coauth = (
    coauth
    .groupby('scopus_idx')
    .sum()
    .reset_index()
    .rename(columns=lambda label: label.replace('coauth_', 'coauth_total_'))
)

In [46]:
# Check coauth: preview the first rows of the per-author coauthor totals
coauth.head()

Unnamed: 0,scopus_idx,coauth_total_aff_top_25_neuro,coauth_total_aff_united_states,coauth_total_aff_international
0,a dosemeci,1,66,9
1,a kulkarni,21,52,108
2,a lasansky,1,4,6
3,a wynshaw-boris,18,104,56
4,a a tzika,16,125,29


In [47]:
# Add the coauthor totals, keeping only authors present in both tables
model_data = model_data.merge(coauth, how='inner', left_on=['scopus_idx'], right_on=['scopus_idx'])

In [48]:
# Check model_data columns after adding the coauthor features
model_data.columns.to_list()

['scopus_idx',
 'ninds_total_cost',
 'ninds_total_grant_type_d43',
 'ninds_total_grant_type_dp1',
 'ninds_total_grant_type_dp2',
 'ninds_total_grant_type_f05',
 'ninds_total_grant_type_f06',
 'ninds_total_grant_type_f30',
 'ninds_total_grant_type_f31',
 'ninds_total_grant_type_f32',
 'ninds_total_grant_type_f33',
 'ninds_total_grant_type_f99',
 'ninds_total_grant_type_k01',
 'ninds_total_grant_type_k02',
 'ninds_total_grant_type_k04',
 'ninds_total_grant_type_k06',
 'ninds_total_grant_type_k08',
 'ninds_total_grant_type_k12',
 'ninds_total_grant_type_k17',
 'ninds_total_grant_type_k22',
 'ninds_total_grant_type_k23',
 'ninds_total_grant_type_k24',
 'ninds_total_grant_type_k25',
 'ninds_total_grant_type_k99',
 'ninds_total_grant_type_n01',
 'ninds_total_grant_type_n02',
 'ninds_total_grant_type_n43',
 'ninds_total_grant_type_n44',
 'ninds_total_grant_type_p01',
 'ninds_total_grant_type_p20',
 'ninds_total_grant_type_p30',
 'ninds_total_grant_type_p50',
 'ninds_total_grant_type_r00',
 'n

In [49]:
# Check model_data: preview the first rows after adding the coauthor features
model_data.head()

Unnamed: 0,scopus_idx,ninds_total_cost,ninds_total_grant_type_d43,ninds_total_grant_type_dp1,ninds_total_grant_type_dp2,ninds_total_grant_type_f05,ninds_total_grant_type_f06,ninds_total_grant_type_f30,ninds_total_grant_type_f31,ninds_total_grant_type_f32,...,aff_total_midwest,aff_total_northeast,aff_total_south,aff_total_west,aff_total_united_states,aff_avg_author_count,aff_avg_document_count,coauth_total_aff_top_25_neuro,coauth_total_aff_united_states,coauth_total_aff_international
0,a a tzika,688130.0,0,0,0,0,0,0,0,0,...,0,4,1,1,6,11950.0,81374.0,16,125,29
1,a d redish,395150.0,0,0,0,0,0,0,0,0,...,1,0,0,1,2,33555.0,153450.0,17,99,61
2,a g smith,28000.0,0,0,0,0,0,0,0,0,...,1,2,3,1,7,944.0,6621.0,4,32,128
3,a k mcallister,2537393.0,0,0,0,0,0,0,0,0,...,0,0,2,3,5,11989.0,66075.0,4,61,2
4,a l betz,1072317.0,0,0,0,0,0,0,0,0,...,2,0,0,0,2,25424.0,129784.0,2,85,31


In [50]:
# Remove every row that has at least one missing value
model_data = model_data.dropna(axis=0, how='any')

In [51]:
# Get model_data shape as (rows, columns) after the coauthor join
model_data.shape

(7332, 147)

## feature engineer journals

In [52]:
# 1 when the publication name appears in top_biochem_gen_molec_pub, else 0
in_top_biochem = jour['jour_publication_name'].isin(top_biochem_gen_molec_pub)
jour['jour_pub_top_25_biochem_gen_molec'] = in_top_biochem.astype('int64')

In [53]:
# 1 when the publication name appears in top_neuro_pub, else 0
in_top_neuro_pub = jour['jour_publication_name'].isin(top_neuro_pub)
jour['jour_pub_top_25_neuro'] = in_top_neuro_pub.astype('int64')

In [54]:
# 1 when the publication name appears in top_med_pub, else 0
in_top_med_pub = jour['jour_publication_name'].isin(top_med_pub)
jour['jour_pub_top_25_med'] = in_top_med_pub.astype('int64')

In [55]:
# The three top-25 flags have been derived; remove jour_publication_name
jour = jour.drop('jour_publication_name', axis=1)

In [56]:
# Two unparsed list-style values are folded into 'journal' before one hot encoding
journal_aliases = ["[{'$': 'journal'}, {'$': 'ar'}]", "[{'$': '1353'}, {'$': 'journal'}]"]
publication_type = jour['jour_publication_type'].replace(journal_aliases, 'journal')
pub_type_dummies = pd.get_dummies(publication_type, prefix='jour_pub_type')
jour = pd.concat([jour.drop(columns='jour_publication_type'), pub_type_dummies], axis=1)
# spaces in the resulting labels become underscores
jour.columns = jour.columns.str.replace(' ', '_')

In [57]:
# Per-author totals: sum every column within each scopus_idx, relabelling jour_* as jour_total_*
jour_total = (
    jour
    .groupby('scopus_idx')
    .sum()
    .reset_index()
    .rename(columns=lambda label: label.replace('jour_', 'jour_total_'))
)

In [58]:
# Per-author averages: mean of every column within each scopus_idx, rounded to whole numbers,
# relabelling jour_* as jour_avg_*
jour_avg = (
    jour
    .groupby('scopus_idx')
    .mean()
    .round(0)
    .reset_index()
    .rename(columns=lambda label: label.replace('jour_', 'jour_avg_'))
)

In [59]:
# Final jour table, selected by position: totals from the third column onward,
# alongside the first two columns of the averages
total_part = jour_total.iloc[:, 2:]
avg_part = jour_avg.iloc[:, :2]
jour = pd.concat([total_part, avg_part], axis=1, join='inner')

In [60]:
# Check jour: preview the first rows of the per-author journal features
jour.head()

Unnamed: 0,jour_total_pub_top_25_biochem_gen_molec,jour_total_pub_top_25_neuro,jour_total_pub_top_25_med,jour_total_pub_type_book,jour_total_pub_type_book_series,jour_total_pub_type_conference_proceeding,jour_total_pub_type_journal,jour_total_pub_type_report,jour_total_pub_type_trade_journal,scopus_idx,jour_avg_author_count
0,2,19,0,0.0,0.0,0.0,60.0,0.0,0.0,a dosemeci,5.0
1,1,0,7,7.0,1.0,1.0,222.0,0.0,0.0,a kulkarni,8.0
2,1,0,1,0.0,1.0,0.0,29.0,0.0,0.0,a lasansky,2.0
3,56,24,20,3.0,9.0,0.0,210.0,0.0,0.0,a wynshaw-boris,10.0
4,0,3,0,1.0,1.0,3.0,68.0,0.0,0.0,a a tzika,10.0


In [61]:
# Add the journal features, keeping only authors present in both tables
model_data = model_data.merge(jour, how='inner', left_on=['scopus_idx'], right_on=['scopus_idx'])

In [62]:
# Check model_data columns after adding the journal features
model_data.columns.to_list()

['scopus_idx',
 'ninds_total_cost',
 'ninds_total_grant_type_d43',
 'ninds_total_grant_type_dp1',
 'ninds_total_grant_type_dp2',
 'ninds_total_grant_type_f05',
 'ninds_total_grant_type_f06',
 'ninds_total_grant_type_f30',
 'ninds_total_grant_type_f31',
 'ninds_total_grant_type_f32',
 'ninds_total_grant_type_f33',
 'ninds_total_grant_type_f99',
 'ninds_total_grant_type_k01',
 'ninds_total_grant_type_k02',
 'ninds_total_grant_type_k04',
 'ninds_total_grant_type_k06',
 'ninds_total_grant_type_k08',
 'ninds_total_grant_type_k12',
 'ninds_total_grant_type_k17',
 'ninds_total_grant_type_k22',
 'ninds_total_grant_type_k23',
 'ninds_total_grant_type_k24',
 'ninds_total_grant_type_k25',
 'ninds_total_grant_type_k99',
 'ninds_total_grant_type_n01',
 'ninds_total_grant_type_n02',
 'ninds_total_grant_type_n43',
 'ninds_total_grant_type_n44',
 'ninds_total_grant_type_p01',
 'ninds_total_grant_type_p20',
 'ninds_total_grant_type_p30',
 'ninds_total_grant_type_p50',
 'ninds_total_grant_type_r00',
 'n

In [63]:
# Check model_data: preview the first rows after adding the journal features
model_data.head()

Unnamed: 0,scopus_idx,ninds_total_cost,ninds_total_grant_type_d43,ninds_total_grant_type_dp1,ninds_total_grant_type_dp2,ninds_total_grant_type_f05,ninds_total_grant_type_f06,ninds_total_grant_type_f30,ninds_total_grant_type_f31,ninds_total_grant_type_f32,...,jour_total_pub_top_25_biochem_gen_molec,jour_total_pub_top_25_neuro,jour_total_pub_top_25_med,jour_total_pub_type_book,jour_total_pub_type_book_series,jour_total_pub_type_conference_proceeding,jour_total_pub_type_journal,jour_total_pub_type_report,jour_total_pub_type_trade_journal,jour_avg_author_count
0,a a tzika,688130.0,0,0,0,0,0,0,0,0,...,0,3,0,1.0,1.0,3.0,68.0,0.0,0.0,10.0
1,a d redish,395150.0,0,0,0,0,0,0,0,0,...,0,30,0,4.0,2.0,4.0,101.0,0.0,0.0,4.0
2,a g smith,28000.0,0,0,0,0,0,0,0,0,...,6,1,16,4.0,2.0,10.0,305.0,0.0,2.0,5.0
3,a k mcallister,2537393.0,0,0,0,0,0,0,0,0,...,0,28,1,1.0,4.0,0.0,50.0,0.0,0.0,4.0
4,a l betz,1072317.0,0,0,0,0,0,0,0,0,...,0,18,0,0.0,5.0,0.0,104.0,0.0,0.0,5.0


In [64]:
# Remove every row that has at least one missing value
model_data = model_data.dropna(axis=0, how='any')

In [65]:
# Get model_data shape as (rows, columns) after the journal join
model_data.shape

(7326, 157)

## feature engineer scopus

In [66]:
# Drop rows whose scopus_begin_publication_range equals 1900.
# NOTE(review): 1900 is presumably a placeholder start year in the source data - confirm.
# .copy() makes the filtered frame independent of the original, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
scopus = scopus[scopus['scopus_begin_publication_range'] != 1900].copy()

In [67]:
# Publication span in years (end of range minus beginning); the two range columns are then removed
publication_span = scopus['scopus_end_publication_range'].sub(scopus['scopus_begin_publication_range'])
scopus = (
    scopus
    .assign(scopus_pub_range_years=publication_span)
    .drop(columns=['scopus_begin_publication_range', 'scopus_end_publication_range'])
)

In [68]:
# 1 when the author's current affiliation name appears in top_neuro, else 0;
# the name column is then removed
current_aff_is_top = scopus['scopus_current_aff_name'].isin(top_neuro)
scopus['scopus_current_aff_top_25_neuro'] = current_aff_is_top.astype('int64')
scopus = scopus.drop(columns='scopus_current_aff_name')

In [69]:
# Map each state code to its U.S. region; whatever is left unmatched becomes 'international'
region_states = {'northeast': northeast, 'midwest': midwest, 'south': south, 'west': west}
current_region = scopus['scopus_current_aff_state']
for region_name, state_codes in region_states.items():
    current_region = current_region.replace(state_codes, region_name)
current_region = current_region.where(current_region.isin(list(region_states)), 'international')
# One hot encode the region and remove the raw scopus_current_aff_state column
region_dummies = pd.get_dummies(current_region, prefix='scopus_current_aff')
scopus = pd.concat([scopus.drop(columns='scopus_current_aff_state'), region_dummies], axis=1)

In [70]:
# 1 when scopus_current_aff_country is exactly 'united states', 0 for anything else
current_aff_in_us = scopus['scopus_current_aff_country'].eq('united states')
scopus['scopus_current_aff_united_states'] = np.where(current_aff_in_us, 1, 0)
scopus = scopus.drop(columns='scopus_current_aff_country')

In [71]:
# Keep one row per scopus_idx (the first occurrence) and drop the rest.
# The previous form assigned Series.drop_duplicates() back to the column, which only
# blanked the repeated keys to NaN through index alignment and left the duplicate rows
# in the frame; dropping at the row level removes them outright.
scopus = scopus.drop_duplicates(subset='scopus_idx', keep='first')

In [72]:
# Check scopus: preview the first rows of the engineered author-level features
scopus.head()

Unnamed: 0,scopus_current_aff_auth_count,scopus_current_aff_document_count,scopus_citation_count,scopus_cited_by_count,scopus_coauthor_count,scopus_document_count,scopus_h_index,scopus_idx,scopus_pub_range_years,scopus_current_aff_top_25_neuro,scopus_current_aff_international,scopus_current_aff_midwest,scopus_current_aff_northeast,scopus_current_aff_south,scopus_current_aff_west,scopus_current_aff_united_states
0,12063,71573,17171,12803,3672,227,68,gail v johnson,35,0,0,0,1,0,0,1
1,1744,16439,8948,5286,219,141,54,kenneth o johnson,45,0,1,0,0,0,0,0
2,8517,35650,16607,12653,1177,175,46,kenneth p johnson,45,0,0,0,0,1,0,1
3,39046,173691,2454,1492,132,100,29,rodney l johnson,39,0,0,1,0,0,0,1
4,20437,110368,1720,1416,97,119,22,steven w johnson,37,0,0,0,0,1,0,1


In [73]:
# Add the scopus features, keeping only authors present in both tables
model_data = model_data.merge(scopus, how='inner', left_on=['scopus_idx'], right_on=['scopus_idx'])

In [74]:
# Check model_data columns after adding the scopus features
model_data.columns.to_list()

['scopus_idx',
 'ninds_total_cost',
 'ninds_total_grant_type_d43',
 'ninds_total_grant_type_dp1',
 'ninds_total_grant_type_dp2',
 'ninds_total_grant_type_f05',
 'ninds_total_grant_type_f06',
 'ninds_total_grant_type_f30',
 'ninds_total_grant_type_f31',
 'ninds_total_grant_type_f32',
 'ninds_total_grant_type_f33',
 'ninds_total_grant_type_f99',
 'ninds_total_grant_type_k01',
 'ninds_total_grant_type_k02',
 'ninds_total_grant_type_k04',
 'ninds_total_grant_type_k06',
 'ninds_total_grant_type_k08',
 'ninds_total_grant_type_k12',
 'ninds_total_grant_type_k17',
 'ninds_total_grant_type_k22',
 'ninds_total_grant_type_k23',
 'ninds_total_grant_type_k24',
 'ninds_total_grant_type_k25',
 'ninds_total_grant_type_k99',
 'ninds_total_grant_type_n01',
 'ninds_total_grant_type_n02',
 'ninds_total_grant_type_n43',
 'ninds_total_grant_type_n44',
 'ninds_total_grant_type_p01',
 'ninds_total_grant_type_p20',
 'ninds_total_grant_type_p30',
 'ninds_total_grant_type_p50',
 'ninds_total_grant_type_r00',
 'n

In [75]:
# Check model_data: preview the first rows after adding the scopus features
model_data.head()

Unnamed: 0,scopus_idx,ninds_total_cost,ninds_total_grant_type_d43,ninds_total_grant_type_dp1,ninds_total_grant_type_dp2,ninds_total_grant_type_f05,ninds_total_grant_type_f06,ninds_total_grant_type_f30,ninds_total_grant_type_f31,ninds_total_grant_type_f32,...,scopus_document_count,scopus_h_index,scopus_pub_range_years,scopus_current_aff_top_25_neuro,scopus_current_aff_international,scopus_current_aff_midwest,scopus_current_aff_northeast,scopus_current_aff_south,scopus_current_aff_west,scopus_current_aff_united_states
0,a a tzika,688130.0,0,0,0,0,0,0,0,0,...,73,29,31,0,0,0,1,0,0,1
1,a d redish,395150.0,0,0,0,0,0,0,0,0,...,111,39,24,0,0,1,0,0,0,1
2,a g smith,28000.0,0,0,0,0,0,0,0,0,...,323,52,60,0,1,0,0,0,0,0
3,a k mcallister,2537393.0,0,0,0,0,0,0,0,0,...,55,31,25,0,0,0,0,0,1,1
4,a l betz,1072317.0,0,0,0,0,0,0,0,0,...,109,44,31,0,0,1,0,0,0,1


In [76]:
# Remove every row that has at least one missing value
model_data = model_data.dropna(axis=0, how='any')

In [77]:
# Get model_data shape as (rows, columns) after the scopus join
model_data.shape

(7325, 172)

## model data

In [78]:
# The author key was only needed for joining; remove scopus_idx before modelling
model_data = model_data.drop(columns='scopus_idx')

In [79]:
# Convert every column to a numeric dtype, one column at a time
model_data = model_data.apply(pd.to_numeric, axis=0)

In [80]:
# Check model_data columns once scopus_idx has been removed
model_data.columns.to_list()

['ninds_total_cost',
 'ninds_total_grant_type_d43',
 'ninds_total_grant_type_dp1',
 'ninds_total_grant_type_dp2',
 'ninds_total_grant_type_f05',
 'ninds_total_grant_type_f06',
 'ninds_total_grant_type_f30',
 'ninds_total_grant_type_f31',
 'ninds_total_grant_type_f32',
 'ninds_total_grant_type_f33',
 'ninds_total_grant_type_f99',
 'ninds_total_grant_type_k01',
 'ninds_total_grant_type_k02',
 'ninds_total_grant_type_k04',
 'ninds_total_grant_type_k06',
 'ninds_total_grant_type_k08',
 'ninds_total_grant_type_k12',
 'ninds_total_grant_type_k17',
 'ninds_total_grant_type_k22',
 'ninds_total_grant_type_k23',
 'ninds_total_grant_type_k24',
 'ninds_total_grant_type_k25',
 'ninds_total_grant_type_k99',
 'ninds_total_grant_type_n01',
 'ninds_total_grant_type_n02',
 'ninds_total_grant_type_n43',
 'ninds_total_grant_type_n44',
 'ninds_total_grant_type_p01',
 'ninds_total_grant_type_p20',
 'ninds_total_grant_type_p30',
 'ninds_total_grant_type_p50',
 'ninds_total_grant_type_r00',
 'ninds_total_gran

In [81]:
# Check model_data: preview the first rows of the all-numeric modelling table
model_data.head()

Unnamed: 0,ninds_total_cost,ninds_total_grant_type_d43,ninds_total_grant_type_dp1,ninds_total_grant_type_dp2,ninds_total_grant_type_f05,ninds_total_grant_type_f06,ninds_total_grant_type_f30,ninds_total_grant_type_f31,ninds_total_grant_type_f32,ninds_total_grant_type_f33,...,scopus_document_count,scopus_h_index,scopus_pub_range_years,scopus_current_aff_top_25_neuro,scopus_current_aff_international,scopus_current_aff_midwest,scopus_current_aff_northeast,scopus_current_aff_south,scopus_current_aff_west,scopus_current_aff_united_states
0,688130.0,0,0,0,0,0,0,0,0,0,...,73,29,31,0,0,0,1,0,0,1
1,395150.0,0,0,0,0,0,0,0,0,0,...,111,39,24,0,0,1,0,0,0,1
2,28000.0,0,0,0,0,0,0,0,0,0,...,323,52,60,0,1,0,0,0,0,0
3,2537393.0,0,0,0,0,0,0,0,0,0,...,55,31,25,0,0,0,0,0,1,1
4,1072317.0,0,0,0,0,0,0,0,0,0,...,109,44,31,0,0,1,0,0,0,1


In [82]:
# Write the regression dataset (continuous ninds_total_cost) to CSV, row index included
regression_csv_path = 'data/model_data_regression.csv'
model_data.to_csv(regression_csv_path, index=True)

In [83]:
# Discretize ninds_total_cost into five equal-width classes over [0, max]:
# 1 = lowest fifth of the range, 5 = highest fifth.
# pd.cut uses contiguous right-closed bins, so every cost in [0, max] receives exactly
# one label. The earlier chain of np.where/between calls skipped the 0.1-wide gap at the
# start of each band, and relied on max/5*5 reproducing max exactly, so some rows could
# keep their raw dollar amount instead of a class label.
n_cost_classes = 5
total_cost = model_data['ninds_total_cost']
# width of one class band (name kept from the original cell)
ninds_target_max = total_cost.max()/n_cost_classes
if (total_cost < 0).any():
    # a negative cost lies outside every bin and would be left without a class
    raise ValueError('ninds_total_cost contains negative values; cannot assign cost classes')
# linspace guarantees the last edge is exactly the observed maximum
cost_bin_edges = np.linspace(0, total_cost.max(), n_cost_classes + 1)
model_data['ninds_total_cost'] = pd.cut(
    total_cost,
    bins=cost_bin_edges,
    labels=list(range(1, n_cost_classes + 1)),
    include_lowest=True,  # a cost of exactly 0 belongs to class 1
).astype(float)  # keep the column float, as it was before discretizing

In [84]:
# Write the classification dataset (discretized ninds_total_cost) to CSV, row index included
classification_csv_path = 'data/model_data_classification.csv'
model_data.to_csv(classification_csv_path, index=True)