### Import the required Libraries

In [1]:
# importing libraries 
import random 
from nltk.corpus import names 
import nltk 
import pandas as pd 
import numpy as np
import os

### Create a function that uses a Naive Bayes classifier to generate predictions based on training data

In [114]:
def gender_features(word):
    """Return the prefix/suffix features of a name used by the classifier.

    Parameters
    ----------
    word : str
        The name (normally the first token of a given name) to describe.

    Returns
    -------
    dict
        The last 1-4 characters and the first 1-2 characters of ``word``,
        keyed by feature name.  Names shorter than a slice simply yield the
        whole name for that feature.
    """
    # Slices are used for the single-character features as well: unlike
    # word[-1] / word[0] they return '' for an empty name instead of raising
    # IndexError, and give the same result for every non-empty name.
    return {
        'last_2letters': word[-2:],
        'last_letter': word[-1:],
        'last_3letter': word[-3:],
        'last_4letter': word[-4:],
        'first_letter': word[:1],
        'first_2letter': word[:2],
    }
  
def get_prediction(country, male, female):
    """Train a naive Bayes gender classifier from labelled given names.

    Parameters
    ----------
    country :
        Label of the data set.  Only referenced by the optional diagnostic
        prints at the end of the function.
    male, female :
        Tables with a ``'given_name'`` column holding the names labelled
        ``'male'`` and ``'female'`` respectively.

    Returns
    -------
    nltk.NaiveBayesClassifier
        Classifier trained on the features of the first name token.

    Raises
    ------
    ValueError
        If neither table contains a usable name.
    """
    def first_tokens(frame):
        # Non-string entries (e.g. NaN for a missing name) cannot be split and
        # names that reduce to an empty token carry no features, so both are
        # left out of the training data.
        tokens = (name.strip().split(' ')[0]
                  for name in frame['given_name'] if isinstance(name, str))
        return [token for token in tokens if token]

    # preparing a list of examples and corresponding class labels.
    labeled_names = ([(token, 'male') for token in first_tokens(male)] +
                     [(token, 'female') for token in first_tokens(female)])
    if not labeled_names:
        raise ValueError("no labelled names available to train a classifier")

    random.shuffle(labeled_names)

    # we use the feature extractor to process the names data.
    featuresets = [(gender_features(first_name), label)
                   for (first_name, label) in labeled_names]

    # Divide the resulting list of feature sets into a training set and a
    # test set.  The hold-out is 100 examples, capped at a fifth of the data
    # so that small data sets still leave something to train on (a fixed
    # 100 would leave an empty or wrongly sliced training set).
    total = len(featuresets)
    holdout = min(100, total // 5)
    train_set, test_set = featuresets[:total - holdout], featuresets[total - holdout:]

    # The training set is used to train a new "naive Bayes" classifier.
    classifier = nltk.NaiveBayesClassifier.train(train_set)

    # Optional diagnostics (test_set is kept for them):
    #print(country, nltk.classify.accuracy(classifier, train_set))
    #print(country, nltk.classify.accuracy(classifier, test_set))
    #classifier.show_most_informative_features(200)

    return classifier

### Load the excel file that contains list of scientists whose gender is available

In [140]:
# Read the Excel file into a DataFrame.
# The path is a raw string: it contains backslashes (\B, \R, \l) that are not
# valid escape sequences and must be taken literally.
gender = pd.read_excel(r"F:\BU Sem 3\RA NBER\listofauthorswithfirstname_Encoded\listofauthorswithfirstname_Encoded.xlsx")

# set author id as index
gender = gender.set_index("author_id")

# save author's gender as dictionary (author id -> value of the GENDER column)
gender_dic = gender["GENDER"].to_dict()

### Load the excel file that contains the list of authors with their country

In [141]:
# Read the CSV file into a DataFrame.
# The path is a raw string: it contains backslashes (\B, \R, \s) that are not
# valid escape sequences and must be taken literally.
country = pd.read_csv(r"F:\BU Sem 3\RA NBER\scopus_authors.tofind.csv")

# set author id as index
country = country.set_index("author_id")

# save author's country as dictionary (author id -> affiliation_country)
country_dic = country["affiliation_country"].to_dict()

In [None]:
# Copy the known gender onto every author; if the author's gender is not
# available (author id absent from the gender table), assign gender = 4.
# The lookup is done for the whole index at once instead of one .loc write
# per author, and no longer rebinds the builtin name `id`.
has_gender = country.index.isin(gender.index)
country["gender"] = np.where(has_gender, country.index.map(gender_dic), 4)

In [14]:
# Save the merged author table.  Raw string: the backslashes in the Windows
# path (\B, \R) are not valid escape sequences and must be taken literally.
country.to_excel(r"F:\BU Sem 3\RA NBER/auth_gend_coun.xlsx")

In [143]:
# Reload the merged author table.  Raw string: the backslashes in the Windows
# path (\B, \R) are not valid escape sequences and must be taken literally.
country = pd.read_excel(r"F:\BU Sem 3\RA NBER/auth_gend_coun.xlsx")

# set author id as index
country = country.set_index("author_id")

### Predict gender for all authors whose gender is not available using gender of authors from that country as training data

In [None]:
countries = country.affiliation_country.unique()
for c in countries:
    # unique() also returns NaN when an author has no country.  NaN never
    # compares equal to anything and cannot be concatenated into a name, so
    # there is nothing to train or predict for it.
    if pd.isna(c):
        continue
    print(c)

    # all male authors in that country
    male = gender[(gender['GENDER'] == 1) & (gender['Country'] == c)]

    # all female authors in that country
    female = gender[(gender['GENDER'] == 2) & (gender['Country'] == c)]

    # a classifier cannot be trained without any labelled name
    if male.empty and female.empty:
        print("  no labelled names for", c, "- skipped")
        continue

    # generate a gender classifier for that country
    classifier = get_prediction(c, male, female)

    # Still published as classify_<country> so that other cells relying on
    # that name keep working; the loop itself uses the local variable.
    globals()["classify_" + c] = classifier

    # NOTE(review): the list of countries is taken from "affiliation_country"
    # but the rows are selected through a "Country" column - confirm that this
    # column exists in the author table and holds the same values.
    df = country[country["Country"] == c]
    for author_id in df.index.unique():
        # get the name of the author
        name = df.at[author_id, "given_name"]

        # predict the author's gender from the first token of the name;
        # a name without any token has no features and gets no prediction
        first_name = str(name).strip().split(' ')[0]
        if first_name:
            country.loc[author_id, "gender_pred"] = classifier.classify(gender_features(first_name))

        # dummy that is True when the gender was NOT already known (coded
        # neither 1 nor 2), i.e. when only the prediction is available
        known_gender = df.loc[author_id, "gender"]
        country.loc[author_id, "pred_dummy"] = not (known_gender == 1 or known_gender == 2)

### Add this data to the author-publication long dataset

In [None]:
# Folder holding the long author-publication files.  Raw string: the
# backslashes in the Windows path (\B, \R, \l) are not valid escape sequences
# and must be taken literally.
orig = r'F:\BU Sem 3\RA NBER\long data/'

# One row per author id, so that every id maps to exactly one value.
author_info = country[~country.index.duplicated(keep="first")]

for each in os.listdir(orig):
    # for every file

    # load the excel spreadsheet
    auth_data = pd.read_excel(orig + each)

    # ids of the focal authors, the only ones whose gender is needed
    focal_ids = auth_data.loc[auth_data["focal author"] == True, "auid"].unique()

    # Rows of the author table for those ids.  Focal authors missing from the
    # table are simply left without values instead of raising a KeyError.
    focal_info = author_info[author_info.index.isin(focal_ids)]

    # create dictionaries (author id -> value) for country, gender,
    # predicted gender and dummy; built once per file, not once per author
    country_dic = focal_info["affiliation_country"].to_dict()
    gender_dic = focal_info["gender"].to_dict()
    gender_pred_dic = focal_info["gender_pred"].to_dict()
    pred_dummy_dic = focal_info["pred_dummy"].to_dict()

    # map the dictionaries to the long dataset
    auth_data["country"] = auth_data['auid'].map(country_dic)
    auth_data["gender"] = auth_data['auid'].map(gender_dic)
    auth_data["gender_pred"] = auth_data['auid'].map(gender_pred_dic)
    auth_data["pred_dummy"] = auth_data['auid'].map(pred_dummy_dic)

    # NOTE(review): the input file is overwritten and to_excel is called with
    # its default index handling - confirm that re-running this cell on
    # already processed files is intended.
    auth_data.to_excel(orig + each)