# Code for Paper  
# “How Blind is Blind? Predicting Gendered Writing Styles in Academic Articles”

## Import All Necessary Modules

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
import csv
import concurrent.futures
import os
import re
import gender_guesser.detector as gender
from collections import Counter
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
from sklearn.utils import resample
from nltk.stem import PorterStemmer
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from scipy.sparse import hstack
import matplotlib.pyplot as plt
from sklearn.model_selection import validation_curve, learning_curve
from matplotlib.colors import LogNorm
from matplotlib.cm import ScalarMappable
from concurrent.futures import ThreadPoolExecutor
import datetime
import torch.optim as optim
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
import optuna


## Scraping Data

### Springer 
First, we collect the ID numbers of the 109 journals.

In [None]:

# Springer search results for English-language economics journals; "{}" is the page number.
base_url = "https://link.springer.com"
search_url = (
    f"{base_url}/search/page/{{}}"
    "?facet-discipline=%22Economics%22&facet-content-type=%22Journal%22&facet-language=%22En%22"
)

# The User-Agent identifies the project to the server.
headers = {
    "User-Agent": "Magnus Berg (University of Copenhagen), Accessing OA data for project ascertaining gendered writing styles in academia, email@edurome.ku.dk"
}

# Journal identifiers, in the order they appear in the search results.
Journal_list = []

# Walk result pages 1 through 6.
for page_number in range(1, 7):
    url = search_url.format(page_number)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    # Each result title links to ".../journal/<id>"; keep only the trailing id.
    Journal_list.extend(
        anchor.get("href").split("/journal/")[-1]
        for anchor in soup.find_all("a", class_="title")
    )

    # Pause one second between page requests.
    time.sleep(1)

# Numbered listing of everything that was collected.
for position, journal_id in enumerate(Journal_list, start=1):
    print(f"{position}. {journal_id}")


#### Scraping information
Now we scrape the article information, classify the authors by gender, and save each journal as its own CSV file so that progress is not lost.

In [None]:


# Directory that receives one CSV file per scraped journal.
# "path_to_save" is a placeholder; point it at a real directory before running.
path_to_save = "path_to_save"



# HTTP headers sent with the Springer requests made through fetch_soup.
headers = {
    'User-Agent': "Magnus Berg (University of Copenhagen), Accessing OA data for project ascertaining gendered writing styles in academia, email@edurome.ku.dk"
}

# Initialize gender detector
detector = gender.Detector()

def get_first_name(full_name):
    """Return the first whitespace-separated token of ``full_name``.

    Anything that cannot supply a first name yields the sentinel string
    "None": a missing value (``None``), a non-string such as the float NaN
    that pandas uses for empty cells, an empty string, or a whitespace-only
    string.
    """
    if not isinstance(full_name, str):
        return "None"
    parts = full_name.split()
    return parts[0] if parts else "None"

def predict_gender_first_name(name):
    """Return the detector's gender label for ``name``.

    The sentinel "None" (no first name available) is answered with "unknown"
    without consulting the detector.
    """
    if name == "None":
        return "unknown"
    return detector.get_gender(name)

def fetch_soup(session, url):
    """Fetch ``url`` through ``session`` and return it parsed, or None.

    A 429 response triggers a ten-minute pause followed by another attempt,
    repeated until the server answers with any other status. After that, one
    second is slept and the page is parsed if the status is 200; every other
    status yields None.
    """
    while True:
        response = session.get(url, headers=headers)
        if response.status_code != 429:
            break
        # Rate limited: back off, then ask again.
        print("429 Too Many Requests encountered. Pausing for 10 minutes...")
        time.sleep(600)

    time.sleep(1)  # One-second gap between requests
    if response.status_code != 200:
        return None
    return BeautifulSoup(response.content, 'html.parser')


MAX_AUTHORS = 10


def _parse_article(article):
    """Pull link, title, publication date and author names out of one article list item.

    Any element that is absent from the markup is reported as None (or an
    empty author list) instead of raising.
    """
    link_tag = article.find('a', href=True)
    title_tag = article.find('a', attrs={"data-track": "click"})
    published_date_tag = article.find('li', attrs={'data-test': 'published-on'})

    published_date = None
    if published_date_tag:
        # The date is the part that follows the first ': ' in the tag text.
        pieces = published_date_tag.text.split(': ')
        if len(pieces) > 1:
            published_date = pieces[1].strip()

    return {
        'Link': link_tag['href'] if link_tag else None,
        'Title': title_tag.text.strip() if title_tag else None,
        'Published Date': published_date,
        'authors': [author.text.strip() for author in article.select('ul.c-author-list li span')],
    }


def process_journal(journal_number):
    """Scrape every volume/issue listing of one Springer journal into a DataFrame.

    Volumes and issues are both numbered from 1. Issues of a volume are read
    until a page cannot be fetched or lists no articles; the scan stops
    when the first issue of the next volume cannot be fetched or is empty.

    Parameters:
        journal_number: journal identifier used in the Springer URL.

    Returns:
        DataFrame with one row per article: volume, issue, journal name,
        publication date, link, title, journal year, up to ``MAX_AUTHORS``
        author columns ("Author i") and matching "Gender_Author i" columns
        holding 'Male', 'Female' or 'Unknown'.
    """
    data = {
        'Volume': [],
        'Issue': [],
        'Journal Name': [],
        'Published Date': [],
        'Link': [],
        'Title': [],
        'Journal Year': []
    }

    for i in range(1, MAX_AUTHORS+1):
        data[f'Author {i}'] = []

    print(f"Scraping data for journal number {journal_number}...")

    with requests.Session() as session:
        base_url = f'https://link.springer.com/journal/{journal_number}/volumes-and-issues/'
        volume = 1

        while True:
            print(f"Scraping volume {volume}...")

            issue = 1
            while True:
                soup = fetch_soup(session, base_url + f'{volume}-{issue}')
                if not soup:
                    break

                articles = soup.find_all('li', class_='c-list-group__item')
                if not articles:
                    break

                # Guard the container lookup as well: a page without the
                # journalTitle div must give a None name, not an AttributeError.
                title_div = soup.find('div', {'id': 'journalTitle'})
                journal_tag = title_div.find('a') if title_div else None
                journal_name = journal_tag.text.strip() if journal_tag else None
                year_tag = soup.find('h1', class_='u-mb-8')
                journal_year = year_tag.text.split(",")[-1].strip() if year_tag else None

                for article in articles:
                    parsed = _parse_article(article)

                    data['Volume'].append(volume)
                    data['Issue'].append(issue)
                    data['Journal Name'].append(journal_name)
                    data['Published Date'].append(parsed['Published Date'])
                    data['Link'].append(parsed['Link'])
                    data['Title'].append(parsed['Title'])
                    data['Journal Year'].append(journal_year)

                    # Fill in authors or None values
                    authors = parsed['authors']
                    for i in range(MAX_AUTHORS):
                        data[f'Author {i+1}'].append(authors[i] if i < len(authors) else None)

                issue += 1

            time.sleep(2)  # Sleep for 2 seconds after processing each volume
            next_soup = fetch_soup(session, base_url + f'{volume+1}-1')
            if not next_soup or not next_soup.find_all('li', class_='c-list-group__item'):
                break

            volume += 1

    df = pd.DataFrame(data)

    label_map = {
        'male': 'Male',
        'female': 'Female',
        'unknown': 'Unknown',
        'None': 'Unknown'
    }
    for i in range(MAX_AUTHORS):
        col_name = f'Author {i+1}'
        raw_labels = df[col_name].apply(get_first_name).apply(predict_gender_first_name)
        # Labels outside the map (gender_guesser also reports ambiguous
        # results such as 'andy' or 'mostly_male') would become NaN; they are
        # recorded as 'Unknown' so later steps do not mistake them for a
        # known gender.
        df[f'Gender_{col_name}'] = raw_labels.map(label_map).fillna('Unknown')

    return df

# One DataFrame per journal, keyed by journal number
dfs = {}

import os

# Make sure the output directory is there before any worker finishes
if not os.path.exists(path_to_save):
    os.makedirs(path_to_save)


# Scrape several journals at once, capped at MAX_THREADS worker threads
MAX_THREADS = 4
with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
    futures = {}
    for journal in Journal_list:
        futures[executor.submit(process_journal, journal)] = journal

    # Handle journals in completion order so each CSV is written as soon as possible
    for future in concurrent.futures.as_completed(futures):
        journal_num = futures[future]
        try:
            frame = pd.DataFrame(future.result())
            dfs[journal_num] = frame
            # Persist immediately: a later failure cannot lose this journal
            frame.to_csv(f"{path_to_save}/journal_{journal_num}.csv", index=False)
            print(f"Saved CSV for journal {journal_num}")
        except Exception as e:
            print(f"Error processing journal {journal_num}: {e}")



# Dump every collected DataFrame, separated by a rule of 50 "=" characters
separator = "\n" + "=" * 50 + "\n"
for journal, df in dfs.items():
    print(f"DataFrame for journal {journal}:\n")
    print(df)
    print(separator)


#### Saving everything to a single file and dropping excess author columns

In [None]:

path_to_save = "path_to_save"
MAX_AUTHORS = 10

# Name of the combined file written at the end of this cell. It lands in the
# same directory the inputs are read from, so it must be excluded from the
# inputs; otherwise a second run would concatenate the previous output with
# the per-journal files and duplicate every row.
combined_filename = "all_journals.csv"

# List all per-journal CSV files in the directory (sorted for a reproducible row order)
all_files = sorted(
    f for f in os.listdir(path_to_save)
    if f.endswith('.csv')
    and f != combined_filename
    and os.path.isfile(os.path.join(path_to_save, f))
)

# Read and combine all CSV files
dfs = [pd.read_csv(os.path.join(path_to_save, f)) for f in all_files]
all_data = pd.concat(dfs, ignore_index=True)

# Drop author columns that are empty for every article, together with their gender columns
for i in range(1, MAX_AUTHORS + 1):
    author_col = f'Author {i}'
    gender_col = f'Gender_{author_col}'
    # Tolerate inputs that lack a column instead of failing with KeyError
    if author_col in all_data.columns and all_data[author_col].isna().all():
        all_data.drop(columns=[author_col, gender_col], inplace=True, errors='ignore')

# Save the combined dataframe to a CSV file
all_data.to_csv(f"{path_to_save}/{combined_filename}", index=False)
print("Saved all data to all_journals.csv")

# Print out the combined dataframe
print(all_data)
print("\n" + "="*50 + "\n")


Derive each article's gender from the genders of its authors

In [None]:
# Function to get the article gender based on author genders
def get_article_gender(row):
    """Classify an article from the gender labels of its first three authors.

    Only the labels 'Male' and 'Female' count as known. Everything else is
    ignored: 'Unknown', missing values (NaN/None) and author columns that are
    absent from ``row`` altogether.

    Parameters:
        row: mapping-like row (pandas Series or dict) holding
            'Gender_Author 1' .. 'Gender_Author 3'.

    Returns:
        'Unknown' when no author has a known gender, 'Male' or 'Female' when
        all known authors share that gender, otherwise 'Mix'.
    """
    known = set()
    for i in range(1, 4):
        label = row.get(f'Gender_Author {i}')
        # NaN compares unequal to both labels, so it is skipped here rather
        # than being counted as a third gender and forcing 'Mix'.
        if label == 'Male' or label == 'Female':
            known.add(label)

    if not known:
        return 'Unknown'
    if len(known) == 1:
        return next(iter(known))
    return 'Mix'

# Apply the function to create the "Article_Gender" column
# (axis=1 hands get_article_gender one row, i.e. one article, at a time)
all_data['Article_Gender'] = all_data.apply(get_article_gender, axis=1)

# Display the updated DataFrame
print(all_data)


Now we scrape the introductions of all papers that are classified as Male or Female

In [None]:


# Headers identifying the project to the server.
headers = {
    'User-Agent': "Magnus Berg (University of Copenhagen), Accessing OA data for project ascertaining gendered writing styles in academia, email@edurome.ku.dk"
}

# Progress counter, reset per journal by the scraping loop. It is kept in a
# one-element list so worker code can update it in place.
counter = [0]

def scrape_intro_and_citations(row):
    """Download one article page and extract its introduction and citation count.

    Parameters:
        row: plain tuple produced by ``all_data.itertuples(index=True, name=None)``;
            element 0 is the DataFrame index, the remaining elements follow
            the column order of ``all_data``.

    Returns:
        Tuple ``(index, intro, citations)``. ``intro`` and ``citations`` are
        None when the link is missing, the page is not served with status 200,
        or the corresponding element is not on the page.

    Notes:
        Reads the globals ``all_data``, ``headers``, ``counter`` and
        ``journal_name`` (the latter only for the progress message). A 429
        response pauses for five minutes and then retries the same URL.
    """
    row_index = row[0]
    # +1 because the tuple carries the index in front of the column values.
    url = row[all_data.columns.tolist().index('Link') + 1]

    # Missing links arrive as None, "" or NaN; NaN is a truthy float, so a
    # plain truthiness test would let it through to requests.
    if not isinstance(url, str) or not url:
        return (row_index, None, None)

    while True:
        # Send the project User-Agent with every article request.
        response = requests.get(url, headers=headers)
        time.sleep(1)  # Sleep for 1 second after each request

        # Increment the counter and print if it's a multiple of 100
        counter[0] += 1
        if counter[0] % 100 == 0:
            timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            print(f"{timestamp}: {journal_name} {counter[0]}")

        if response.status_code != 429:
            break
        time.sleep(300)  # Rate limited: wait five minutes, then retry

    if response.status_code != 200:
        return (row_index, None, None)

    soup = BeautifulSoup(response.text, 'html.parser')

    # Scrape the introduction
    intro_div = soup.find('div', class_='c-article-section__content', id='Sec1-content')
    intro = ' '.join(p.get_text() for p in intro_div.find_all('p')) if intro_div else None

    # Scrape the number of citations: the first word of the metrics paragraph
    # that encloses the "Citations" label.
    citations = None
    citations_tag = soup.find('span', string='Citations')
    if citations_tag:
        count_tag = citations_tag.find_parent('p', class_='c-article-metrics-bar__count')
        words = count_tag.get_text().split() if count_tag else []
        citations = words[0] if words else None

    return (row_index, intro, citations)

# Directory that receives one "<journal name>.csv" per journal (placeholder path).
path_to_save = "path_to_save"

# NOTE: the loop variable must stay a global called `journal_name`:
# scrape_intro_and_citations reads it when printing its progress message.
for journal_name in all_data['Journal Name'].unique():
    file_path = f"{path_to_save}/{journal_name}.csv"

    # Check if the file already exists
    # (this makes the cell resumable: finished journals are not scraped again)
    if os.path.exists(file_path):
        print(f"Journal {journal_name} already processed. Skipping...")
        continue

    counter[0] = 0  # Reset counter for each journal
    print(f"Processing journal: {journal_name}")

    # Get all rows for the particular journal
    journal_data = all_data[all_data['Journal Name'] == journal_name].copy()

    # Filter only the rows with 'Female' or 'Male' for scraping
    scrape_data = journal_data[journal_data['Article_Gender'].isin(['Female', 'Male'])]

    # Use ThreadPoolExecutor to scrape data concurrently
    # (rows are passed as plain tuples whose first element is the DataFrame index)
    with ThreadPoolExecutor(max_workers=5) as executor:
        results = list(executor.map(scrape_intro_and_citations, scrape_data.itertuples(index=True, name=None)))

    # Update the journal_data DataFrame directly with the scraped values
    # (rows that were not scraped keep missing values in 'Intro' and 'Citations')
    for index, intro, citations in results:
        journal_data.at[index, 'Intro'] = intro
        journal_data.at[index, 'Citations'] = citations

    # Save the entire journal_data DataFrame (including rows with 'Mix' and 'Unknown') to a CSV
    journal_data.to_csv(file_path, index=False)

    print(f"Saved data for journal {journal_name} to {file_path}")


Combine the introductions from all journals into a single file

In [None]:


path_to_save = "path_to_save"

# The combined file is written into the directory that is being read, so it
# has to be skipped as an input; otherwise re-running this cell would append
# the previous output to itself and duplicate every row.
output_filename = "Alle_Journal_intro.csv"

# NOTE(review): every other *.csv in this directory is concatenated. If the
# per-journal listing files from the earlier scraping step live in the same
# directory, move them elsewhere first - confirm the directory layout.

# List to store dataframes
dfs = []

# Iterate over each CSV file in the directory (sorted for a reproducible row order)
for filename in sorted(os.listdir(path_to_save)):
    if filename.endswith(".csv") and filename != output_filename:
        file_path = os.path.join(path_to_save, filename)
        df = pd.read_csv(file_path)
        dfs.append(df)

# Concatenate all dataframes
total_df = pd.concat(dfs, ignore_index=True)

# Save the total dataframe to a new CSV file
output_file_path = os.path.join(path_to_save, output_filename)
total_df.to_csv(output_file_path, index=False)

print(f"Saved the concatenated DataFrame to {output_file_path}")


Create a dummy variable marking the observations that contain an introduction

In [None]:
# Step 1: Copy the original DataFrame
# (total_df itself is left without the extra column)
total_df_check = total_df.copy()

# Step 2: Add a new column to the copied DataFrame
# Intro_check is 1 where 'Intro' holds a value and 0 where it is missing.
total_df_check['Intro_check'] = total_df['Intro'].notnull().astype(int)

# Step 3: Display the updated DataFrame
print(total_df_check)


### ScienceDirect (Elsevier)
To limit the data quantity transferred between steps, a lot of the preliminary data cleaning occurs concurrently with the scraping process.

In [None]:
# Example ScienceDirect article URL.
# NOTE(review): this module-level value does not appear to be read by the
# scraping functions in this section, which use their own local `url` - confirm
# before removing.
url = 'https://www.sciencedirect.com/science/article/pii/S0927537123000404'
# User-agent string that identifies the project to the server.
user_agent = (
    "Magnus Eldrup (University of Copenhagen), Accessing OA data for project ascertaining gendered writing styles in academia"
)

# Use a headless Chrome browser ('--headless=new') with a custom user-agent
options = webdriver.ChromeOptions()
options.add_argument(f"user-agent={user_agent}")
options.add_argument('--headless=new')

#### Index articles

In [None]:
# Journals to scrape, as (ScienceDirect journal slug, volume number to start from).
# Scraping walks backwards from the given volume. Each journal appears exactly
# once: a repeated entry is scraped a second time only to overwrite the CSV
# produced by the first pass.
scrape = [
    ('the-journal-of-socio-economics', 48),
    ('journal-of-economic-theory', 212),
    ('journal-of-international-economics', 145),
    ('international-economics', 175),
    ('resource-and-energy-economics', 74),
    ('economics-letters', 231),
    ('economic-modelling', 127),
    ('journal-of-public-economics', 225),
    ('journal-of-econometrics', 236),
    ('journal-of-development-economics', 165),
    ('journal-of-applied-economics', 20),
]

n_vol = 45 #No. of volumes to scrape per journal

def index_volumes(journal,vol, n_vol):
    """Collect the URL of the first article of each of the latest volumes of a journal.

    Parameters:
        journal: ScienceDirect journal slug used in the URL.
        vol: highest volume number to visit.
        n_vol: number of volumes to visit, counting down from ``vol``. The
            countdown stops at volume 1, so journals with fewer than
            ``n_vol`` volumes do not trigger requests for volume 0 or
            negative volumes.

    Returns:
        List with the browser's current URL after clicking the first article
        link of each volume page. Volumes for which this fails are reported
        on stdout and skipped.
    """
    url_base = "https://www.sciencedirect.com/journal/" + str(journal) + "/vol/"
    url_list = [
        url_base + str(volume) + "/suppl/C"
        for volume in range(vol, max(vol - n_vol, 0), -1)
    ]

    first_page_url = []

    driver = webdriver.Chrome(options=options)
    try:
        wait = WebDriverWait(driver, 10)
        for it, url in enumerate(url_list):
            print(f"Processing URL {it+1}/{len(url_list)}: {url}")
            try:
                driver.get(url)

                # Wait for the first link with class "article-content-title" to be clickable
                link = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.article-content-title')))

                # Click the link
                link.click()

                # Get the current URL of the page
                first_page_url.append(driver.current_url)
            except Exception as e:
                # Best effort: report the failing volume (with the reason) and carry on.
                print(f"An error occurred for URL {url}: {e}")
    finally:
        # Always shut the browser down so no Chrome process is left behind.
        driver.quit()

    return first_page_url

#### Scrape Journal

In [None]:
def scrape_article(first_page_url):
    """Walk through the articles of each volume and collect their page sources.

    For every starting URL a browser is opened on that page; then, up to 50
    times, the function waits five seconds, stores the current page source
    and clicks the "next" button. The walk for a volume ends early when the
    button cannot be clicked.

    Parameters:
        first_page_url: iterable of URLs, one starting article per volume.

    Returns:
        List with one entry per starting URL; each entry is the list of page
        sources collected for that volume. Empty input yields an empty list.
    """
    journal = []
    for it, url in enumerate(first_page_url):
        print(f"Processing URL {it+1}/{len(first_page_url)}: {url}")

        page_source = []

        driver = webdriver.Chrome(options=options)
        try:
            driver.get(url)
            wait = WebDriverWait(driver, 10)

            for _ in range(50):
                time.sleep(5)  # Wait for a longer time between requests

                page_source.append(driver.page_source)

                try:
                    # Wait for the next button to become clickable before clicking it
                    next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.button-alternative-icon-right')))
                    next_button.click()
                except Exception as e:
                    print("Error with article")
                    break
        finally:
            # Close this volume's browser. One browser is started per URL, so
            # quitting only after the loop would leak all but the last one.
            driver.quit()

        journal.append(page_source)

    return journal

#### Data Treatment

In [None]:
def create_df(page_source):
    """Turn a list of article HTML pages into a DataFrame with one row per page.

    Parameters:
        page_source: iterable of HTML strings.

    Returns:
        DataFrame with the columns 'Introduction', 'Author 1' .. 'Author 5',
        'Journal', 'Year' and 'Citations':

        * Introduction - text of the first <section> inside <div id="body">,
          or None when there is none.
        * Author i - first token of the i-th "given-name" span; "Unknown"
          when that token contains a "." (an initial), "None" when the page
          has fewer than i authors or the span is empty.
        * Journal - text of the "publication-title-link" anchor, or
          "Error no journal name".
        * Year - raw text of the first <div class="text-xs">, or None.
        * Citations - digits of the third word of the
          "citing-articles-header" element as an int, or None.
    """
    max_authors = 5
    df_data = []

    for html in page_source:
        soup = BeautifulSoup(html, 'html.parser')

        # Introduction: the first <section> within <div id="body">
        body_div = soup.find('div', id='body')
        first_section = body_div.find('section') if body_div else None
        introduction = first_section.get_text(strip=True) if first_section else None

        # Authors: given names of the first five authors
        author_spans = soup.find_all('span', class_='given-name')
        authors_dict = {}
        for i in range(max_authors):
            key = f'Author {i+1}'
            tokens = author_spans[i].get_text().split() if i < len(author_spans) else []
            if not tokens:
                # No such author on this page, or an empty name element
                authors_dict[key] = "None"
            elif "." in tokens[0]:
                # An initial such as "J." cannot be matched to a gender
                authors_dict[key] = "Unknown"
            else:
                authors_dict[key] = tokens[0]

        # Journal name
        title_link = soup.find('a', class_="publication-title-link")
        journal_name = str(title_link.get_text()) if title_link else "Error no journal name"

        # Citations: digits contained in the third word of the header text
        citations = None
        citing_header = soup.find('header', id="citing-articles-header")
        if citing_header:
            words = citing_header.get_text().split()
            digits = ''.join(filter(str.isdigit, words[2])) if len(words) > 2 else ''
            if digits:
                citations = int(digits)

        # Year: kept as raw text here
        year_div = soup.find('div', class_="text-xs")
        year = year_div.get_text() if year_div else None

        # Append the data for this row to the list of rows
        df_data.append({'Introduction': introduction, **authors_dict, 'Journal': journal_name, 'Year': year, 'Citations': citations})

    df = pd.DataFrame(df_data)

    return df

In [None]:
### Extract the year from the code ###
def extract_year(s):
    """Return the first run of four digits found in ``s`` as a string.

    None is returned when ``s`` is None or contains no four consecutive digits.
    """
    if s is None:
        return None
    found = re.search(r'(\d{4})', s)
    return found.group(1) if found else None

In [None]:
def combine_dataframes(list_of_lists):
    """Build one DataFrame from several lists of article pages.

    Each sublist is converted with ``create_df`` and the results are stacked
    with a fresh 0..n-1 index.

    Parameters:
        list_of_lists: iterable of page-source lists, one per volume.

    Returns:
        The concatenated DataFrame; an empty DataFrame for empty input.
    """
    # Build all frames first and concatenate once instead of growing a
    # DataFrame inside the loop.
    frames = [create_df(sublist) for sublist in list_of_lists]
    if not frames:
        return pd.DataFrame()

    # NOTE(review): 'Year' is returned as the raw text scraped by create_df.
    # The previous implementation called extract_year on the last per-volume
    # frame after it had already been concatenated, which never changed the
    # returned data (and failed on empty input). The later merge step extracts
    # "<month> <year>" from this raw text, so the values are left untouched
    # here - confirm whether a bare four-digit year was intended instead.
    return pd.concat(frames, ignore_index=True)

#### Execute Scraping & Export

In [None]:
articles = []

# One pass per journal: index its volumes, download the article pages,
# convert them to a table and write that table next to the notebook.
for name_of_journal, j_vol in scrape:
    print(name_of_journal)

    url_index = index_volumes(name_of_journal, j_vol, n_vol)
    print(f"no of vols {len(url_index)}")

    data = scrape_article(url_index)
    data_treated = combine_dataframes(data)

    # Output file: articles_out_<journal slug>.csv
    csv_file = f"articles_out_{name_of_journal}.csv"
    data_treated.to_csv(csv_file, index=False)
    

## Cleaning & Combining Data

#### ScienceDirect
This first piece of code takes all the individual scraped journals from ScienceDirect and assigns genders and exports them. Be aware of the directory from which it pulls the files. It may be necessary to move some files for this process to work correctly.

In [None]:
# Set to your own file directory!
directory = 'scrape_raw'

# Names of the files found by the most recent index_files() call.
file_list = []

def index_files():
    """List the regular files in ``directory``.

    The module-level ``file_list`` is refreshed in place, so calling this
    function repeatedly never accumulates duplicate entries. Names are sorted
    to make the processing order reproducible.

    Returns:
        ``file_list``, holding the file names (not full paths).
    """
    file_list[:] = sorted(
        filename
        for filename in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, filename))
    )
    print(file_list)
    return file_list


def import_file(name):
    """Read the CSV file ``name`` located in ``directory`` into a DataFrame."""
    return pd.read_csv(os.path.join(directory, name))


## ASSIGNING GENDER ##

# Initialize the gender detector
# (gender_guesser detector, consulted first by predict_gender_first_name below)
detector = gender.Detector()

# Function to predict gender using the genderize.io API
def predict_gender_first_name_api(name):
    """Ask the genderize.io API for the gender of a first name.

    Parameters:
        name: first name to look up.

    Returns:
        The API's gender capitalised (e.g. "Male", "Female"), or "Unknown"
        when the request fails, the status is not 200, the body is not the
        expected JSON object, or the API reports no gender for the name.
    """
    try:
        # `params` URL-encodes the name, so spaces, "&" and non-ASCII
        # characters cannot corrupt the query string. The timeout keeps a
        # stalled connection from blocking the whole run.
        response = requests.get('https://api.genderize.io', params={'name': name}, timeout=30)
    except requests.RequestException:
        return "Unknown"

    if response.status_code != 200:
        return "Unknown"

    try:
        data = response.json()
    except ValueError:
        return "Unknown"

    # The gender field is absent or null when the API has no prediction.
    predicted = data.get("gender") if isinstance(data, dict) else None
    if not isinstance(predicted, str) or not predicted:
        return "Unknown"
    return predicted.capitalize()
        
    

# Function to predict gender for the first name
def predict_gender_first_name(name):
    """Predict the gender belonging to a first name.

    Parameters:
        name: first name, or the sentinel "None" for a missing author.

    Returns:
        * "None" when there is nothing to classify: the sentinel "None", a
          non-string value (e.g. the NaN pandas reads for an empty cell) or a
          name containing "." (an initial).
        * 'male' or 'female' when the local detector is certain.
        * The genderize.io answer ("Male", "Female" or "Unknown") when the
          detector answers 'unknown'.
        * "Unknown" for every other detector answer.
    """
    # Validate first, so the detector is never handed a missing value.
    if not isinstance(name, str) or name == "None" or "." in name:
        return "None"

    guess = detector.get_gender(name)
    if guess == "unknown":
        # The detector does not know the name: fall back to the web API.
        return predict_gender_first_name_api(name)
    if guess in {'male', 'female'}:
        return guess
    return "Unknown"

    
# Function to get the article gender based on author genders
def get_article_gender(row):
    """Classify an article from the gender labels of its five author slots.

    Null labels are skipped. Returns 'Unknown' when no label remains, 'Male'
    when every remaining label is 'male', 'Female' when every remaining label
    is 'female', and 'Mix' otherwise.

    NOTE(review): a non-null label such as 'unknown' is neither 'male' nor
    'female' and therefore produces 'Mix' - confirm that this is intended.
    """
    labels = []
    for position in range(1, 6):
        label = row[f'Gender_Author {position}']
        if pd.isnull(label):
            continue
        labels.append(label)

    if not labels:
        return 'Unknown'
    if all(label == 'male' for label in labels):
        return 'Male'
    if all(label == 'female' for label in labels):
        return 'Female'
    return 'Mix'
    
# Exporting the processed dataframes as .csv files
def export_treated(df):
    """Write ``df`` to ``treated_data/<journal>_treated.csv``.

    The journal name is taken from the first row of the 'Journal' column,
    whatever its index label is. Spaces and path separators become
    underscores; periods and commas are removed.

    Parameters:
        df: DataFrame with a 'Journal' column and at least one row.

    Returns:
        The string 'Success'.

    Raises:
        ValueError: if ``df`` has no rows, so no file name can be derived.
    """
    if df.empty:
        raise ValueError("Cannot export an empty DataFrame: no journal name available")

    # Positional access: the first row is not necessarily labelled 0.
    name = str(df['Journal'].iloc[0])

    # Remove any special characters and spaces from the name to make it suitable for a filename
    name = name.replace(' ', '_').replace('.', '').replace(',', '')
    name = name.replace('/', '_').replace('\\', '_')

    # Define the directory to save the treated data
    treated_directory = 'treated_data'

    # Create the directory if it doesn't exist
    os.makedirs(treated_directory, exist_ok=True)

    # Export the DataFrame to a CSV file within the treated_data directory
    csv_filename = os.path.join(treated_directory, name + '_treated.csv')
    df.to_csv(csv_filename, index=False)

    print(f"DataFrame exported as {csv_filename}")

    return 'Success'


#### Execute gender prediction & export ScienceDirect

In [None]:
# Executes the code above: for every raw journal file, predict author genders,
# derive the article gender, report the female counts and export the result.
files = index_files()
Status = []

for j_title in files:
    print(j_title)
    df = import_file(j_title)

    # Apply gender prediction to the first name in each author column and create new gender columns
    for author_no in range(1, 6):
        author_col = f'Author {author_no}'
        gender_col = f'Gender_{author_col}'
        predicted = df[author_col].apply(predict_gender_first_name)

        # Normalise case, then turn the "None" sentinel (no such author) into
        # a real missing value. The dict form of replace is used on purpose:
        # Series.replace('None', None) is version-dependent and, in pandas
        # releases before 1.4, forward-filled the previous row's value instead
        # of inserting a missing value.
        df[gender_col] = predicted.str.lower().replace({'none': np.nan})

    # Apply the function to create the "Article_Gender" column
    df['Article_Gender'] = df.apply(get_article_gender, axis=1)

    # Report how many articles are all-female and how many have a female first author
    female_count = len(df[df['Article_Gender'] == 'Female'])
    print(f"Number of rows with 'Article_Gender' == 'Female': {female_count}")
    female_count2 = len(df[df['Gender_Author 1'] == 'female'])
    print(f"Number of rows with first author == 'Female': {female_count2}")

    s = export_treated(df)
    Status.append(s)

#### Merge Documents, Science Direct
This following section combines all the treated journals from ScienceDirect into a single .csv file. Once again, it is essential to ensure that the directory of the code matches the directory where your actual treated files (and only your treated files) are stored

In [None]:
# Get the current directory
current_directory = os.getcwd()

# The combined file is written into the directory that is scanned for inputs,
# so it is excluded from the inputs; otherwise a re-run would concatenate the
# previous result with the treated files and duplicate every row.
output_file = "ScienceDirectData.csv"

# NOTE(review): export_treated writes its files to 'treated_data', while this
# cell reads the current working directory - run it from the directory that
# holds the treated files (and only those files).

# Get a list of all .csv files in the current directory (sorted for a reproducible row order)
csv_files = sorted(
    file for file in os.listdir(current_directory)
    if file.endswith(".csv") and file != output_file
)

# Initialize an empty list to store dataframes
dataframes = []

print(csv_files)

# Read each .csv file and append its content to the list
for csv_file in csv_files:
    file_path = os.path.join(current_directory, csv_file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

# Concatenate all dataframes into a single dataframe
combined_df = pd.concat(dataframes, ignore_index=True)

# Export the combined dataframe to a new .csv file
output_path = os.path.join(current_directory, output_file)
combined_df.to_csv(output_path, index=False)

print(f"Combined dataframe exported to {output_file}")

#### Load Springer and ScienceDirect (SD) data

In [None]:
file_path = "file_path"

# Read the ScienceDirect CSV, keep only rows that have an introduction and
# renumber the remaining rows from zero.
SD = (
    pd.read_csv(file_path, low_memory=False)
    .dropna(subset=['Introduction'])
    .reset_index(drop=True)
)

print(SD)


In [None]:

file_path = "file_path"

# Read the Springer CSV, keep only rows that have an introduction and
# renumber the remaining rows from zero.
springer = (
    pd.read_csv(file_path)
    .dropna(subset=['Intro'])
    .reset_index(drop=True)
)

print(springer)

#### combine data

Rename and merge

In [None]:
# Bring both sources onto the same column names.
SD = SD.rename(columns={"Journal": "Journal Name", "Year" : "Journal Year"})
springer = springer.rename(columns={"Intro": "Introduction"})


columns_df1 = set(SD.columns)
columns_df2 = set(springer.columns)

# Find common column names
common_columns = columns_df1.intersection(columns_df2)

# Find differing column names
different_columns_df1 = columns_df1 - common_columns
different_columns_df2 = columns_df2 - common_columns

# Print results
print("Common columns:", common_columns)
print("Columns in DataFrame 1 only:", different_columns_df1)
print("Columns in DataFrame 2 only:", different_columns_df2)


# Identify columns to keep from DataFrame 1
cols_to_keep_from_df1 = {'Gender_Author 4', 'Author 4', 'Gender_Author 5', 'Author 5'}

# Identify columns to exclude from DataFrame 2
cols_to_exclude_from_df2 = {'Volume', 'Published Date', 'Issue', 'Intro_check', 'Title', 'Link'}

# Fill the non-existing columns in SD with NaN. This has to happen before the
# column selection below: selecting a column that does not exist raises
# KeyError, so filling afterwards can never take effect.
for col in cols_to_keep_from_df1:
    if col not in SD.columns:
        SD[col] = np.nan

# Filter the dataframes (sorted column lists keep the intermediate files
# reproducible; set iteration order is arbitrary)
SD_filtered = SD[sorted(common_columns.union(cols_to_keep_from_df1))]
springer_filtered = springer[sorted(columns_df2 - cols_to_exclude_from_df2)]

# Concatenate the two dataframes vertically; ignore_index already renumbers the rows
Journal_merged = pd.concat([SD_filtered, springer_filtered], axis=0, ignore_index=True)

# NOTE(review): the output paths in this cell are placeholders and differ from
# one another (one of them starts with "/", i.e. the filesystem root) - replace
# them with real paths before running.

# Save the combined dataframe to a CSV
Journal_merged.to_csv("Journal_merged_path", index=False)


# Use a regular expression to extract the month and year
# (expand=False returns a Series, which is what a single column assignment expects)
Journal_merged['Journal Year'] = Journal_merged['Journal Year'].str.extract(r'(\w+ \d{4})', expand=False)

# Save the modified DataFrame to a CSV
Journal_merged.to_csv("/Journal_merged_path", index=False)


# Filter the rows where Article_Gender is either 'Female' or 'Male'.
# .copy() makes the result an independent frame, so the column added next is
# not written to a slice of the unfiltered data.
Journal_merged = Journal_merged[Journal_merged['Article_Gender'].isin(['Female', 'Male'])].copy()

# Create a new column gen_dummy: 1 for Female and 0 for Male
Journal_merged['gen_dummy'] = Journal_merged['Article_Gender'].map({'Female': 1, 'Male': 0})

# Reset the index after filtering
Journal_merged.reset_index(drop=True, inplace=True)

# Save the cleaned DataFrame to a CSV
Journal_merged.to_csv("Journal_merged_path.csv", index=False)


# Specify the desired column order
column_order = [
    'Introduction',
    'Article_Gender',
    "gen_dummy",
    'Journal Year',
    'Journal Name',
    'Citations',
    'Author 1',
    'Gender_Author 1',
    'Author 2',
    'Gender_Author 2',
    'Author 3',
    'Gender_Author 3',
    'Author 4',
    'Gender_Author 4',
    'Author 5',
    'Gender_Author 5'
]

# Reorder the columns in the DataFrame (columns not listed above are dropped)
Journal_merged = Journal_merged[column_order]

# Save the rearranged DataFrame to a CSV
Journal_merged.to_csv("Journal_merged_path.csv", index=False)




#### Final cleaning of data

In [None]:
# Step 1: Read the CSV file into a pandas DataFrame
# NOTE(review): the merge step saves its result as "Journal_merged_path.csv";
# confirm that the file name used here points at that output.
combined_df = pd.read_csv("journal_merged.csv")

# Step 2: Rename the column "Introduction" to "Intro"
combined_df.rename(columns={"Introduction": "Intro"}, inplace=True)

# Step 3: Drop rows where the "Intro" column has NaN values
# (the text-cleaning functions below expect strings)
combined_df.dropna(subset=['Intro'], inplace=True)

# Make a copy of the original dataframe to work on
df_cleaned = combined_df.copy()

# Set up the stop words
# (a set, so the membership test in remove_stop_words is a constant-time lookup)
stop_words = set(stopwords.words('english'))

# Function to clean the text
# Patterns are compiled once, as raw strings ('\s' in a plain string literal
# is an invalid escape sequence and triggers a warning on recent Python versions).
_PARENTHESISED = re.compile(r'\([^)]*\)')
_HTML_TAG = re.compile(r'<.*?>')
_NON_LETTER = re.compile(r'[^a-zA-Z]')
_WHITESPACE_RUN = re.compile(r'\s+')


def clean_text(text):
    """Normalise raw introduction text to lowercase words separated by single spaces.

    Steps, in order: drop everything between parentheses, lowercase, drop
    HTML-like tags, replace every non-letter character with a space, collapse
    runs of whitespace and strip the ends.

    Parameters:
        text: the raw text (a string).

    Returns:
        The cleaned string; may be empty.
    """
    # Remove content between parentheses using regex
    text = _PARENTHESISED.sub('', text)
    # Convert to lowercase
    text = text.lower()
    # Remove HTML tags/markup
    text = _HTML_TAG.sub('', text)
    # Remove punctuations, numbers and other non-alphabetic characters
    text = _NON_LETTER.sub(' ', text)
    # Replace multiple spaces with a single space
    return _WHITESPACE_RUN.sub(' ', text).strip()

# Function to remove stop words
def remove_stop_words(text):
    """Return *text* without the tokens contained in ``stop_words``.

    The text is split with NLTK's ``word_tokenize``; the remaining tokens
    are joined back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Function for lemmatization
def lemmatize_text(text):
    """Lemmatize every token of *text* with NLTK's WordNetLemmatizer.

    Tokens come from ``word_tokenize`` and the lemmas are returned joined
    by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    return ' '.join(map(lemmatizer.lemmatize, word_tokenize(text)))

# Running total of process_and_track calls; it is never reset, so it keeps
# growing across every pass that goes through process_and_track
counter = 0


def process_and_track(func, text):
    """Apply *func* to *text* and return its result, counting the call.

    Increments the module-level ``counter`` and prints a progress line on
    every 10,000th call.
    """
    global counter
    counter = counter + 1
    if not counter % 10000:
        print(f"Processed {counter} records for cleaning.")
    return func(text)

# Run the three passes in sequence: basic cleaning, stop-word removal,
# lemmatization. Each pass consumes the output of the previous one.
cleaning_pipeline = [clean_text, remove_stop_words, lemmatize_text]
intro_series = df_cleaned['Intro']
for step in cleaning_pipeline:
    intro_series = intro_series.apply(lambda doc, step=step: process_and_track(step, doc))
df_cleaned['Intro_Cleaned'] = intro_series

# Save cleaned version
df_cleaned.to_csv('data_cleaned.csv', index=False)

## Analysis

Note that most of the analysis in this paper was run on external compute servers and may therefore be difficult to reproduce on a personal computer. In addition, the random forest (RF) and logistic regression cells only run after the final 'data_cleaned.csv' has been loaded.

### Logistic Regression

### Random Forest

#### Baseline model

In [None]:
# Load the data
# 'data_cleaned.csv' is the file written at the end of the final cleaning cell
df = pd.read_csv('data_cleaned.csv')

def process_column_rf(column_name, data):
    """Train and evaluate a baseline RandomForest on one text column.

    Rows where *column_name* is missing are dropped. The remaining rows are
    split 80/20 (``random_state=42``), the texts are turned into TF-IDF
    features over unigrams and bigrams (at most 5000 features), and a
    100-tree class-balanced forest is fitted on the training part.

    Args:
        column_name: Name of the text column in *data* to vectorise.
        data: DataFrame containing *column_name* and 'Article_Gender'
            (labels 'Female' / 'Male').

    Returns:
        dict with precision/recall/F1 for 'Female' and for 'Male' on the
        test split, plus 'accuracy_score'.
    """
    data_cleaned = data.dropna(subset=[column_name])
    texts = data_cleaned[column_name]
    y = data_cleaned['Article_Gender']

    # Split BEFORE vectorising: the vocabulary and IDF weights must be
    # learned from the training documents only, otherwise information from
    # the test set leaks into the features. With the same random_state and
    # sample count the rows land in the same partitions as before.
    texts_train, texts_test, y_train, y_test = train_test_split(
        texts, y, test_size=0.2, random_state=42
    )

    # Vectorize using unigrams & bigrams
    tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
    X_train = tfidf_vectorizer.fit_transform(texts_train)
    X_test = tfidf_vectorizer.transform(texts_test)

    # Train RandomForest without cross-validation
    rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', oob_score=True, random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    # Per-class scores, Female first so the key order matches the report
    results = {}
    for gender_label in ('Female', 'Male'):
        suffix = gender_label.lower()
        results[f'precision_{suffix}'] = precision_score(y_test, y_pred, pos_label=gender_label, zero_division=0)
        results[f'recall_{suffix}'] = recall_score(y_test, y_pred, pos_label=gender_label, zero_division=0)
        results[f'f1_score_{suffix}'] = f1_score(y_test, y_pred, pos_label=gender_label, zero_division=0)

    # Accuracy score
    results['accuracy_score'] = accuracy_score(y_test, y_pred)

    return results

# Baseline run on the cleaned introductions (text features only)
results_intro_cleaned = process_column_rf('Intro_Cleaned', df)

# One two-line report per metric, in the order the function filled the dict
for metric in results_intro_cleaned:
    score = results_intro_cleaned[metric]
    print(f"Results for 'Intro_Cleaned' without dummies using {metric}:")
    print(f"Score: {score}\n")

#### Cross-validation

In [None]:
# Load the data and swap the coding of gen_dummy (1 <-> 0); the scorer and
# metrics further down in this cell use pos_label=0 for women
df = pd.read_csv('data_cleaned.csv')
df['gen_dummy'] = df['gen_dummy'].replace({1: 0, 0: 1})

# Only rows with cleaned text can be vectorised
data_cleaned = df.dropna(subset=['Intro_Cleaned'])
x, y = data_cleaned['Intro_Cleaned'], data_cleaned['gen_dummy']

results = {}

# Per-run female F1 and accuracy, appended to by the loops below
f1_wom_vec, acc_vec = [], []

# make_scorer is not among the sklearn.metrics names imported in the
# notebook's import cell, so bring it into scope here
from sklearn.metrics import make_scorer


# Create a custom scorer for F1 score for women (pos=0)
def f1_score_women(y_true, y_pred):
    """Return the F1 score of label 0 (0 when the score is undefined)."""
    return f1_score(y_true, y_pred, pos_label=0, zero_division=0)

custom_scorer = make_scorer(f1_score_women)
    
# Vectorize using bigrams only (ngram_range=(2, 2)), at most 5000 features
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 2), max_features=5000)
X = tfidf_vectorizer.fit_transform(x)

# Hold out 20% of the rows as the test set
X_pre, X_test, y_pre, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Shares of the training pool that the loop below sets aside
learning_vec = [0.0001]
#[0.2, 0.4, 0.6, 0.8]

# Hyper-parameter grid explored by GridSearchCV
rf_params = dict(
    n_estimators=[500],
    max_depth=[30, 70, 150],
    min_samples_split=[10, 20, 40],
    min_samples_leaf=[4, 8, 12],
)

# Class weights passed to the forest: label 0 counts four times label 1
class_weights_d = {0: 4, 1: 1}

# Grid-search the RandomForest once per entry of learning_vec and score the
# best estimator on the held-out test split. The loop index `i` is unused.
# The variables assigned here (best_params_rf, accuracy, f1_female, ...) keep
# the values of the LAST iteration and are read by the following cells.
for i, share in enumerate(learning_vec):
    print(share)
    # `share` is the fraction of the training pool that is set aside (X_discard)
    X_train, X_discard, y_train, y_discard = train_test_split(X_pre, y_pre, test_size=share, random_state=42)
    
    # 5-fold CV over rf_params, selecting on the label-0 F1 scorer
    grid_search_rf = GridSearchCV(RandomForestClassifier(class_weight=class_weights_d, oob_score=True, random_state=42),
                                      rf_params,
                                      cv=5,
                                      #scoring='f1',
                                      scoring=custom_scorer
                                      )
    grid_search_rf.fit(X_train, y_train)
    best_params_rf = grid_search_rf.best_params_
        
    # Use the best estimator for predictions
    best_rf = grid_search_rf.best_estimator_
    y_pred = best_rf.predict(X_test)
        
    # Calculate scores for female (label 0 in this cell)
    precision_female = precision_score(y_test, y_pred, pos_label=0, zero_division=0)
    recall_female = recall_score(y_test, y_pred, pos_label=0, zero_division=0)
    f1_female = f1_score(y_test, y_pred, pos_label=0, zero_division=0)
    
    # Calculate scores for male (label 1 in this cell)
    precision_male = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
    recall_male = recall_score(y_test, y_pred, pos_label=1, zero_division=0)
    f1_male = f1_score(y_test, y_pred, pos_label=1, zero_division=0)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # Collect one female-F1 / accuracy pair per share
    f1_wom_vec.append(f1_female)
    acc_vec.append(accuracy)

Printing results from cross-validated bigram RF model

In [None]:
# Report the test-set metrics left in scope by the grid-search loop
print('Best Scores \n')
for caption, value in (
    ('Accuracy ', accuracy),
    ('F1 Female ', f1_female),
    ('Recall Female ', recall_female),
    ('Precision Female ', precision_female),
):
    print(caption + str(value))

#### Learning Curve

In [None]:
# Learning curve: refit the forest with the best grid-search parameters on
# progressively larger parts of the training pool and score on X_test.
# Relies on best_params_rf, X_pre, y_pre, X_test, y_test, f1_wom_vec and
# acc_vec from the cross-validation cell.
best_params_rf

# Fraction of the training pool set aside in each run (0.99 -> train on ~1%)
learning_vec = [0.99,0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1,0.01]
#[0.2, 0.4, 0.6, 0.8]


# NOTE(review): rf_params is not read anywhere in this cell — the model below
# is built from best_params_rf. Presumably a record of the selected values; confirm.
rf_params = {
    'n_estimators': [500],
    'max_depth': [150],
    'min_samples_split': [40],
    'min_samples_leaf': [8]
}

# Class weights passed to the forest: label 0 counts four times label 1
class_weights_d = dict()
class_weights_d[0] = 4
class_weights_d[1] = 1

# The loop index `i` is unused
for i, share in enumerate(learning_vec):
    print(share)
    X_train, X_discard, y_train, y_discard = train_test_split(X_pre, y_pre, test_size=share, random_state=42)
    
    # Create a RandomForestClassifier instance with the best parameters
    best_rf = RandomForestClassifier(class_weight=class_weights_d,
                                 oob_score=True,
                                 random_state=42,
                                 **best_params_rf)

    # Fit the model on the training data
    best_rf.fit(X_train, y_train)

    # Use the trained model to make predictions on the test data
    y_pred = best_rf.predict(X_test)    
    # Calculate scores for female (label 0)
    precision_female = precision_score(y_test, y_pred, pos_label=0, zero_division=0)
    recall_female = recall_score(y_test, y_pred, pos_label=0, zero_division=0)
    f1_female = f1_score(y_test, y_pred, pos_label=0, zero_division=0)
    
    # Calculate scores for male (label 1)
    precision_male = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
    recall_male = recall_score(y_test, y_pred, pos_label=1, zero_division=0)
    f1_male = f1_score(y_test, y_pred, pos_label=1, zero_division=0)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # NOTE(review): f1_wom_vec / acc_vec are not reset in this cell, so these
    # entries are appended after whatever the cross-validation cell stored — confirm
    f1_wom_vec.append(f1_female)
    acc_vec.append(accuracy)

### BERT 

Make a dataset with a binary classification variable

In [None]:
# BERT only needs the raw introduction text and the binary gender label
bert_columns = ['Introduction', 'gen_dummy']
df_extracted = Journal_merged[bert_columns]

# Write the two-column dataset to its own CSV
df_extracted.to_csv('Journal_Introduction_GenDummy.csv', index=False)

print(df_extracted)

Removing HTML for BERT

In [None]:


# Read the CSV file
# NOTE(review): 'PATH/' looks like a placeholder for the folder holding the
# file written by the previous cell — confirm before running
df = pd.read_csv('PATH/Journal_Introduction_GenDummy.csv')

# Function to clean the text
def clean_text(text):
    """Lightly clean an introduction for BERT.

    Removes ``<...>`` tags, drops a leading "1. Introduction" heading and
    collapses all whitespace runs to single spaces. Case and punctuation
    are left untouched.

    Args:
        text: The raw introduction. Non-string values (e.g. NaN) are
            returned unchanged.

    Returns:
        The cleaned string, or *text* itself when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text

    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    # Remove the prefix "1. Introduction". Leading whitespace is tolerated:
    # it is often left behind once the tags are stripped, and the heading
    # would otherwise survive.
    text = re.sub(r'^\s*1\. Introduction', '', text)

    # Remove excessive whitespaces
    text = ' '.join(text.split())

    return text

# Apply the cleaning function to the 'Introduction' column
# (overwrites the column in place; non-string cells pass through unchanged)
df['Introduction'] = df['Introduction'].apply(clean_text)




Load BERT tokenizer

In [None]:
# Load BERT tokenizer and model
# Both names are assigned again in later cells (the tokenizer in the
# tokenization cell, the model in the base-model cell).
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
# Two output labels for the binary gen_dummy target
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Tokenize the text and split to train and test data

In [None]:


# Tokenizer matching the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 80/20 train/test split of the cleaned dataframe
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Both splits are encoded with the same settings: truncate to at most 512
# tokens, pad, and return PyTorch tensors
encode_options = dict(truncation=True, padding=True, max_length=512, return_tensors="pt")
train_encodings = tokenizer.batch_encode_plus(train_df['Introduction'].tolist(), **encode_options)
test_encodings = tokenizer.batch_encode_plus(test_df['Introduction'].tolist(), **encode_options)





class GenderDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name (e.g. 'input_ids') to a per-sample
            indexable container, either a tensor or a nested list.
        labels: Sequence of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _tensor_copy(value):
        """Return an independent tensor holding *value*."""
        # torch.tensor(<tensor>) copy-constructs with a UserWarning on every
        # call; clone().detach() is the recommended form for tensors.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample *idx*, including 'labels'."""
        item = {key: self._tensor_copy(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples (length of the label sequence)."""
        return len(self.labels)

# Wrap the encodings and the gen_dummy labels (as plain Python lists) for the Trainer
train_dataset = GenderDataset(train_encodings, train_df['gen_dummy'].tolist())
test_dataset = GenderDataset(test_encodings, test_df['gen_dummy'].tolist())


Running the base model

In [None]:


# Fresh BERT classifier with two output labels (binary classification)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# PyTorch's AdamW; the scheduler slot is left as None
base_optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# Training configuration for the base run
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    optimizers=(base_optimizer, None),
)

# Train the model
trainer.train()


Then hyperparameter tuning with male and female equally weighted

In [None]:

# Extracting labels and calculating class weights
# Read the integer labels straight from the dataset. Iterating the dataset
# yields 0-d tensors, which hash by object identity: Counter then treats
# every sample as its own class, the lookups for 0 and 1 fall back to the
# default of 1, and both weights collapse to total_samples.
# NOTE(review): with the old code the two weights were therefore equal; the
# formula below now gives true inverse-frequency weights — confirm this is
# the weighting the section heading refers to.
labels = [int(label) for label in train_dataset.labels]
class_counts = Counter(labels)

total_samples = sum(class_counts.values())
# Inverse class frequency; a class absent from the data falls back to a count of 1
weights = [total_samples/class_counts.get(i, 1) for i in range(2)]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(weights, dtype=torch.float).to(device))

# Custom Trainer with overridden compute_loss
class CustomTrainer(Trainer):
    """Trainer whose loss is the module-level class-weighted ``criterion``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained.
            inputs: Batch dict; must contain "labels".
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Accepted and ignored, so that Trainer versions which
                pass additional keyword arguments (e.g.
                ``num_items_in_batch``) do not raise a TypeError.
        """
        outputs = model(**inputs)
        logits = outputs.logits
        labels = inputs["labels"]
        # Flatten to (N, num_labels) vs (N,) as CrossEntropyLoss expects
        loss = criterion(logits.view(-1, logits.shape[-1]), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

def objective(trial):
    """Optuna objective: fine-tune BERT once and return the evaluation loss.

    Samples a learning rate (log-uniform in [1e-5, 1e-3]) and a batch size
    (8 or 16), trains a fresh model for three epochs with ``CustomTrainer``
    and evaluates it on ``test_dataset``. A CUDA out-of-memory error marks
    the trial with the "OOM" user attribute and yields ``inf``; any other
    RuntimeError is re-raised.
    """
    # Hyperparameter space
    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16])

    # A new model and optimizer for every trial
    model2 = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    trial_optimizer = optim.AdamW(model2.parameters(), lr=lr)

    training_args = TrainingArguments(
        output_dir='./results',
        logging_dir='./logs',
        logging_steps=10,
        num_train_epochs=3,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        warmup_steps=500,
        weight_decay=0.01,
    )

    trainer = CustomTrainer(
        model=model2,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        optimizers=(trial_optimizer, None),
    )

    try:
        trainer.train()
        return trainer.evaluate()["eval_loss"]
    except RuntimeError as e:
        if "out of memory" not in str(e):
            raise e
        print("CUDA out of memory. Trying a smaller batch size...")
        trial.set_user_attr("OOM", True)
        return float('inf')

# Minimise the evaluation loss returned by objective() over 10 trials
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

# Parameters ("lr", "batch_size") of the trial with the lowest loss
best_params = study.best_params
print(f"Best hyperparameters found: {best_params}")


Running the model with different weights, using the hyperparameters found in the previous stage

In [None]:


# Extracting labels and calculating class weights as before
# NOTE(review): labels, class_counts and total_samples are not read by
# train_and_evaluate_with_weight below, which sets its weights explicitly;
# iterating the whole dataset here looks like leftover code — confirm.
labels = [entry["labels"] for entry in train_dataset]
class_counts = Counter(labels)
total_samples = sum(class_counts.values())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize BERT model for sequence classification
# (below, this instance is only used to call .from_pretrained for each run)
model_base = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

def train_and_evaluate_with_weight(weight_for_positive_class,
                                   learning_rate=1.1836694153539516e-05,
                                   batch_size=16):
    """Fine-tune a fresh BERT classifier with a given loss weight on label 1.

    Label 0 always has weight 1.0. The model is trained for three epochs and
    evaluated every 100 steps on ``test_dataset``; the final evaluation
    metrics are printed and returned.

    Args:
        weight_for_positive_class: Loss weight applied to label 1.
        learning_rate: Learning rate passed to TrainingArguments. The
            default is the value that was previously hard-coded.
        batch_size: Per-device train and eval batch size (default 16, as
            previously hard-coded).

    Returns:
        The dict returned by ``Trainer.evaluate()`` for the final model.
    """
    # Set the weights for the classes
    weights = [1.0, weight_for_positive_class]
    criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(weights, dtype=torch.float).to(device))

    # Adjust logging directory for each weight
    unique_log_dir = f'./logs_best_weight_{weight_for_positive_class}'

    training_args_best = TrainingArguments(
        output_dir=f'./results_best_weight_{weight_for_positive_class}',
        num_train_epochs=3,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=unique_log_dir,
        logging_steps=100,           # Log every 100 steps
        learning_rate=learning_rate,
        evaluation_strategy='steps',  # Evaluate on a step schedule
        eval_steps=100                # Evaluate every 100 steps
    )

    # Custom Trainer with overridden compute_loss
    class CustomTrainerForEvaluation(Trainer):
        # **kwargs absorbs extra keyword arguments that some Trainer
        # versions pass (e.g. num_items_in_batch)
        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            outputs = model(**inputs)
            logits = outputs.logits
            labels = inputs["labels"]
            loss = criterion(logits.view(-1, logits.shape[-1]), labels.view(-1))
            return (loss, outputs) if return_outputs else loss

    def compute_metrics(p: EvalPrediction):
        """Accuracy plus F1/precision/recall of label 1 for one evaluation."""
        preds = np.argmax(p.predictions, axis=1)
        labels = p.label_ids
        # zero_division=0 matches the RandomForest metrics in this file and
        # avoids a warning when label 1 is never predicted
        return {
            'accuracy': accuracy_score(labels, preds),
            'f1': f1_score(labels, preds, zero_division=0),
            'precision': precision_score(labels, preds, zero_division=0),
            'recall': recall_score(labels, preds, zero_division=0)
        }

    # Instantiate a new model for every run. from_pretrained is called on
    # the class, so no already-loaded model instance is needed.
    model_best = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    trainer_best = CustomTrainerForEvaluation(
        model=model_best,
        args=training_args_best,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )

    # Train and evaluate the model
    trainer_best.train()
    results = trainer_best.evaluate()

    # Print metrics for this weight
    for key, value in results.items():
        print(f"Weight {weight_for_positive_class}, {key}: {value}")

    # Clear GPU cache
    torch.cuda.empty_cache()

    return results

# Iterate over the desired weights for females only: 1, 3, 5, 7, 9, 11
# (the weight applies to label 1; label 0 stays at 1.0)
desired_weights = [1, 3, 5, 7, 9, 11]
for weight in desired_weights:
    train_and_evaluate_with_weight(weight)


## Generating Plots

### Performance Graphs

In [None]:
# 18 log-spaced tuning-parameter values between 0.01 and 8, and their inverses
num_tuning_params = 18
Tuning_param_vec = np.logspace(np.log10(0.01), np.log10(8), num=num_tuning_params)
lambdaa = 1/ Tuning_param_vec

# Apply the seaborn style to the plot. Matplotlib 3.6 renamed 'seaborn' to
# 'seaborn-v0_8' and later releases removed the old name, so pick whichever
# this installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
# Set the font to "Verdana"
plt.rcParams['font.family'] = 'Verdana'
colors_vec = ['#003f5c', '#bc5090', '#ffa600']

# Logistic-regression scores per weight
file_path = '2gram_performance_scores.csv'
df1 = pd.read_csv(file_path)
# Precision/recall values for the scatter plot
file_path = '2GRAMPR.csv'
df2 = pd.read_csv(file_path)

# BERT results per run
file_path = 'plot_data.csv'
df3 = pd.read_csv(file_path)

# Replace the last row of each BERT series with a fixed value.
# pd.concat is used because Series.append was removed in pandas 2.0; like
# append, it keeps the original index labels.
x_11 = pd.concat([df3['Run_Number'][:-1], pd.Series([11])])
y_11 = pd.concat([df3['Best_F1_Score'][:-1], pd.Series([0.356])])
y_12 = pd.concat([df3['Accuracy'][:-1], pd.Series([0.56])])


# Generate fem_weight values using np.linspace
fem_weight = np.linspace(1, 11, 22)

# Create a 1x2 grid of subplots: weight sweep (left), precision/recall (right)
fig, axs = plt.subplots(1, 2, figsize=(10, 4), facecolor='white')

input_params = lambdaa

# Left panel: logistic-regression scores against the relative weight
label1 = ['Male F1 (Logistic)', 'Female F1 (Logistic)', 'Accuracy (Logistic)']
score_types = ['Male F1', 'Female F1', 'Accuracy']
for score_type, line_color, line_label in zip(score_types, colors_vec, label1):
    axs[0].plot(fem_weight, df1[score_type], color=line_color, label=line_label)

# Left panel: BERT results as dashed lines
axs[0].plot(x_11, y_11, linestyle='--', linewidth=2.5, alpha=0.89, color=colors_vec[1], label='Female F1 (BERT)')
axs[0].plot(x_11, y_12, linestyle='--', linewidth=2.5, alpha=0.89, color=colors_vec[2], label='Accuracy (BERT)')

# Right panel: precision vs recall, coloured by the tuning parameter
scatter = axs[1].scatter(df2['Female_Recall'], df2['Female_Precision'], c=input_params, cmap='viridis', norm=LogNorm())

# Panel-specific labels, titles and limits. These are set once, outside the
# per-axes loop, so the marker line and its caption are drawn only once.
axs[0].set_xlabel('Relative Weight, women', fontsize=10)
axs[0].set_ylabel('F1 Score', fontsize=10)
axs[1].set_xlabel('Recall', fontsize=10)
axs[1].set_ylabel('Precision', fontsize=10)
axs[0].set_title('Performance for Different Weights', fontsize=12.5, pad=19)
axs[1].set_title('Precision Recall Trade-Off (Women)', fontsize=12.5, pad=19)

# Add a vertical dotted line at x = 5.13 with text
axs[0].axvline(x=5.13, color='#424242', linestyle='dotted')
axs[0].text(7.6, 0.95, 'Inverse Sample Weights', fontsize=8, color='#424242', ha='center')

# Set x-axis and y-axis limits
axs[0].set_xlim(0, 12)
axs[1].set_xlim(0.3, 0.6)
axs[0].set_ylim(0, 1)
axs[1].set_ylim(0.25, 0.35)

# Styling shared by both panels
for axs_idx in axs:
    # Customize grid style with thicker lines
    axs_idx.grid(True, linestyle='-', linewidth=1.9, alpha=0.99, color='white')

    # Hide the plot borders
    for spine in axs_idx.spines.values():
        spine.set_visible(False)

    # Adjust tick labels and sizes
    axs_idx.tick_params(axis='both', which='major', labelsize=10)

    # Set a light background color
    axs_idx.set_facecolor('#f2f2f2')

# Legend for the left panel only; the scatter panel has no labelled artists
legend = axs[0].legend(loc='upper center', bbox_to_anchor=(0.5, -0.18), frameon=False, fontsize=9, ncol=3)
legend.get_frame().set_alpha(0.5)

# Create a colorbar to show the mapping of values to colors
cbar = plt.colorbar(scatter, ax=axs[1])
# Raw string: '\l' is not a valid escape sequence in a plain literal
cbar.set_label(r'CV Tuning parameter, $\lambda$ ')

# Set the background color of the figure to white
fig.set_facecolor('white')

# Save, then show the plots
plt.savefig('PR_weight2.jpg', dpi=300, bbox_inches='tight', facecolor='white', format='jpg')
plt.show()

#### Descriptive analysis plots

In [None]:
df = pd.read_csv('data_cleaned.csv')

# Apply the seaborn style and set font to "Verdana". Matplotlib 3.6 renamed
# 'seaborn' to 'seaborn-v0_8' and later releases removed the old name, so
# pick whichever this installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
plt.rcParams['font.family'] = 'Verdana'
colors_vec = ['#003f5c','#bc5090','#ffa600']

# Top 20 Journals by Gender Distribution
# NOTE(review): top_ratios_percentage is not defined in this part of the
# notebook; presumably a per-journal table with a 'Female' column in percent — confirm
fig, ax = plt.subplots(figsize=(14, 12))
top_ratios_percentage.sort_values(by='Female', ascending=False).plot(kind='bar', stacked=True, ax=ax, color=colors_vec[:2], alpha=0.75)

# Share of rows labelled 'Female' in Article_Gender (in percent), drawn as a horizontal line
average_female_percentage = df['Article_Gender'].value_counts(normalize=True)['Female'] * 100
ax.axhline(y=average_female_percentage, color=colors_vec[2], linestyle='--', label=f'Average Female % Across All Journals ({average_female_percentage:.2f}%)')

plt.title('Top 20 Journals by Gender Distribution', fontsize=16, pad=19)
plt.ylabel('Percentage (%)', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)
ax.grid(True, linestyle='-', linewidth=1.9, alpha=0.99, color='white')
ax.set_facecolor('#f2f2f2')
for spine in ax.spines.values():
    spine.set_visible(False)

# Reordering the legend labels, increasing the font size, and positioning it further below the x-axis labels
# (order = [1, 0, 2] swaps the first two legend entries and assumes exactly three entries)
handles, labels = ax.get_legend_handles_labels()
order = [1, 0, 2]
ax.legend([handles[idx] for idx in order], [labels[idx] for idx in order], loc='upper center', 
          bbox_to_anchor=(0.5, -0.55), ncol=3, fontsize='large')

plt.tight_layout()
plt.savefig("top_journals_gender_distribution.png", dpi=300)
plt.show()

In [None]:
# Apply the seaborn style and set font to "Verdana". Matplotlib 3.6 renamed
# 'seaborn' to 'seaborn-v0_8' and later releases removed the old name, so
# pick whichever this installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
plt.rcParams['font.family'] = 'Verdana'
colors_vec = ['#003f5c','#bc5090','#ffa600']

# Mean of the yearly 'Female' values (only the female average is computed)
# NOTE(review): yearly_gender_ratio is not defined in this part of the
# notebook; presumably a per-year table of percentages — confirm
avg_female = yearly_gender_ratio['Female'].mean()

# 2. Annual Gender Distribution in Articles
fig, ax = plt.subplots(figsize=(14, 7))
yearly_gender_ratio.plot(kind='bar', stacked=True, ax=ax, color=colors_vec, alpha=0.75)
plt.title('Annual Gender Distribution in Articles', fontsize=16, pad=19)
plt.ylabel('Percentage (%)', fontsize=14)
plt.xlabel('Year', fontsize=14)
plt.xticks(rotation=0, fontsize=12)
plt.grid(True, linestyle='-', linewidth=1.9, alpha=0.99,  color='white')
ax.set_facecolor('#f2f2f2')
for spine in ax.spines.values():
    spine.set_visible(False)

# Add the average line for female
ax.axhline(avg_female, color=colors_vec[2], linestyle='--', label=f'Average Female: {avg_female:.2f}%') 

# Reordering the legend labels and increasing the font size
# (order = [1, 0, 2] swaps the first two legend entries and assumes exactly three entries)
handles, labels = ax.get_legend_handles_labels()
order = [1, 0, 2]
ax.legend([handles[idx] for idx in order], [labels[idx] for idx in order], loc='upper center', 
          bbox_to_anchor=(0.5, -0.15), ncol=3, fontsize='large')

plt.tight_layout()
plt.savefig("annual_gender_distribution_with_avg_female.png", dpi=300)
plt.show()

In [None]:
# Hard-coded model coefficients used for the coefficient plots.
# Positive coefficients are filed under 'Male', negative ones under 'Female'.

# Male data
male_data = {
    'Type': ['Male'] * 10,
    'Feature': ['real world', 'innovation systems', 'marginal cost', 'labor productivity', 'general equilibrium', 'industrial organization', 'united kingdom', 'host country', 'economic theory', 'Game Theory'],
    'Coefficient': [2.036885586402584, 2.0005216149368934, 1.9900478322321766, 1.9709577684657895, 1.8614445247273377, 1.8510086473847436, 1.8300256906013528, 1.7787337070611517, 1.7562447711011193, 1.752180558]
}

# Female data
female_data = {
    'Type': ['Female'] * 10,
    'Feature': ['child care', 'gap literature', 'provides detailed', 'young children', 'socio economic', 'health education', 'time period', 'data collection', 'unintended consequences', 'empirical evidence'],
    'Coefficient': [ -2.1256116253124717, -2.181425890667425, -2.2456961604462324, -2.2818509794621864, -2.304889283248064, -2.329391345506251, -2.4106971165924493, -2.511812252882433, -2.513238243724366, -2.65146320504483]
}

# Journal coefficients: the ten largest and the ten smallest
journal_features = [
    'Evolutionary and Institutional Economics Review',
    'Journal of Geographical Systems',
    'Transportation',
    'International Journal of Game Theory',
    'De Economist',
    'International Economics and Economic Policy',
    'Economics Letters',
    'Environmental Economics and Policy Studies',
    'NETNOMICS: Economic Research and Electronic Networking',
    'The Japanese Economic Review',
    # ... Add the rest of the journal names here
    'SERIEs',
    'Review of Agricultural, Food and Environmental Studies',
    'Journal of Cultural Economics',
    'Triple Helix',
    'Journal of Economics, Race, and Policy',
    'IZA Journal of Development and Migration',
    'Indian Economic Review',
    'China Finance and Economic Review',
    'The Indian Journal of Labour Economics',
    'International Advances in Economic Research',
]
journal_coefficients = [
    1.1215714798392926,
    1.0558490511098158,
    1.0004738210890898,
    0.9044355292488584,
    0.8892826453881203,
    0.866873388706106,
    0.8495579822106452,
    0.8420989827196874,
    0.8350479119981685,
    0.8168913094406065,
    -0.9900800457162767,
    -1.0180035864757404,
    -1.0266764081142343,
    -1.1807176468137563,
    -1.254879064568282,
    -1.265042814451788,
    -1.370155888419316,
    -1.3935800152507558,
    -1.3951647824296334,
    -1.5794244346661876,
]
journal_data = {
    # Sized from the feature list so all three columns always have equal length
    'Type': ['Journal'] * len(journal_features),
    'Feature': journal_features,
    'Coefficient': journal_coefficients,
}

# Convert the dictionaries to pandas dataframes. This happens BEFORE the
# concat below, which needs male_df and female_df to exist.
male_df = pd.DataFrame(male_data)
female_df = pd.DataFrame(female_data)
journal_df = pd.DataFrame(journal_data)

# Separate the journals based on the sign of their coefficient
female_journal_df = journal_df[journal_df['Coefficient'] < 0].copy()
male_journal_df = journal_df[journal_df['Coefficient'] > 0].copy()

# Set 'Type' for these separated journals
female_journal_df['Type'] = ['Female'] * len(female_journal_df)
male_journal_df['Type'] = ['Male'] * len(male_journal_df)

# Concatenate all dataframes
all_data = pd.concat([male_df, female_df, female_journal_df, male_journal_df])

all_data.to_csv('all_coefficients.csv', index=False)

In [None]:
# Apply the seaborn style and set font to "Verdana". Matplotlib 3.6 renamed
# 'seaborn' to 'seaborn-v0_8' and later releases removed the old name, so
# pick whichever this installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
plt.rcParams['font.family'] = 'Verdana'
colors_vec = ['#003f5c','#bc5090','#ffa600']

# Convert dictionaries to dataframes
female_df = pd.DataFrame(female_data)
male_df = pd.DataFrame(male_data)

# Sort female data so the largest magnitude (smallest in value) is at the top
# (sorted descending, so the most negative coefficient is the last row)
female_df = female_df.sort_values(by='Coefficient', ascending=False)

# Sort male data so the largest value is at the top
# (sorted ascending, so the largest coefficient is the last row)
male_df = male_df.sort_values(by='Coefficient', ascending=True)

# Plotting: two side-by-side horizontal bar charts with independent y axes
fig, axes = plt.subplots(figsize=(10,5), ncols=2, sharey=False)  # Setting sharey to False

# Female features on the left
axes[0].barh(female_df['Feature'], female_df['Coefficient'], align='center', color=colors_vec[0], alpha=0.75, zorder=10, label='Female Features')  
axes[0].set(yticks=range(len(female_df['Feature'])), yticklabels=female_df['Feature'])  
axes[0].set_xlim([-3, 0])

# Male features on the right, with tick labels on the right-hand side
axes[1].barh(male_df['Feature'], male_df['Coefficient'], align='center', color=colors_vec[1], alpha=0.75, zorder=10, label='Male Features')  
axes[1].set(yticks=range(len(male_df['Feature'])), yticklabels=male_df['Feature'])  
axes[1].yaxis.tick_right()
axes[1].set_xlim([0, 3])

# Adjusting ticks and labels
for ax in axes:
    for label in (ax.get_xticklabels() + ax.get_yticklabels()):
        label.set(fontsize=13)

# Add a centralized title for the entire figure closer to the plots
fig.suptitle("Top 10 Coefficient Features", fontsize=10, y=0.92)

# Adjust subplots (wspace=0 makes the two panels touch)
plt.subplots_adjust(wspace=0, top=0.85, bottom=0.2, left=0.3, right=0.7)

# Add legends under the graph
fig.legend(loc='lower center', ncol=2, fontsize=12)
# No file extension given, so Matplotlib's default output format is used
plt.savefig("new", dpi=300)
plt.show()

In [None]:
# Journal counterpart of the feature plot: two mirrored horizontal bar
# charts, with a smaller font for the (long) journal names
fig, axes = plt.subplots(figsize=(10,5), ncols=2, sharey=False)  # independent y axes

# (axes, data, colour, legend label, x-limits) for the left and right panel
panels = [
    (axes[0], female_journal_df, colors_vec[0], 'Female Journals', [-2, 0]),
    (axes[1], male_journal_df, colors_vec[1], 'Male Journals', [0, 2]),
]
for panel_ax, panel_df, panel_color, panel_label, x_limits in panels:
    panel_ax.barh(panel_df['Feature'], panel_df['Coefficient'], align='center', color=panel_color, alpha=0.75, zorder=10, label=panel_label)
    panel_ax.set(yticks=range(len(panel_df['Feature'])), yticklabels=panel_df['Feature'])
    if panel_ax is axes[1]:
        # Right panel carries its tick labels on the right-hand side
        panel_ax.yaxis.tick_right()
    panel_ax.set_xlim(x_limits)

# Tick fonts: 9 for the journal names, 13 for the x-axis values
for ax in axes:
    ax.tick_params(axis='y', labelsize=9)
    for label in ax.get_xticklabels():
        label.set(fontsize=13)

# One title for the whole figure, placed close to the plots
fig.suptitle("Top 10 Coefficient Journals", fontsize=10, y=0.92)

# Panels touch each other (wspace=0) and leave room for the labels
plt.subplots_adjust(wspace=0, top=0.85, bottom=0.2, left=0.3, right=0.7)

# Shared legend under the graph
fig.legend(loc='lower center', ncol=2, fontsize=12)
plt.savefig("new2", dpi=300)
plt.show()

#### Training graphs for BERT with different weights

In [None]:


# Use the seaborn style. Matplotlib 3.6 renamed 'seaborn' to 'seaborn-v0_8'
# and later releases removed the old name, so pick whichever this
# installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
# Set the font to "Verdana" and increase font size
plt.rcParams['font.family'] = 'Verdana'
plt.rcParams['font.size'] = 14
colors = ['#003f5c', '#bc5090', '#ffa600', "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2"]

# Load the dataset
# NOTE(review): absolute path on the author's machine — adjust before running.
# The code below reads the columns 'Tag', 'Run', 'Step' and 'Value'.
data_df = pd.read_csv('/Users/Magnus/Downloads/output_Final.csv')

# List of metrics (values of the 'Tag' column to plot, one subplot each)
metrics = ["eval/accuracy", "eval/f1", "eval/loss", "eval/precision", "eval/recall"]

# Create line charts in a 3x2 grid (five metrics, so one subplot stays empty)
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(14, 20))
axes = axes.ravel()  # Flatten the axes array

legend_handles, legend_labels = [], []

for i, metric in enumerate(metrics):
    metric_data = data_df[data_df['Tag'] == metric]
    
    # One line per run, coloured by the run's position in the groupby order
    for color_idx, (run, group) in enumerate(metric_data.groupby('Run')):
        sorted_group = group.sort_values(by='Step')
        
        # Extract run number and rename the run
        # (expects the run name to end in "_<integer>")
        run_number = int(run.split('_')[-1])
        run_name = f"Weight {run_number}"
        
        line, = axes[i].plot(sorted_group['Step'], sorted_group['Value'], color=colors[color_idx % len(colors)])
        
        # Gather legend info
        if i == 0:  # Only do this for the first metric to avoid duplicates
            legend_handles.append(line)
            legend_labels.append(run_name)
    
    # Set title, labels, and remove individual legends
    clean_metric = metric.replace("eval/", "")  # Remove "eval/" prefix
    axes[i].set_title(f"{clean_metric.capitalize()} over Steps", fontsize=18, pad=25)
    axes[i].set_xlabel("Steps", fontsize=16)
    axes[i].set_ylabel(clean_metric.capitalize(), fontsize=16)
    axes[i].grid(True, linestyle='-', linewidth=1.9, alpha=0.99, color='white')
    axes[i].tick_params(axis='both', which='major', labelsize=14)
    for spine in axes[i].spines.values():
        spine.set_visible(False)

# Remove the last unused subplot
axes[-1].axis('off')

# Create one unified legend for the figure at the top, but slightly lower than before
fig.legend(legend_handles, legend_labels, loc='upper center', ncol=len(legend_handles), bbox_to_anchor=(0.5, 1.05), frameon=False, fontsize=16)

plt.tight_layout()
plt.show()
