In [46]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import gender_guesser.detector as gender

base_url = 'https://link.springer.com/journal/10887/volumes-and-issues/'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Create lists to store the scraped data
volumes = []
issues = []
links = []
authors_list = []
titles = []

max_authors = 0  # Variable to store the maximum number of authors across all articles

# Initialize gender detector
detector = gender.Detector()

# Function to get the first name from a full name
def get_first_name(full_name):
    if full_name is not None:
        return full_name.split()[0]
    else:
        return "None"

# Function to predict gender for the first name
def predict_gender_first_name(name):
    if name == "None":
        return "Unknown"
    return detector.get_gender(name)

for volume in range(1, 2):  # Volumes range from 1 to 28
    for issue in range(1, 5):  # Issues range from 1 to 4
        url = base_url + f'{volume}-{issue}'

        response = requests.get(url, headers=headers)

        # If the GET request is successful, the status code will be 200
        if response.status_code == 200:
            # Get the content of the response
            webpage = response.text

            # Create a BeautifulSoup object and specify the parser
            soup = BeautifulSoup(webpage, 'html.parser')

            # Find all parent li tags for each article
            articles = soup.find_all('li', class_='c-list-group__item')

            # For each article, extract the link, author names, and title
            for article in articles:
                # Get the link of the article
                link = article.find('a', href=True)['href']

                # Get the author names
                authors_list_temp = article.select('ul.c-author-list li span')  # Use a temporary variable
                authors = [author.text.strip() for author in authors_list_temp]

                # Get the title
                title = article.find('a', attrs={"data-track": "click"}).text.strip()

                # Append the data to the lists
                volumes.append(volume)
                issues.append(issue)
                links.append(link)
                authors_list.append(authors)  # Store the list of authors directly without joining them
                titles.append(title)

                # Update the maximum number of authors if needed
                max_authors = max(max_authors, len(authors))

                # Sleep for 1 second before the next request
                time.sleep(1)

        else:
            print(f'Request to {url} failed with status code {response.status_code}.')

# Prepare the DataFrame with appropriate columns
columns = ['Volume', 'Issue', 'Link', 'Title'] + [f'Author {i+1}' for i in range(max_authors)]
data = {
    'Volume': volumes,
    'Issue': issues,
    'Link': links,
    'Title': titles
}

for i in range(max_authors):
    data[f'Author {i+1}'] = [authors[i] if len(authors) > i else None for authors in authors_list]

df = pd.DataFrame(data)

# Apply gender prediction to the first name in each author column and create new gender columns
for i in range(max_authors):
    col_name = f'Author {i+1}'
    df[f'Gender_{col_name}'] = df[col_name].apply(get_first_name).apply(predict_gender_first_name)

# Map the gender predictions to more readable form
for i in range(max_authors):
    col_name = f'Gender_Author {i+1}'
    df[col_name] = df[col_name].map({
        'male': 'Male',
        'female': 'Female',
        'unknown': 'Unknown',
        'None': 'Unknown'  # Handle cases where there's no author
    })

# Display the DataFrame
print(df)


    Volume  Issue                                               Link  \
0        1      1  https://link.springer.com/article/10.1007/BF00...   
1        1      1  https://link.springer.com/article/10.1007/BF00...   
2        1      1  https://link.springer.com/article/10.1007/BF00...   
3        1      1  https://link.springer.com/article/10.1007/BF00...   
4        1      1  https://link.springer.com/article/10.1007/BF00...   
5        1      1  https://link.springer.com/article/10.1007/BF00...   
6        1      2  https://link.springer.com/article/10.1007/BF00...   
7        1      2  https://link.springer.com/article/10.1007/BF00...   
8        1      2  https://link.springer.com/article/10.1007/BF00...   
9        1      2  https://link.springer.com/article/10.1007/BF00...   
10       1      2  https://link.springer.com/article/10.1007/BF00...   
11       1      2  https://link.springer.com/article/10.1007/BF00...   
12       1      3  https://link.springer.com/article/10.1007/BF0

In [48]:
import pandas as pd
import gender_guesser.detector as gender

detector = gender.Detector()

def predict_gender_first_name_api(name):
    api_url = 'https://api.genderize.io?name=' + name
    print(api_url)
    response = requests.get(api_url)
    # Check if the request was successful (200 OK)
    if response.status_code == 200:
        data = response.json()  # Parse the response content as JSON
        try:
            gender = data["gender"].capitalize()
            probability = data["probability"]
            if data["gender"] in {'male', 'female'}:
                return data["gender"]
            else:
                return "Unknown"
            
        except Exception as e:
            print(f"Error predicting gender for {name}: {e}")
            return "Unknown"
    else:
        print(f"Request failed. Status code: {response.status_code}")
        return "Unknown"


# Function to get the first name from a full name
def get_first_name(full_name):
    if full_name is not None:
        return full_name.split()[0]
    else:
        return "None"

# Function to predict gender for the first name
def predict_gender_first_name(name):
    gender = detector.get_gender(name)
    if name == "None":
        return "None"
    elif gender == "unknown":
        gender_api = predict_gender_first_name_api(name)
        return gender_api
    else:
        return gender

# Apply gender prediction to the first name in each author column and create new gender columns
df['Gender_Author 1'] = df['Author 1'].apply(get_first_name).apply(predict_gender_first_name)
df['Gender_Author 2'] = df['Author 2'].apply(get_first_name).apply(predict_gender_first_name)
df['Gender_Author 3'] = df['Author 3'].apply(get_first_name).apply(predict_gender_first_name)

#df.drop(columns=['Gender_Author 1', 'Gender_Author 2', 'Gender_Author 3'], inplace=True)

# Map the gender predictions to more readable form
df['Gender_Author 1'] = df['Gender_Author 1'].map({
    'male': 'Male',
    'female': 'Female',
    'unknown': 'Unknown',
})
df['Gender_Author 2'] = df['Gender_Author 2'].map({
    'male': 'Male',
    'female': 'Female',
    'unknown': 'Unknown',
})
df['Gender_Author 3'] = df['Gender_Author 3'].map({
    'male': 'Male',
    'female': 'Female',
    'unknown': 'Unknown',
})

print(df)

https://api.genderize.io?name=Jong-Wha
Error predicting gender for Jong-Wha: 'NoneType' object has no attribute 'capitalize'
https://api.genderize.io?name=Chien-Fu
https://api.genderize.io?name=Sule
https://api.genderize.io?name=Minseong
https://api.genderize.io?name=Mancur
    Volume  Issue                                               Link  \
0        1      1  https://link.springer.com/article/10.1007/BF00...   
1        1      1  https://link.springer.com/article/10.1007/BF00...   
2        1      1  https://link.springer.com/article/10.1007/BF00...   
3        1      1  https://link.springer.com/article/10.1007/BF00...   
4        1      1  https://link.springer.com/article/10.1007/BF00...   
5        1      1  https://link.springer.com/article/10.1007/BF00...   
6        1      2  https://link.springer.com/article/10.1007/BF00...   
7        1      2  https://link.springer.com/article/10.1007/BF00...   
8        1      2  https://link.springer.com/article/10.1007/BF00...   
9    

In [50]:
import os
import pandas as pd

# Assuming 'df' is your DataFrame
# For example:
# df = pd.DataFrame({'Column1': [1, 2, 3], 'Column2': ['A', 'B', 'C']})

# Get the current working directory
current_directory = os.getcwd()

# Define the file name you want to save
file_name = 'Springer_dataframe.csv'

# Join the current directory with the file name
file_path = os.path.join(current_directory, file_name)

# Save the DataFrame to CSV
df.to_csv(file_path, index=False)

print(f"DataFrame has been saved to '{file_path}'.")


DataFrame has been saved to '/Users/Magnus/Documents/GitHub/SkoderIDinDrink/Exam_project/Springer_dataframe.csv'.
