In [1]:
from bs4 import BeautifulSoup
import boto3
import config as cfg
import datetime
import io
import gender_guesser.detector as gender
from nameparser import HumanName
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import re
import spacy
import time
import yaml

## Objective:
* Implement a version 1 approach to infer the gender of a person featured in a Wikipedia article using the person's name.
  * Limitations: The Python package leveraged uses first names to infer gender, but of course we should not make assumptions about gender based on first names. This method is only intended as an initial, fast attempt to gather this data. In a future release, I will extract data on pronouns from Wikipedia articles and use that information to infer gender (assuming the Wikipedia page authors accurately captured the subjects' pronouns).

In [2]:
%run "../libraries/aws_utils.ipynb"

In [3]:
%run "../libraries/general_utils.ipynb"

## Setup

In [4]:
with open('config.yml', 'r') as file:
   config_files = yaml.safe_load(file)

In [5]:
nlp = spacy.load("en_core_web_sm")

In [6]:
s3_reader = boto3.resource('s3',
                    region_name='us-east-1',
                    aws_access_key_id=cfg.aws_reader['accessCode'],
                    aws_secret_access_key=cfg.aws_reader['secretCode'])

In [7]:
afd_metadata = read_parquet_file(s3_reader, 
                                  config_files['INTEREDIARY_OUTPUT_BUCKET'], 
                      config_files['AFD_NAMES_AND_DISCUSSION'])
test_primary_key(afd_metadata, ['file_name','entity'])

In [8]:
people_to_process = afd_metadata[(afd_metadata['found_person']) & afd_metadata['num_entities']==1] # filter to people
initial_people_count = people_to_process.shape[0]

In [9]:
people_to_process[0:3]

Unnamed: 0,entity,found_person,num_entities,is_multiple_entity_types,file_name,discussion,afd_result
0,Margaret Louise Skourlis,True,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete
2,Michael D. Mehta,True,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete
6,Sangsadia Nirbachan 1991,True,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",merge


## Extract first and last name

In [10]:
def extract_first_and_last_name(name_string):
  '''
  Detects the first and last names of a given name string.

  Args:
      name_string (str): A string containing a full name string.

  Returns:
      str or None: None if the name is not recognized as a person.

  Example:
      >>> extract_first_and_last_name("Jane Doe")
      'jane' 'doe'
  '''
  doc = nlp(name_string)
  if any(entity.label_=="PERSON" for entity in doc.ents):
    name = HumanName(str(doc.ents[0]))
    return name.first, name.last
  else:
    return None

In [11]:
people_to_process[['first_name', 'last_name']] = people_to_process['entity'].apply(lambda x: 
                                              pd.Series( extract_first_and_last_name(x), 
                                                        index=['first_name', 'last_name']) )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [12]:
people_to_process[0:3]

Unnamed: 0,entity,found_person,num_entities,is_multiple_entity_types,file_name,discussion,afd_result,first_name,last_name
0,Margaret Louise Skourlis,True,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete,Margaret,Skourlis
2,Michael D. Mehta,True,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete,Michael,Mehta
6,Sangsadia Nirbachan 1991,True,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",merge,Sangsadia,Nirbachan


## Run gender detector on the first name only

In [13]:
def gender_detector(first_name):
  '''
  Detects the gender of a given name string.

  Args:
      first_name (str): A string containing a first name.

  Returns:
      str or None: The gender of the given name, or None if the name is not recognized.

  Example:
      >>> gender_detector("John")
      'male'
  '''
  detector = gender.Detector()
  
  try:
    return detector.get_gender(first_name) 
  except:
    return None

In [14]:
start_time = time.time()
# this script is slow, so we will only run it on unique first names
unique_first_names =  pd.DataFrame({"first_name": list(set(people_to_process['first_name']))})
unique_first_names['inferred_gender'] = unique_first_names['first_name'].apply(lambda x: gender_detector(x))
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed time: ", elapsed_time)

Elapsed time:  254.07040309906006


In [15]:
# join inferred gender onto dataframe with names of all Wikipedia subjects
people_to_process = people_to_process.merge(unique_first_names,
                                           on = ['first_name'],
                                           how="left")

In [16]:
people_to_process['inferred_gender'].value_counts()

male             431
unknown          285
female           150
mostly_male       27
mostly_female     18
andy              18
Name: inferred_gender, dtype: int64

In [17]:
people_to_process['inferred_gender'].value_counts() / people_to_process.shape[0]

male             0.463940
unknown          0.306781
female           0.161464
mostly_male      0.029064
mostly_female    0.019376
andy             0.019376
Name: inferred_gender, dtype: float64

In [18]:
people_to_process[0:3]

Unnamed: 0,entity,found_person,num_entities,is_multiple_entity_types,file_name,discussion,afd_result,first_name,last_name,inferred_gender
0,Margaret Louise Skourlis,True,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete,Margaret,Skourlis,female
1,Michael D. Mehta,True,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete,Michael,Mehta,male
2,Sangsadia Nirbachan 1991,True,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",merge,Sangsadia,Nirbachan,unknown


## Write results

In [19]:
s3_writer = boto3.client('s3',
                    region_name='us-east-1',
                    aws_access_key_id=cfg.aws_writer['accessCode'],
                    aws_secret_access_key=cfg.aws_writer['secretCode'])

In [20]:
out_buffer = io.BytesIO()
people_to_process.to_parquet(out_buffer, index=False)
s3_writer.put_object( Bucket=config_files['INTEREDIARY_OUTPUT_BUCKET'], 
                     Key=config_files['INFERRED_GENDER'], 
                     Body=out_buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'NNFH2JPGCEYQ1KDG',
  'HostId': 'Hq7Oiqd1sV7OEq4Tduaetd7OIc4HEzXyek/Rq4s02aS3akIvEv4DydWkc37GkL+XGcxr4URai0Y=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'Hq7Oiqd1sV7OEq4Tduaetd7OIc4HEzXyek/Rq4s02aS3akIvEv4DydWkc37GkL+XGcxr4URai0Y=',
   'x-amz-request-id': 'NNFH2JPGCEYQ1KDG',
   'date': 'Sat, 03 Jun 2023 13:20:15 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"adc8d14d0f31da4129d6609afb8646ec"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"adc8d14d0f31da4129d6609afb8646ec"',
 'ServerSideEncryption': 'AES256'}